getElementsBySelector() - CSS Query Selector for HTML DOM
getElementsBySelector is a Python function that takes a standard CSS selector and returns a list of element objects from the document that match it. This kind of function is used all the time in JavaScript - at least if you use a library. It is mostly pointless on the server side - unless you are doing screen-scraping, in which case it is very useful. I recently had to work on a Django app that does a bit of screen-scraping, so I wrote this function to help.
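For a rough idea of what it handles, here is a sketch - 'dom' stands for any parsed xml.dom.minidom document, and the tag, class and attribute names below are made up for illustration:

#Illustrative only - tag, class, attribute and descendant selectors
headings = getElementsBySelector("h2", dom)
posts = getElementsBySelector("div.post", dom)
images = getElementsBySelector("img[alt]", dom)
links = getElementsBySelector("div.post a[rel=nofollow]", dom)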
This is a direct port of my JavaScript getElementsBySelector function. I'll be creating a similar function for PHP soon, and one for Ruby if time permits.
The Code
#Get DOM elements based on the given CSS Selector - V 1.00.A Beta
#Direct port of http://www.openjs.com/scripts/dom/css_selector/
def getElementsBySelector(all_selectors, document):
    import re, string
    selected = []
    all_selectors = re.sub(r'\s*([^\w])\s*', r'\1', all_selectors) #Remove the 'beautification' spaces

    # Grab all of the tagName elements within current context
    def getElements(context, tag):
        if (tag == ""): tag = '*'
        # Get elements matching tag, filter them for class selector
        found = []
        for con in context:
            eles = con.getElementsByTagName(tag)
            found.extend(eles)
        return found

    context = [document]
    inheriters = string.split(all_selectors, " ")

    # Space-separated parts are descendant selectors - narrow the context at each step
    for element in inheriters:
        #This part is to make sure that it is not part of a CSS3 Selector
        left_bracket = string.find(element, "[")
        right_bracket = string.find(element, "]")

        pos = string.find(element, "#") #ID
        if (pos + 1 and not (pos > left_bracket and pos < right_bracket)):
            parts = string.split(element, "#")
            tag = parts[0]
            id = parts[1]
            ele = document.getElementById(id)
            context = [ele]
            continue

        pos = string.find(element, ".") #Class
        if (pos + 1 and not (pos > left_bracket and pos < right_bracket)):
            parts = string.split(element, '.')
            tag = parts[0]
            class_name = parts[1]
            found = getElements(context, tag)
            context = []
            for fnd in found:
                if (fnd.getAttribute("class") and re.search(r'(^|\s)' + class_name + r'(\s|$)', fnd.getAttribute("class"))):
                    context.append(fnd)
            continue

        if (string.find(element, '[') + 1): #If the char '[' appears, that means it needs CSS 3 parsing
            # Code to deal with attribute selectors
            m = re.match(r'^(\w*)\[(\w+)([=~\|\^\$\*]?)=?[\'"]?([^\]\'"]*)[\'"]?\]$', element)
            if (m):
                tag = m.group(1)
                attr = m.group(2)
                operator = m.group(3)
                value = m.group(4)
                found = getElements(context, tag)
                context = []
                # Supported attribute operators: =, ~=, |=, ^=, $=, *= and bare [attr]
                for fnd in found:
                    if (operator == '=' and fnd.getAttribute(attr) != value): continue
                    if (operator == '~' and not re.search(r'(^|\s)' + value + r'(\s|$)', fnd.getAttribute(attr))): continue
                    if (operator == '|' and not re.search(r'^' + value + '-?', fnd.getAttribute(attr))): continue
                    if (operator == '^' and string.find(fnd.getAttribute(attr), value) != 0): continue
                    if (operator == '$' and string.rfind(fnd.getAttribute(attr), value) != (len(fnd.getAttribute(attr)) - len(value))): continue
                    if (operator == '*' and not (string.find(fnd.getAttribute(attr), value) + 1)): continue
                    elif (not fnd.getAttribute(attr)): continue
                    context.append(fnd)
            continue

        #Tag selectors - no class or id specified.
        found = getElements(context, element)
        context = found

    # Whatever remains in context after the last selector part is the result
    selected.extend(context)
    return selected
Sample Usage
import urllib2
from xml.dom.minidom import parseString
html = urllib2.urlopen("http://search.twitter.com/search?q=RT+http").read()
dom = parseString(html)
links = getElementsBySelector("a[rel=nofollow]", dom)
for a in links: print a.getAttribute("href")
This should work. Except that it doesn't. Not my fault - the XML parser will only parse valid XML documents, and Twitter's search page is not valid XML. So we have to run the HTML through HTML Tidy first. For that, the tidy Python module must be installed. Once it is, use this code...
import urllib2
import tidy
from xml.dom.minidom import parseString
html = urllib2.urlopen("http://search.twitter.com/search?q=RT+http").read()
html = str(tidy.parseString(html, output_xhtml=1))
dom = parseString(html)
links = getElementsBySelector("a[rel=nofollow]", dom)
for a in links: print a.getAttribute("href")
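Selectors can also be combined to narrow the search to links inside a particular container. A sketch - the div.msg class name here is made up and depends on the page's markup at the time:

#Hypothetical: class selector followed by a descendant attribute selector
for a in getElementsBySelector("div.msg a[rel=nofollow]", dom):
    print a.getAttribute("href")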
Other XML/HTML Parsers in Python
Beta Release
This is a beta release - so expect bugs.