{
Mark Pilgrim's excellent Dive Into Python has a section on using SGMLParser, and having seen nothing similar (and imagining its many uses!), I thought I'd give it a whirl in IronPython. I thought a good proof of concept would be creating a database out of link-heavy sites. Since I visit Arts & Letters Daily every so often, and the closet intellectual in me likes to hang onto what I find there, I thought I'd target it:
import urllib2
import sgmllib
from sgmllib import SGMLParser
import clr
clr.AddReference("System.Data")
clr.AddReference("System.Net")
from System import *
from System.Data import *
from System.Net import *
class AlReader(SGMLParser):
    """Collect the hyperlinks of an HTML page, grouped by category.

    A category is the value of the ``name`` attribute of the most recent
    ``<a name="...">`` anchor seen before a link; links seen before any
    named anchor fall under "No Category".

    Attributes:
        urls: href of every ``<a href>`` seen, in document order.
        pieces: "Category|Site" string for every link, parallel to ``urls``.
        track: 1 while the parser is inside an ``<a href>`` element, else 0.
        prePend: category applied to the next link.
        counter: number of links recorded so far.
    """

    def reset(self):
        """Clear all collected state, then reset the underlying parser."""
        SGMLParser.reset(self)
        self.urls = []
        self.pieces = []
        self.track = 0
        self.prePend = "No Category"
        self.counter = 0
        # One [category, site, url] entry per link. Kept as separate fields
        # so building the DataTable never has to re-split a "|" string.
        self._links = []

    def start_a(self, attrs):
        """Handle an opening ``<a>`` tag.

        Args:
            attrs: list of (attribute name, value) pairs for the tag.
        """
        hrefs = [value for name, value in attrs if name == "href"]
        names = [value for name, value in attrs if name == "name"]
        if hrefs:
            # Record the link right away with empty text, so urls and pieces
            # stay the same length even if the anchor has no text at all.
            url = hrefs[0]
            self._links.append([self.prePend, "", url])
            self.urls.append(url)
            self.pieces.append("|".join([self.prePend, ""]))
            self.counter = self.counter + 1
            self.track = 1
        elif names:
            # Use the value of the name attribute itself, not whichever
            # attribute happens to come first in the tag.
            self.prePend = names[0]

    def handle_data(self, text):
        """Append *text* to the link currently being read, if any.

        The text of one anchor can arrive in several chunks, so each chunk
        is concatenated onto the current link instead of starting a new
        entry.
        """
        if self.track and self._links:
            current = self._links[-1]
            current[1] = current[1] + text
            self.pieces[-1] = "|".join(current[:2])

    def end_a(self):
        """Handle a closing ``</a>`` tag: stop collecting link text."""
        self.track = 0

    def get_links(self):
        """Return one "Category|Site|Url" string per collected link."""
        return ["|".join(pair) for pair in zip(self.pieces, self.urls)]

    def get_link_datatable(self):
        """Return the collected links as a System.Data DataTable.

        The table has three string columns: Category, Site and Url. Rows are
        filled from the separately stored fields, so a "|" inside a
        category, link text or URL cannot break the row.
        """
        table = DataTable()
        for column_name in ("Category", "Site", "Url"):
            table.Columns.Add(
                DataColumn(column_name, Type.GetType("System.String")))
        for category, site, url in self._links:
            row = table.NewRow()
            row["Category"] = category
            row["Site"] = site
            row["Url"] = url
            table.Rows.Add(row)
        return table
# Download the Arts & Letters Daily front page.
response = urllib2.urlopen("http://www.aldaily.com")
try:
    html = response.read()
finally:
    # Release the HTTP connection even if the read fails.
    response.close()

# Parse the page; close() makes the parser process any input it still has
# buffered after the last feed().
a = AlReader()
a.feed(html)
a.close()
linkdata = a.get_link_datatable()

# write it out to prove we got it.
ds = DataSet()
ds.Tables.Add(linkdata)
ds.WriteXml("c:\\temp\\arts and letters links.xml")
If you find this interesting, do make sure you look at Pilgrim's chapter on HTML Processing.