""" %s: parse an html file. useful for parsing meta data into different properties """ from sgmllib import SGMLParser import string def join_attrs(attrs): attr_list = [] for attrname, value in attrs: attr_list.append('%s="%s"' % (attrname, string.strip(value))) if attr_list: s = " " + string.join(attr_list, " ") else: s = "" return s class HeadParser(SGMLParser): def __init__(self, content_start_comment='', content_end_comment=''): SGMLParser.__init__(self) self.seen_starthead = 0 self.seen_endhead = 0 self.seen_startbody = 0 self.seen_startcontent = 0 self.seen_endcontent = 0 self.content_start_comment = content_start_comment self.content_end_comment = content_end_comment self.head = "" self.title = "" self.meta = {} self.accumulator = "" def handle_data(self, data): if data: if self.seen_startcontent and not self.seen_endcontent: self.accumulator = self.accumulator + data def handle_charref(self, ref): self.handle_data("&#%s;" % ref) def handle_entityref(self, ref): self.handle_data("&%s;" % ref) def handle_comment(self, data): if data: if string.find(data,self.content_start_comment)>=0: self.seen_startcontent = 1 return if string.find(data,self.content_end_comment)>=0: self.seen_endcontent = 1 return if self.seen_startcontent and not self.seen_endcontent: self.accumulator = self.accumulator + "<!--%s-->" % data def start_head(self, attrs): if not self.seen_starthead: self.seen_starthead = 1 self.head = "" self.title = "" self.accumulator = "" def end_head(self): if not self.seen_endhead: self.seen_endhead = 1 self.head = self.head + self.accumulator self.accumulator = "" def start_title(self, attrs): self.head = self.head + self.accumulator self.accumulator = "" def end_title(self): self.title = self.accumulator self.accumulator = "" def start_meta(self, attrs): #get the metas into a dict if self.seen_starthead and not self.seen_endhead: meta_key = '' for attrname, value in attrs: if attrname == 'name': meta_key = string.lower(string.strip(value)) if attrname == 'content' and meta_key != '': self.meta[meta_key] = string.strip(value) def start_body(self, attrs): if not self.seen_startbody: self.seen_startbody = 1 self.accumulator = "" if self.content_start_comment == '': self.seen_startcontent = 1 def end_body(self): pass # Do not put </BODY> and </HTML> def end_html(self): pass # into output stream # Pass other tags unmodified def unknown_starttag(self, tag, attrs): if self.seen_startcontent and not self.seen_endcontent: self.accumulator = self.accumulator + "<%s%s>" % (string.upper(tag), join_attrs(attrs)) def unknown_endtag(self, tag): if self.seen_startcontent and not self.seen_endcontent: self.accumulator = self.accumulator + "</%s>" % string.upper(tag) def parse_html(infile,content_start_comment='', content_end_comment=''): parser = HeadParser(content_start_comment, content_end_comment) while 1: line = infile.readline() if not line: break parser.feed(line) parser.close() infile.close() return (string.strip(parser.title), string.strip(parser.head), parser.meta,string.strip(parser.accumulator)) def test(): f = open('test.html','r') (title,head,meta,content) = parse_html(f,'content_starts_here','content_ends_here') f.close() print "title = ", title print meta['keywords'] print meta['description'] print meta['author'] print meta['copyright'] print content f.close() if __name__=='__main__': test()
Since you are seeing this, it means that your browser does not support cascading style sheets. Please download and use one of the many browsers that support web standards.