# (stray page-navigation text "jump to content" from web extraction; not part of the module)

"""
%s: parse an HTML file. Useful for splitting a page into its title, head,
meta data and content.
"""
from sgmllib import SGMLParser
import string

def join_attrs(attrs):
    """Serialize (name, value) attribute pairs back into tag-attribute text.

    attrs is a sequence of (attrname, value) string pairs.  Each value is
    stripped of surrounding whitespace and written as name="value".

    Returns the attributes joined by single spaces and prefixed with one
    space, so the result can be appended directly after a tag name, or the
    empty string when attrs is empty.
    """
    parts = []
    for attrname, value in attrs:
        # The value is always re-emitted inside double quotes, so a double
        # quote inside it (e.g. from an attribute that was single-quoted in
        # the source) must be escaped or it would end the attribute early.
        value = value.strip().replace('"', '&quot;')
        parts.append('%s="%s"' % (attrname, value))

    if not parts:
        return ""
    return " " + " ".join(parts)


class HeadParser(SGMLParser):
    def __init__(self, content_start_comment='', content_end_comment=''):
        SGMLParser.__init__(self)

        self.seen_starthead = 0
        self.seen_endhead   = 0
        self.seen_startbody = 0

        self.seen_startcontent = 0
        self.seen_endcontent = 0

        self.content_start_comment = content_start_comment
        self.content_end_comment = content_end_comment


        self.head = ""
        self.title = ""
        self.meta = {}
        self.accumulator = ""


    def handle_data(self, data):
        if data:
            if self.seen_startcontent and not self.seen_endcontent:
                self.accumulator = self.accumulator + data

    def handle_charref(self, ref):
        self.handle_data("&#%s;" % ref)

    def handle_entityref(self, ref):
        self.handle_data("&%s;" % ref)

    def handle_comment(self, data):
        if data:
            if string.find(data,self.content_start_comment)>=0:
                self.seen_startcontent = 1
                return
            if string.find(data,self.content_end_comment)>=0:
                self.seen_endcontent = 1
                return
            if self.seen_startcontent and not self.seen_endcontent:
                self.accumulator = self.accumulator + "<!--%s-->" % data


    def start_head(self, attrs):
        if not self.seen_starthead:
            self.seen_starthead = 1
            self.head = ""
            self.title = ""
            self.accumulator = ""

    def end_head(self):
        if not self.seen_endhead:
            self.seen_endhead = 1
            self.head = self.head + self.accumulator
            self.accumulator = ""


    def start_title(self, attrs):
        self.head = self.head + self.accumulator
        self.accumulator = ""

    def end_title(self):
        self.title = self.accumulator
        self.accumulator = ""

    def start_meta(self, attrs): #get the metas into a dict
        if self.seen_starthead and not self.seen_endhead:
            meta_key = ''
            for attrname, value in attrs:
                if attrname == 'name':
                    meta_key = string.lower(string.strip(value))
                if attrname == 'content' and meta_key != '':
                    self.meta[meta_key] = string.strip(value)

    def start_body(self, attrs):
        if not self.seen_startbody:
            self.seen_startbody = 1
            self.accumulator = ""
            if self.content_start_comment == '':
                self.seen_startcontent = 1

    def end_body(self): pass # Do not put </BODY> and </HTML>
    def end_html(self): pass # into output stream

    # Pass other tags unmodified
    def unknown_starttag(self, tag, attrs):
        if self.seen_startcontent and not self.seen_endcontent:
            self.accumulator = self.accumulator + "<%s%s>" % (string.upper(tag), join_attrs(attrs))

    def unknown_endtag(self, tag):
        if self.seen_startcontent and not self.seen_endcontent:
            self.accumulator = self.accumulator + "</%s>" % string.upper(tag)

def parse_html(infile, content_start_comment='', content_end_comment=''):
    """Parse the HTML read from infile and return its extracted parts.

    infile is an open file-like object providing readline() and close();
    it is read line by line until readline() returns an empty value and is
    always closed before this function returns, even when parsing fails.

    content_start_comment and content_end_comment are passed unchanged to
    HeadParser.

    Returns the tuple (title, head, meta, content): title, head and content
    are the parser's title, head and accumulator with surrounding
    whitespace stripped; meta is the parser's meta dictionary.
    """
    parser = HeadParser(content_start_comment, content_end_comment)
    try:
        while 1:
            line = infile.readline()
            if not line:
                break
            parser.feed(line)
        parser.close()
    finally:
        # Release the file even if feed()/close() raised.
        infile.close()
    return (parser.title.strip(), parser.head.strip(),
            parser.meta, parser.accumulator.strip())

def test():
    """Smoke test: parse ./test.html and print the extracted parts.

    Prints the title, the keywords/description/author/copyright meta
    values (None for any that the page does not define) and the content
    found between the content_starts_here / content_ends_here comments.
    """
    f = open('test.html', 'r')
    try:
        (title, head, meta, content) = parse_html(
            f, 'content_starts_here', 'content_ends_here')
    finally:
        # A single close is enough; closing an already closed file is a no-op.
        f.close()
    # Two spaces after "=" reproduce the output of the former
    # `print "title = ", title` statement.
    print("title =  %s" % (title,))
    for key in ('keywords', 'description', 'author', 'copyright'):
        # .get() so a page lacking one of these meta tags does not abort
        # the run with a KeyError.
        print(meta.get(key))
    print(content)


if __name__ == '__main__':
    test()