Web Development :: Parse an HTML page into metadata and body

Parse an HTML page into metadata and body

"""
%s: parse an html file. useful for parsing meta data into different properties
"""
from sgmllib import SGMLParser
import string

def join_attrs(attrs):
    """Render (name, value) attribute pairs as HTML attribute text.

    Each value is stripped of surrounding whitespace and wrapped in double
    quotes.  The result starts with a single space so it can be appended
    directly after a tag name; an empty *attrs* yields the empty string.

    NOTE(review): values are not escaped, so a value containing a double
    quote produces malformed markup -- confirm whether callers rely on the
    raw form before changing this.
    """
    # str methods replace the deprecated string.strip()/string.join()
    # module functions; they behave the same and also exist on Python 3.
    attr_list = ['%s="%s"' % (attrname, value.strip())
                 for attrname, value in attrs]
    if not attr_list:
        return ""
    return " " + " ".join(attr_list)


class HeadParser(SGMLParser):
    """Split an HTML document into title, head markup, meta data and content.

    Feed the document through feed()/close(); afterwards the results are
    available as attributes:

    title        text found between <title> and </title>
    head         remaining markup collected inside <head>
    meta         dict mapping lower-cased <meta name=...> to its content
    accumulator  the captured content (whatever is left after parsing)

    Content capture starts at the HTML comment containing
    *content_start_comment* and stops at the comment containing
    *content_end_comment*.  An empty start marker means "start at <body>";
    an empty end marker means "capture until the end of the document".
    """

    def __init__(self, content_start_comment='', content_end_comment=''):
        SGMLParser.__init__(self)

        # One-shot flags (0/1): only the first occurrence of each tag or
        # marker comment has any effect.
        self.seen_starthead = 0
        self.seen_endhead = 0
        self.seen_startbody = 0

        self.seen_startcontent = 0
        self.seen_endcontent = 0

        self.content_start_comment = content_start_comment
        self.content_end_comment = content_end_comment

        self.head = ""
        self.title = ""
        self.meta = {}
        # Scratch buffer: holds head/title text while inside <head>, then
        # the content once capture has started.
        self.accumulator = ""

    def _capturing(self):
        """Return true while parsed text should go into the accumulator.

        Text is kept inside <head> (so that title and head are populated
        even when the content markers live in the body) and between the
        content start and end markers.
        """
        in_head = (self.seen_starthead and not self.seen_endhead
                   and not self.seen_startbody)
        in_content = self.seen_startcontent and not self.seen_endcontent
        return in_head or in_content

    def handle_data(self, data):
        """Append plain text to the accumulator while capturing."""
        if data and self._capturing():
            self.accumulator = self.accumulator + data

    def handle_charref(self, ref):
        """Pass a character reference (&#NNN;) through verbatim."""
        self.handle_data("&#%s;" % ref)

    def handle_entityref(self, ref):
        """Pass an entity reference (&name;) through verbatim."""
        self.handle_data("&%s;" % ref)

    def handle_comment(self, data):
        """Detect the content marker comments; pass other comments through."""
        if not data:
            return
        start_marker = self.content_start_comment
        end_marker = self.content_end_comment
        # An empty marker is "not configured".  It must not be searched
        # for, because the empty string is found in every comment.
        if start_marker and start_marker in data:
            self.seen_startcontent = 1
            return
        if end_marker and end_marker in data:
            self.seen_endcontent = 1
            return
        if self._capturing():
            self.accumulator = self.accumulator + "<!--%s-->" % data

    def start_head(self, attrs):
        """Reset the collected state at the first <head>."""
        if not self.seen_starthead:
            self.seen_starthead = 1
            self.head = ""
            self.title = ""
            self.accumulator = ""

    def end_head(self):
        """Move whatever was collected since the last flush into head."""
        if not self.seen_endhead:
            self.seen_endhead = 1
            self.head = self.head + self.accumulator
            self.accumulator = ""

    def start_title(self, attrs):
        """Flush pending head markup so the accumulator holds only the title."""
        self.head = self.head + self.accumulator
        self.accumulator = ""

    def end_title(self):
        """Store the text collected since <title> as the document title."""
        self.title = self.accumulator
        self.accumulator = ""

    def start_meta(self, attrs):
        """Record <meta name=... content=...> found inside <head>."""
        if self.seen_starthead and not self.seen_endhead:
            # Look attributes up by name so that the order in which
            # "name" and "content" appear in the tag does not matter.
            attr_map = dict(attrs)
            meta_key = attr_map.get('name', '').strip().lower()
            if meta_key != '' and 'content' in attr_map:
                self.meta[meta_key] = attr_map['content'].strip()

    def start_body(self, attrs):
        """Discard anything collected so far; maybe start content capture."""
        if not self.seen_startbody:
            self.seen_startbody = 1
            self.accumulator = ""
            # Without a start marker the whole body is the content.
            if self.content_start_comment == '':
                self.seen_startcontent = 1

    def end_body(self): pass # Do not put </BODY> and </HTML>
    def end_html(self): pass # into output stream

    # Pass other tags unmodified (tag names are upper-cased on output)
    def unknown_starttag(self, tag, attrs):
        if self._capturing():
            self.accumulator = self.accumulator + "<%s%s>" % (
                tag.upper(), join_attrs(attrs))

    def unknown_endtag(self, tag):
        if self._capturing():
            self.accumulator = self.accumulator + "</%s>" % tag.upper()

def parse_html(infile,content_start_comment='', content_end_comment=''):
    """Parse the HTML read from *infile* and return its parts.

    infile is any object with readline() and close(); it is read line by
    line until readline() returns an empty value and is always closed
    before this function returns, even when parsing fails.

    content_start_comment / content_end_comment are handed to HeadParser
    unchanged and select which part of the document counts as content.

    Returns a tuple (title, head, meta, content) in which title, head and
    content are whitespace-stripped strings and meta is the parser's dict
    of meta tags.
    """
    parser = HeadParser(content_start_comment, content_end_comment)
    try:
        while 1:
            line = infile.readline()
            if not line:
                break
            parser.feed(line)
        parser.close()
    finally:
        # This function takes ownership of the file; release it on the
        # error path as well as on success.
        infile.close()
    return (parser.title.strip(), parser.head.strip(),
            parser.meta, parser.accumulator.strip())

def test():
    """Smoke test: parse ./test.html and print what was extracted.

    Expects a file named test.html in the current directory whose content
    is delimited by comments containing 'content_starts_here' and
    'content_ends_here'.  Meta tags that are absent print as empty lines
    instead of aborting the run.
    """
    f = open('test.html','r')
    try:
        (title,head,meta,content) = parse_html(f,'content_starts_here','content_ends_here')
    finally:
        # Closing an already closed file is a no-op, so this is safe
        # whether or not parse_html got as far as closing it.
        f.close()
    # Single-argument print(...) produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    print("title =  %s" % title)
    for key in ('keywords', 'description', 'author', 'copyright'):
        print(meta.get(key, ''))
    print(content)

# Run the smoke test when the module is executed as a script.
if __name__ == '__main__':
    test()
Since you are seeing this, it means that your browser does not support cascading style sheets. Please download and use one of the many browsers that support web standards.
vsbabu.org

Parse an HTML page into metadata and body