#!/usr/bin/env python
"""
THIS IS DEPRECATED: PLEASE REFER TO CVS FOR THE LATEST VERSION
http://vsbabu.org/tools/viewcvs.cgi/pyblagg/

pyblagg: quick aggregator for Linux/Unix machines.

This is an AGGREGATOR, not a news reader.  Most readers out there
still force me to go to each channel I subscribe to and figure out
what the latest news is.  This collects all the channels, aggregates
the news and then displays the latest news first.

Installation and usage:
 - get Mark Pilgrim's ultraliberal parser - see URL below
 - you need to patch it.  Add the following 4 lines to the class
   RSSParser:
       start_date = start_dc_date
       end_date = end_dc_date
       start_pubdate = start_dc_date
       end_pubdate = end_dc_date
 - get the log4py module - see URL below
 - edit configuration settings in your file ~/.pyblagg
 - give it execute permission
 - run it from the command line periodically.  I don't recommend
   running this from CGI.

TODO:
 - Take the timezone into consideration
 - At the moment, this uses my template.  Need to change it to use
   some nice templating language.
 - It is a bad, bad world out there for RSS/RDF.  Everyone seems to
   have their own definitions.  Make the error checking more graceful.
 - Use pickle or some other persistent module instead of saving
   feeds in files.

NOTE(review): this copy of the file was damaged — an HTML-stripping
pass removed every <...> span, which destroyed the helper functions
(titleOrdesc, parsetime, permalink2localfile), most of the Channel
class, and every HTML template string in the __main__ section.  The
destroyed pieces below are reconstructions, each marked with a
NOTE(review)/TODO(review) comment; restore the authoritative text
from CVS (URL above).
"""
__author__ = "S Babu "
__date__ = "20 September 2002"
__credits__ = """Mark Pilgrim, for liberal RSS parser
http://www.diveintomark.org/archives/2002/08/13.html#ultraliberal_rss_parser
Martin Preishuber, for log4py
http://sourceforge.net/project/showfiles.php?group_id=36216
"""
__version__ = "$Revision: 1.2 $"
#$Source: /home/vsbabu/repository/python/pyblagg/pyblagg.py,v $
#$Log: pyblagg.py,v $
#Revision 1.2  2002/09/23 20:51:33  vsbabu
#Removed setup options from the script to a configuration file. It is parsed using
#ConfigParser module. See the end of the script for sample.

# ---------- start configuration ------------------------
config_file = "/home/vsbabu/.pyblagg"
log_level = 2
# log level can be 0 .. 4 as specified in log4py
# 0 = none, 1 = only errors, 2 = normal, 3 = verbose, 4 = debug
# This is overridden later by the configuration file setting
# ---------- end configuration ------------------------

import rssparser
import os
import time
import re
from log4py import Logger

log4py = Logger().get_instance()
# NOTE(review): the original call was truncated at "set_loglevel(1<" in
# this copy; "1 << log_level" matches the 0..4 power-of-two scheme the
# configuration comment describes — confirm against the log4py docs.
log4py.set_loglevel(1 << log_level)


def titleOrdesc(dict):
    """Return a (title, description) pair for one parsed feed item.

    NOTE(review): reconstructed — the original definition was destroyed
    in this copy.  NewsItem.__init__ unpacks exactly two values from it,
    and the main loop treats a None description as "no description".
    """
    title = dict.get('title')
    description = dict.get('description')
    if not title:
        # No title in the feed: fall back to a prefix of the
        # description (or the link) so every item displays something.
        if description:
            title = description[:40]
            description = None
        else:
            title = dict.get('link', '')
    return title, description


def parsetime(datestr):
    """Parse a dc:date / ISO-8601 timestamp into local epoch seconds.

    NOTE(review): reconstructed.  The timezone suffix is ignored — see
    the module TODO.  Returns None when the string cannot be parsed,
    which makes the main loop fall back to file/system time.
    """
    m = re.match(r'(\d{4})-(\d{2})-(\d{2})[T ](\d{2}):(\d{2})(?::(\d{2}))?',
                 datestr)
    if m is None:
        return None
    y, mo, d, h, mi, s = [int(x or 0) for x in m.groups()]
    return time.mktime((y, mo, d, h, mi, s, 0, 0, -1))


def permalink2localfile(link):
    """Map a permalink URL to a safe, flat local cache filename.

    NOTE(review): reconstructed — every character that is unsafe in a
    filename is replaced so items cache as flat files under datadir.
    """
    return re.sub(r'[^A-Za-z0-9]', '_', link)


class Channel:
    """One subscribed feed: its URL, display name and parsed items.

    NOTE(review): reconstructed except for the tail of htmlLink(),
    which survived in the damaged copy and fixes the attribute names
    (self.link / self.description / self.title) used here.
    """

    def __init__(self, link, name=None):
        self.link = link
        # Display name from the config file; falls back to the URL
        # until loadChannel() supplies the feed's own title.
        self.title = name or link
        self.description = None
        self.items = []

    def loadChannel(self):
        """Download and parse the feed; fills title/description/items.

        Network/parse errors propagate to the caller, which logs and
        skips the channel (best-effort aggregation).
        """
        data = rssparser.parse(self.link)
        channel = data.get('channel', {})
        if channel.get('title'):
            self.title = channel['title']
        self.description = channel.get('description')
        self.items = [NewsItem(item) for item in data.get('items', [])]

    def htmlLink(self):
        """Return an HTML anchor for this channel.

        With a description available it becomes the link's tooltip;
        the format arguments below are the surviving original code.
        """
        if self.description is not None:
            return """<a href="%s" title="%s">%s</a>""" % \
                   (self.link, self.description, self.title)
        else:
            return """<a href="%s">%s</a>""" % (self.link, self.title)


class NewsItem:
    """Each item in a channel.

    Wraps one item dict from rssparser: permalink, title/description
    pair, optional parsed date, and the local cache filename.
    """
    def __init__(self, dict):
        # 'link' is mandatory; a feed item without it raises KeyError
        # and the whole channel is skipped by the caller's handler.
        self.link = dict['link']
        self.title, self.description = titleOrdesc(dict)
        if 'date' in dict.keys():
            self.date = parsetime(dict['date'])
        else:
            # No date supplied: the main loop substitutes the cached
            # file's mtime, or the current system time.
            self.date = None
        self.filenm = permalink2localfile(self.link)


if __name__ == '__main__':
    log4py.info("startup")
    log4py.info("get configuration options from " + config_file)
    from ConfigParser import ConfigParser
    config = ConfigParser()
    # Close the handle instead of leaking it (original did a bare
    # config.readfp(open(config_file))).
    fp = open(config_file)
    try:
        config.readfp(fp)
    finally:
        fp.close()
    datadir = config.get("DEFAULT", "data_directory")
    output = config.get("DEFAULT", "output_html_file")
    blog_items_to_show = config.getint("DEFAULT", "blog_items_to_show")
    log_level = config.getint("DEFAULT", "log_level")
    dirlist_rev_sort = config.get("DEFAULT", "directory_command")
    # Apply the override the configuration-section comment promises;
    # the original only reassigned the variable without re-applying it.
    # TODO(review): confirm the 1 << level mapping against log4py.
    log4py.set_loglevel(1 << log_level)
    log4py.info("got default configuration options")
    log4py.debug("Data directory = " + datadir)
    log4py.debug("Output file = " + output)
    log4py.debug("Number of items shown = %d" % blog_items_to_show)

    # Each non-DEFAULT section name is a feed URL; the optional "name"
    # option is its display name.
    blogroll = {}
    for blog in config.sections():
        if config.has_option(blog, "name"):
            blogroll[blog] = config.get(blog, "name")
        else:
            blogroll[blog] = None
    log4py.info("got blog information")

    # let us first update our mini database
    channels = []
    for blog in blogroll.keys():
        log4py.info("creating channel [%s]" % blog)
        channel = Channel(blog, blogroll[blog])
        try:
            channel.loadChannel()
        except Exception:
            # Narrowed from a bare except: best-effort — log the
            # failed feed and keep aggregating the rest.
            log4py.error("Could not download channel => " + blog)
            continue
        channels.append(channel)
    log4py.info("collected all channels")

    log4py.info("start processing channels")
    blogroll_string = ""
    for channel in channels:
        log4py.debug("\t channel:" + channel.title)
        # TODO(review): the original separator markup between blogroll
        # links was stripped from this copy; <br/> is a placeholder.
        blogroll_string = blogroll_string + channel.htmlLink() + "<br/>"
        for item in channel.items:
            filenm = datadir + os.sep + item.filenm
            log4py.debug("\t\t" + item.title[:20] + "[" + filenm + "]")
            if item.date is None:
                if os.path.exists(filenm):
                    # index 8 of os.stat() = st_mtime
                    item.date = os.stat(filenm)[8]
                    log4py.warn("[%s]%s - had to use existing file time" %
                                (channel.title, item.title[:20]))
                else:
                    item.date = time.mktime(time.localtime())
                    log4py.warn("[%s]%s - had to use system time" %
                                (channel.title, item.title[:20]))
            # this is probably a bad idea to update it always. then again,
            # people don't put 1001 things on their rdf's
            # TODO(review): anchor markup reconstructed; the surviving
            # code formats (item.link, item.title) into the template.
            entry = """<a href="%s">%s</a>\n""" % (item.link, item.title)
            if item.description is not None:
                entry = entry + "\n" + item.description
            # TODO(review): trailing markup reconstructed (tag stripped).
            entry = entry + '<br/>\n'
            try:
                fp = open(filenm, "w")
                fp.write(entry)
                fp.close()
            except IOError:
                # Original passed filenm as a stray second positional
                # argument; concatenate so the filename appears in the
                # logged message.
                log4py.error("Could not save => " + filenm)
                continue
            # Stamp the cache file with the item date so a plain
            # "ls -t" yields newest-first ordering below.
            if item.date is not None:
                os.utime(filenm, (item.date, item.date))
    log4py.info("finished processing channels")

    log4py.info("preparing output file " + output)
    ofp = open(output, "w")
    # TODO(review): the original page-header template was destroyed in
    # this copy; this is a minimal placeholder — restore from CVS.
    ofp.write("""<html><head><title>pyblagg</title></head><body>\n""")
    log4py.info("getting channel data")
    # directory_command (e.g. "ls -t") lists the cache newest-first;
    # it comes from the user's own trusted config file, so os.popen
    # with it is acceptable here.
    aggritems = os.popen(dirlist_rev_sort + ' ' +
                         datadir).readlines()[:blog_items_to_show]
    for blogitem in aggritems:
        blogitem = datadir + os.sep + blogitem.strip()
        blogdate = time.strftime("%Y.%m.%d %I:%M",
                                 time.localtime(os.stat(blogitem)[8]))
        # TODO(review): per-item markup reconstructed; the original
        # (which embedded blogdate) was destroyed in this copy.
        ofp.write("<p><em>" + blogdate + "</em><br/>" +
                  open(blogitem, "r").read() + "</p>\n")
    # TODO(review): footer template reconstructed; the surviving code
    # shows it took two %s slots — generation time and the blogroll.
    ofp.write("""<hr/><p>Generated %s</p><p>%s</p></body></html>\n""" % (
        time.strftime("%B %d, %Y %I:%M %p", time.localtime()),
        blogroll_string))
    ofp.close()
    log4py.info("finished making output " + output)

#---------------- sample config file ------------------------
#[DEFAULT]
#data_directory = /home/vsbabu/software/pyblagg/data
#output_html_file = /home/vsbabu/www/mt/feeds/index.html
#blog_items_to_show = 60
#directory_command = ls -t
#log_level = 2
#
#[http://www.diveintomark.org/xml/rss.xml]
#name = Mark Pilgrim
#
#[http://www.brunningonline.net/simon/blog/index.rdf]
#name = Simon Brunning
#
#[http://vsbabu.org/tharunya/blog/index.rdf]
#name = Tharunya Bhasker