jump to content

Very often, I had to download pure HTML from a URL and save it to the disk. Simple options include using Lynx/IE/Netscape and dumping the file. Or, take 2 hours to figure out the proper options for wget. So, I wrote this script. The goal was to write something in Python and also to add to my way of maintaining the site - make it in Zope, download to PC, upload to Csoft.

http://ctx:8080 is the address of my development Zope machine. You might want to change it :-)

#Given a URL, fetch the data and save it into a file of the
#same name in the corresponding folder.
#Used to slurp Zope documents into a local folder tree.
# vsbabu-removethis@hotmail.com

import string, urllib, urlparse, os, getopt, sys, re

# File name used when a URL path does not name a file (ends in a slash).
default_filename = 'index.html'
# Name of the URL list file; saveURL leaves a file with this name
# unprocessed, and the -i option replaces the value.
list_filename = 'URLFILE.txt'

# Usage text printed by the main section.
help = '''
slurp.py: Downloads the URLs specified to a local file.

    A list of URLs
    -i input-file
When input-file is specified, it is assumed that that file
contains one URL per line. The program downloads all those
urls. Typically, that file is name URLFILE.txt.

Files are downloaded to a directory structure from the
current directory.
'''
class myURLOpener(urllib.FancyURLopener):
    """URL opener that answers HTTP authentication from stored credentials.

    Call setpasswd() once to store a user name and password; they are
    then handed back whenever the opener asks for credentials, so no
    interactive prompt is shown.
    """

    # Class-level defaults so prompt_user_passwd() works even when
    # setpasswd() was never called (previously an AttributeError).
    __user = None
    __passwd = None

    def setpasswd(self, user, passwd):
        """Store the credentials to present when a server asks for them."""
        self.__user = user
        self.__passwd = passwd

    def prompt_user_passwd(self, host, realm):
        """Return the stored (user, password) pair for any host and realm.

        Returns (None, None) when no credentials were stored, which lets
        the opener treat the request as unauthenticated instead of
        failing on a missing attribute.
        """
        return self.__user, self.__passwd

# Shared opener instance; readURL uses it for every download.
urlopener = myURLOpener()
# Some Zope 2.2.2 setups will not serve the site without a login.
# If you run into that, create a userid/password at the root folder
# of Zope, put those values in the line below and uncomment it.
#urlopener.setpasswd("userid", "password")

def readURL(url):
    """Return the content found at the given URL.

    The URL is opened through the module-level ``urlopener``.  Errors
    raised while opening or reading propagate to the caller.
    """
    f = urlopener.open(url)
    try:
        return f.read()
    finally:
        # release the connection even when the read fails
        f.close()

def saveURL(url, base_url='http://ctx:8080'):
    """Read the given URL and save it to a local file mirroring its path.

    The path component of the URL decides where the file goes: its
    directory part becomes the local folder (relative to the current
    directory) and its last component the file name.  ``default_filename``
    is used when the path does not name a file.  When the module-level
    ``cut_folder`` is non-empty, every occurrence of it is removed from
    the folder name.  The folder is created if it does not exist yet.

    Unless the file is named ``list_filename``, files with a text-like
    extension (.txt, .htm, .html, .inc, .php3) get every occurrence of
    base_url and every ``<base href=...>`` tag removed before saving.

    Arguments:
        url      -- the address to download
        base_url -- server prefix stripped from text content

    Returns the length of the data that was written.  Errors from the
    download, from creating the folder or from writing the file
    propagate to the caller.
    """
    # path component without its leading slash; still includes the filename
    path = urlparse.urlparse(url)[2][1:]
    # use a default filename if the URL does not give one
    filename = os.path.basename(path) or default_filename
    foldername = os.path.dirname(path)
    if cut_folder != '':
        foldername = foldername.replace(cut_folder, '')
    # Cutting can leave a leading slash or nothing at all; either one
    # would place the file at the filesystem root instead of below the
    # current directory.
    foldername = foldername.lstrip('/') or os.curdir

    # read the data from URL
    data = readURL(url)

    # Do the server change processing.  Ideally this would rewrite the
    # links in the content to be relative; for now it only drops the
    # server prefix and any <base> tag.
    if filename != list_filename:
        fileext = os.path.splitext(filename)[1]
        if fileext in ('.txt', '.htm', '.html', '.inc', '.php3'):
            data = data.replace(base_url, '')
            data = re.sub(r'<base href=.*?>', '', data)

    # make the folder if it is not there, then save the data into the file
    if not os.path.isdir(foldername):
        os.makedirs(foldername)
    target = os.path.join(foldername, filename)
    f = open(target, 'wb')
    try:
        f.write(data)
    finally:
        f.close()
    print(target)
    return len(data)

#                                                                #
#                   MAIN SECTION                                 #
#                                                                #

#Process command line arguments
#   -c text    remove this text from the folder names that get created
#   -x prefix  download only the URLs that start with this prefix
#   -i file    read the URLs from this file, one per line
try:
    optlist, args = getopt.getopt(sys.argv[1:], 'c:i:x:')
except getopt.GetoptError:
    # unknown option or missing option value: explain and stop,
    # since nothing below can run without optlist/args
    print("Error : %s" % (sys.exc_info()[1],))
    print(help)
    sys.exit(2)
cut_folder = ''  #create folders only from this folder onwards
inc_url = ''     #include only urls starting with this
for opt, val in optlist:
    if opt == "-c":
        cut_folder = val
    elif opt == "-x":
        inc_url = val
    elif opt == "-i":
        list_filename = val
        #read the URLs from input file, one per line
        try:
            f = open(val, 'r')
            try:
                args = f.readlines()
            finally:
                f.close()
        except IOError:
            # report and carry on with the URLs given on the command line
            print("Error : %s" % (sys.exc_info()[1],))

for url in args:
    url = string.strip(url)
    if url:   # we don't want blank lines
        if string.find(url,inc_url)==0:
            print url,'...',saveURL(url)