Web Development :: Download and Save URL

Download and Save URL

Very often, I had to download pure HTML from a URL and save it to disk. Simple options include using Lynx/IE/Netscape and dumping the file, or taking two hours to figure out the proper options for wget. So, I wrote this script. The goal was to write something in Python and also to add to my way of maintaining the site: make it in Zope, download it to the PC, upload it to Csoft.

http://ctx:8080 is the address of my development Zope machine. You might want to change it :-)

#Given a URL, download its data into a file of the
#same name inside the corresponding folder.
#Used to slurp Zope documents into a file folder.
# vsbabu-removethis@hotmail.com

import string, urllib, urlparse, os, getopt, sys, re

# File name used by saveURL() when the URL path names no file.
default_filename = 'index.html'
# Default name of the URL list file; the -i option replaces it with the
# file given on the command line. saveURL() leaves the content of a
# downloaded file with this name unmodified.
list_filename = 'URLFILE.txt'

# Usage text printed when the command line cannot be parsed.
help = '''
slurp.py: Downloads the URLs specified to a local file.

arguments:
    A list of URLs
    OR
    -i input-file
When input-file is specified, it is assumed that that file
contains one URL per line. The program downloads all those
urls. Typically, that file is named URLFILE.txt.

options:
    -c folder   remove this text from the folder part of each URL
                path before the local folders are created
    -x prefix   download only the URLs that start with this prefix

Files are downloaded to a directory structure from the
current directory.
'''
class myURLOpener(urllib.FancyURLopener):
    """URL opener that supplies stored HTTP credentials itself.

    It overrides prompt_user_passwd() so that the user name and password
    registered with setpasswd() are returned instead of asking for them.
    """

    # Class-level defaults: without them prompt_user_passwd() raised
    # AttributeError whenever setpasswd() had not been called first.
    __user = None
    __passwd = None

    def setpasswd(self, user, passwd):
        """Store the user name and password to hand out on request."""
        self.__user = user
        self.__passwd = passwd

    def prompt_user_passwd(self, host, realm):
        """Return the stored (user, passwd) pair.

        host and realm are ignored: the same credentials are returned
        for every request. Both values are None until setpasswd() has
        been called.
        """
        return self.__user, self.__passwd

# Module-wide opener instance; readURL() opens every URL through it.
urlopener = myURLOpener()
# Some Zope 2.2.2 setups will not serve the site without a login.
# If you run into that, create a userid/password at the root folder
# of Zope, put those values in the line below and uncomment it.
#urlopener.setpasswd("userid", "password")

def readURL(url):
    """Return the complete content found at the given URL.

    The URL is opened through the module-level ``urlopener``; whatever
    that raises while opening or reading is passed on to the caller.
    """
    f = urlopener.open(url)
    try:
        return f.read()
    finally:
        # release the handle even when the read fails part-way
        f.close()

def saveURL(url):
    """Reads the given URL, saves the file
    """
    #get the folder name 
    folder = (urlparse.urlparse(url)[2])[1:]
    #this will have the filename too
    filename = os.path.basename(folder)
    foldername = os.path.dirname(folder)
    #use a default filename if not given
    if filename == '':
        filename = default_filename
    if foldername == '':
        foldername = os.curdir
    
    #make it if it is not there
    if cut_folder != '':
        foldername = string.replace(foldername,cut_folder,'')
    try:
        os.makedirs(foldername)
    except:
        pass
        # this assumes no error other than folder existence is raised

    #read the data from URL
    data = readURL(url)

    #do the server change processing
    #ideally this should process the file content
    #to make the links relative. The function
    #above is a good start.
    if filename != list_filename:
        fileext = ''
        try:
            fileext = os.path.splitext(filename)[1]
        except:
            pass
        if fileext in ('.txt','.htm','.html','.inc','.php3'):
            #change the base URLs to NULL
            data = re.sub('http://ctx:8080','',data)
            data = re.sub('<base href=.*?>','',data)
    
    #save the data into the file
    try:
        f = open(foldername + os.sep + filename,'wb')
        f.write(data)
        f.close()
    except:
        print foldername + os.sep + filename
    
    return len(data)

##################################################################
#                                                                #
#                   MAIN SECTION                                 #
#                                                                #
##################################################################

#Process command line arguments
try:
    optlist, args = getopt.getopt(sys.argv[1:], 'c:i:x:')
except getopt.GetoptError:
    print "Error :", sys.exc_info()[1]
    print help
    #a bad command line is a failure; do not report success
    sys.exit(2)

cut_folder = ''  #create folders only from this folder onwards
inc_url = ''     #include only urls starting with this
for opt,val in optlist:
    if opt == "-c":
        cut_folder = val
    elif opt == "-x":
        inc_url = val
    elif opt == "-i":
        list_filename = val
        #read the URLs from input file, one per line; these
        #replace any URLs given directly on the command line
        try:
            f = open(val,'r')
            try:
                args = f.readlines()
            finally:
                f.close()
        except (IOError, OSError):
            print "Error :", sys.exc_info()[1]
            sys.exit(1)


#download every URL, printing the URL and the number of bytes saved
for url in args:
    url = string.strip(url)
    #we don't want blank lines or urls outside the -x prefix
    if url and url.startswith(inc_url):
        print url,'...',saveURL(url)

Since you are seeing this, it means that your browser does not support cascading style sheets. Please download and use one of the many browsers that support web standards.

vsbabu.org

Download and Save URL