Very often, I have had to download plain HTML from a URL and save it to disk. Simple options include using Lynx/IE/Netscape and dumping the file, or spending two hours figuring out the proper options for wget. So I wrote this script. The goal was to write something in Python and also to support my way of maintaining the site: make in Zope, download to the PC, upload to Csoft.
http://ctx:8080 is the address of my development Zope machine. You might want to change it :-)
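The script below blanks that address out of downloaded pages so the saved links become site-relative. Here is a minimal sketch of those two substitutions, run on a made-up fragment of Zope output:

import re

data = '<base href="http://ctx:8080/site/"><a href="http://ctx:8080/site/x.html">x</a>'
#absolute links become site-relative
data = re.sub('http://ctx:8080', '', data)
#drop the <base> tag entirely
data = re.sub('<base href=.*?>', '', data)
print data #prints: <a href="/site/x.html">x</a>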
#Given a URL, get the data into a file of the
#same name in the corresponding folder.
#Used to slurp Zope documents into a file folder.
# vsbabu-removethis@hotmail.com

import string, urllib, urlparse, os, getopt, sys, re

default_filename = 'index.html'
list_filename = 'URLFILE.txt'

help = '''
slurp.py: Downloads the URLs specified to a local file.

arguments: A list of URLs OR -i input-file

When input-file is specified, it is assumed that the file
contains one URL per line. The program downloads all of
those URLs. Typically, that file is named URLFILE.txt.

Files are downloaded to a directory structure from the
current directory.
'''

class myURLOpener(urllib.FancyURLopener):
    #read a URL, with automatic HTTP authentication
    def setpasswd(self, user, passwd):
        self.__user = user
        self.__passwd = passwd

    def prompt_user_passwd(self, host, realm):
        return self.__user, self.__passwd

urlopener = myURLOpener()
#if you are facing some stupid problem with Zope 2.2.2,
#you might not be able to open the site without a login.
#so create a userid/password at the root folder of Zope,
#set those values in the line below and uncomment it.
#urlopener.setpasswd("userid", "password")

def readURL(url):
    """Returns the file found at the URL"""
    f = urlopener.open(url)
    data = f.read()
    f.close()
    return data

def saveURL(url):
    """Reads the given URL, saves the file"""
    #get the folder name; this will have the filename too
    folder = (urlparse.urlparse(url)[2])[1:]
    filename = os.path.basename(folder)
    foldername = os.path.dirname(folder)
    #use a default filename if not given
    if filename == '':
        filename = default_filename
    if foldername == '':
        foldername = os.curdir
    #cut_folder is a global set in the main section below
    if cut_folder != '':
        foldername = string.replace(foldername, cut_folder, '')
    #make the folder if it is not there
    try:
        os.makedirs(foldername)
    except:
        pass # this assumes no error other than folder existence is raised
    #read the data from the URL
    data = readURL(url)
    #do the server change processing.
    #ideally this should process the file content
    #to make the links relative; the substitutions
    #below are a good start.
    if filename != list_filename:
        fileext = ''
        try:
            fileext = os.path.splitext(filename)[1]
        except:
            pass
        if fileext in ('.txt', '.htm', '.html', '.inc', '.php3'):
            #change the base URLs to NULL
            data = re.sub('http://ctx:8080', '', data)
            data = re.sub('<base href=.*?>', '', data)
    #save the data into the file
    try:
        f = open(foldername + os.sep + filename, 'wb')
        f.write(data)
        f.close()
    except:
        print foldername + os.sep + filename
    return len(data)

##################################################################
#                                                                #
#                          MAIN SECTION                          #
#                                                                #
##################################################################

#process command line arguments
try:
    optlist, args = getopt.getopt(sys.argv[1:], 'c:i:x:')
except:
    print "Error :", sys.exc_info()[1]
    print help
    sys.exit(0)

cut_folder = '' #create folders only from this folder onwards
inc_url = ''    #include only URLs starting with this
if optlist:
    for opt, val in optlist:
        if opt == "-c":
            cut_folder = val
        if opt == "-x":
            inc_url = val
        if opt == "-i":
            list_filename = val
            #read the URLs from the input file, one per line
            try:
                f = open(val, 'r')
                args = f.readlines()
                f.close()
            except:
                print "Error :", sys.exc_info()[1]
                sys.exit(0)

for url in args:
    url = string.strip(url)
    if url: #we don't want blank lines
        if string.find(url, inc_url) == 0:
            print url, '...', saveURL(url)
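For reference, this is roughly how saveURL() turns a URL into a local path; a minimal sketch using the same urlparse calls, with the URL and the -c value made up for illustration:

import os, string, urlparse

url = 'http://ctx:8080/site/docs/page.html' #hypothetical URL
cut_folder = 'site/'                        #hypothetical -c value

folder = (urlparse.urlparse(url)[2])[1:]    #'site/docs/page.html'
filename = os.path.basename(folder)         #'page.html'
foldername = os.path.dirname(folder)        #'site/docs'
foldername = string.replace(foldername, cut_folder, '') #'docs'
print foldername + os.sep + filename        #'docs/page.html' on Unix

So a typical run against my setup would look something like python slurp.py -c site/ -x http://ctx:8080 -i URLFILE.txt, which downloads every URL in URLFILE.txt that starts with http://ctx:8080 and drops the leading site/ from the created directories (the folder names here are, of course, just examples).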