Very often, I had to download pure HTML from a URL and save it
to the disk. Simple options include using Lynx/IE/Netscape and
dumping the file. Or, take 2 hours to figure out the proper
options for wget.
So, I wrote this script. The goal was to write something
in Python
and also
to add to my way of maintaining the site - make in Zope,
download to PC, upload to Csoft.
http://ctx:8080 is the address of my development Zope machine. You might want to change it :-)
# Given a URL, get the data and save it into a file of the
# same name inside the corresponding folder.
# Used to slurp Zope documents into a file folder.
# vsbabu-removethis@hotmail.com
import string, urllib, urlparse, os, getopt, sys, re
# File name used when the path of a URL does not end in a file name.
default_filename = 'index.html'
# Name of the file that lists the URLs; a downloaded file with this name
# is saved without the text clean-up applied to other files.
list_filename = 'URLFILE.txt'
# Usage text printed when the command line cannot be parsed.
# It now also describes -c and -x, which the option parser accepts.
help = '''
slurp.py: Downloads the URLs specified to a local file.
arguments:
A list of URLs
OR
-i input-file
When input-file is specified, it is assumed that that file
contains one URL per line. The program downloads all those
urls. Typically, that file is named URLFILE.txt.
options:
-c cut-folder
cut-folder is removed from the folder part of every URL before
the local folders are created.
-x url-prefix
Only URLs starting with url-prefix are downloaded.
Files are downloaded to a directory structure from the
current directory.
'''
class myURLOpener(urllib.FancyURLopener):
    """URL opener with automatic HTTP authentication.

    The user name and password stored with setpasswd() are returned by
    prompt_user_passwd(), which this class overrides from
    urllib.FancyURLopener.
    """

    # Class-level defaults: without them prompt_user_passwd() raised
    # AttributeError whenever setpasswd() had not been called first.
    __user = None
    __passwd = None

    def setpasswd(self, user, passwd):
        """Store the user name and password to authenticate with."""
        self.__user = user
        self.__passwd = passwd

    def prompt_user_passwd(self, host, realm):
        """Return the stored (user, passwd) pair.

        host and realm are ignored; (None, None) is returned when no
        credentials have been stored.
        """
        return self.__user, self.__passwd
# Module-level opener instance; readURL() opens every URL through it.
urlopener = myURLOpener()
# If you are facing some stupid problem with Zope 2.2.2,
# you might not be able to open the site without a login.
# In that case create a userid/password at the root folder of Zope,
# set those values in the line below and uncomment it.
#urlopener.setpasswd("userid", "password")
def readURL(url):
    """Return the data found at the given URL.

    The URL is opened through the module-level ``urlopener`` and read
    in one go.
    """
    f = urlopener.open(url)
    try:
        # the handle is closed even when the read fails part-way
        return f.read()
    finally:
        f.close()
def saveURL(url, base_url='http://ctx:8080'):
    """Read the given URL and save it to a file below the current folder.

    The path part of the URL gives the local folder and file name;
    ``default_filename`` is used when the URL names no file.  Every
    occurrence of the module-level ``cut_folder`` is removed from the
    folder name, and the folder is created when it is missing.

    For files other than ``list_filename`` whose extension is .txt,
    .htm, .html, .inc or .php3, ``base_url`` and any ``<base href=...>``
    tag are removed from the content before it is written.

    url      -- the URL to download
    base_url -- server address to delete from text content

    Returns the length of the (possibly processed) data.  When the file
    cannot be written its path is printed and the length is still
    returned.
    """
    # path part of the URL without the leading '/'; has the file name too
    urlpath = (urlparse.urlparse(url)[2])[1:]
    # use a default filename if not given
    filename = os.path.basename(urlpath) or default_filename
    foldername = os.path.dirname(urlpath)
    if cut_folder != '':
        foldername = string.replace(foldername, cut_folder, '')
    # Cutting can leave a leading separator or nothing at all.  Left as
    # is, the file would end up under the filesystem root instead of
    # under the current directory.
    while foldername[:1] in ('/', os.sep):
        foldername = foldername[1:]
    if foldername == '':
        foldername = os.curdir
    # make the folder if it is not there
    if not os.path.isdir(foldername):
        try:
            os.makedirs(foldername)
        except OSError:
            # best effort: the write below fails and reports the path
            pass
    # read the data from URL
    data = readURL(url)
    # do the server change processing
    # ideally this should process the file content
    # to make the links relative.
    if filename != list_filename:
        fileext = os.path.splitext(filename)[1]
        if fileext in ('.txt', '.htm', '.html', '.inc', '.php3'):
            # change the base URLs to NULL
            if base_url:
                data = string.replace(data, base_url, '')
            data = re.sub('<base href=.*?>', '', data)
    # save the data into the file
    target = os.path.join(foldername, filename)
    try:
        f = open(target, 'wb')
        try:
            f.write(data)
        finally:
            f.close()
    except (IOError, OSError):
        # report the file that could not be written and carry on
        print(target)
    return len(data)
##################################################################
# #
# MAIN SECTION #
# #
##################################################################
# Process command line arguments:
#   -c val  folder name to cut out of the local folder names
#   -x val  download only URLs starting with val
#   -i val  read the URLs from file val instead of the argument list
try:
    optlist, args = getopt.getopt(sys.argv[1:], 'c:i:x:')
except getopt.error:
    print("Error : %s" % (sys.exc_info()[1],))
    print(help)
    # non-zero status so callers can tell the run failed
    sys.exit(2)
cut_folder = ''  # create folders only from this folder onwards
inc_url = ''  # include only urls starting with this
for opt, val in optlist:
    if opt == "-c":
        cut_folder = val
    elif opt == "-x":
        inc_url = val
    elif opt == "-i":
        list_filename = val
        # read the URLs from input file, one per line
        try:
            f = open(val, 'r')
            try:
                args = f.readlines()
            finally:
                f.close()
        except IOError:
            print("Error : %s" % (sys.exc_info()[1],))
            sys.exit(1)
for url in args:
    url = string.strip(url)
    # we don't want blank lines, nor URLs outside the requested prefix
    if url and string.find(url, inc_url) == 0:
        # show the URL before the download starts, then its size
        sys.stdout.write(url + ' ... ')
        sys.stdout.flush()
        print(saveURL(url))
Since you are seeing this, it means that your browser does not support cascading style sheets. Please download and use one of the many browsers that support web standards.