This is the hack21.py example from the examples directory of the mechanize source distribution. It shows, among other things, how to select, fill in, and submit a web form, which is the same technique used to log in to a password-protected site.
#!/usr/bin/env python
# Port of Hack 21 from the O'Reilly book "Spidering Hacks" by Tara
# Calishain and Kevin Hemenway.  Of course, there's no need to explicitly
# catch exceptions in Python, unlike checking error return values in Perl,
# but I've left those in for the sake of a direct port.
#
# What the script does: opens search.cpan.org, submits the first form with
# query "Lester" in "author" mode, follows the result link whose text
# matches "Andy", and saves every linked *.tar.gz file into the current
# working directory, printing each file's name and size.

import os
import re
import shutil
import sys

# HTTPError moved between Python 2 and Python 3; accept either location.
try:
    from urllib2 import HTTPError
except ImportError:
    from urllib.error import HTTPError

import mechanize

# Require at least mechanize 0.0.6a.
assert mechanize.__version__ >= (0, 0, 6, "a")

mech = mechanize.Browser()
# Addition 2005-01-05: Be naughty, since robots.txt asks not to
# access /search now.  We're not madly searching for everything, so
# I don't feel too guilty.
mech.set_handle_robots(False)
# Uncomment to trace the HTTP traffic while debugging:
#mech.set_debug_http(True)

# Get the starting search page
try:
    mech.open("http://search.cpan.org")
except HTTPError as e:
    sys.exit("%d: %s" % (e.code, e.msg))

# Select the form, fill the fields, and submit
mech.select_form(nr=0)
mech["query"] = "Lester"
mech["mode"] = ["author"]
try:
    mech.submit()
except HTTPError as e:
    sys.exit("post failed: %d: %s" % (e.code, e.msg))

# Find the link for "Andy"
try:
    mech.follow_link(text_regex=re.compile("Andy"))
except mechanize.LinkNotFoundError:
    # The results page had no link matching the pattern; exit cleanly
    # instead of dying with a traceback.
    sys.exit('no link matching "Andy" found on the results page')
except HTTPError as e:
    sys.exit("follow link failed: %d: %s" % (e.code, e.msg))

# Get all the tarballs
urls = [link.absolute_url for link in
        mech.links(url_regex=re.compile(r"\.tar\.gz$"))]
print("Found %d tarballs to download" % len(urls))

for url in urls:
    filename = os.path.basename(url)
    # Show the name before the transfer starts; the size is appended to the
    # same output line once the file has been written.
    sys.stdout.write("%s --> " % filename)
    sys.stdout.flush()
    # Request first, create the file second: a failed request then leaves
    # neither an empty file nor an open handle behind.
    response = mech.open(url)
    try:
        with open(filename, "wb") as out:
            shutil.copyfileobj(response, out)
    finally:
        response.close()
    print("%d bytes" % os.stat(filename).st_size)
No comments:
Post a Comment