wget Web Crawler Thingie
Project of KatieRivard, listed in ProjectIdeas
"Smart" wget (python)
from time import sleep
from urlparse import urlparse
import httplib
import sys
import re

def getDisallows(conn):
    # Fetch /robots.txt on the given connection and collect its Disallow entries.
    conn.connect()
    conn.request('GET', "/robots.txt")
    disallowtable = []
    try:
        resp = conn.getresponse()
        tmp = resp.read()
        if resp.status != 200:
            # expect 404 (no robots.txt), otherwise some weird problem
            if resp.status != 404:
                print resp.status, resp.reason
        else:
            lines = re.split("\n", tmp)
            for i in lines:
                if re.match("^#", i) or i == '':
                    # ignore comments and blank lines
                    pass
                else:
                    [tag, value] = re.split(":", i, 1)
                    if tag.lower() == "disallow":
                        disallowtable.append(value.strip())
    except httplib.ResponseNotReady, msg:
        print "Response not ready. ", msg
    return disallowtable

def getfile(conn, path, count=5):
    # Fetch path, following 302 redirects and retrying on ResponseNotReady
    # until count runs out.
    conn.request('GET', path)
    while count > 0:
        try:
            resp = conn.getresponse()
            tmp = resp.read()
            if resp.status == 200:
                return tmp
            elif resp.status == 302:
                newpath = resp.getheader("Location", None)
                if newpath != None:
                    count = count - 1
                    conn.request('GET', newpath)
                else:
                    print "Document moved, no forwarding address. :("
                    return None
            elif resp.status in [404, 400]:
                print resp.status, resp.reason
                return None
        except httplib.ResponseNotReady, msg:
            count = count - 2
            sleep(1)
            print "Trial count: ", count
    print "Response not ready or Document moved too many times."
    return None

def smrtWget(url):
    # Fetch url only if its path isn't matched by a robots.txt Disallow entry.
    (scheme, loc, path, pram, query, frag) = urlparse(url)
    conn = httplib.HTTPConnection(loc)
    dis = getDisallows(conn)
    go = True
    for i in dis:
        if re.search(i, path):
            go = False
            print "Matched %s on entry \"%s\" in robots.txt" % (path, i)
    if go:
        return getfile(conn, path)
    else:
        print "Robot traversal not allowed on "+loc
        return None
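To show how this is meant to be called, here is a minimal, hypothetical driver (the URL and output filename are made-up examples, not part of the project): it fetches one page with smrtWget and writes it to disk if the fetch succeeded and robots.txt allowed it.

# Hypothetical driver for smrtWget -- example URL and filename only.
from smrtWget import smrtWget

page = smrtWget("http://www.example.com/index.html")
if page != None:
    out = open("index.html", "w")
    out.write(page)
    out.close()
else:
    print "Fetch failed or disallowed by robots.txt"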
Crawler (python)
import sys
sys.path.append("/home/krivard/prog/wget/python")
from smrtWget import smrtWget
from re import split

class UrlsDocument:
    # Wraps a document string and doles out the absolute URLs found in its
    # <a href="http:..."> tags, one per call to nextUrl().
    def __init__(self, str):
        self.docstr = str
    def nextUrl(self):
        if not self.docstr.find("<a href=\"http:") < 0:
            self.docstr = split("<a href=\"http:", self.docstr, 1)[1]
            [url, self.docstr] = split("\"", self.docstr, 1)
            return "http:"+url
        return None

class UrlQueue:
    # FIFO of URLs still to visit; 'past' remembers everything ever queued
    # so a URL is never added twice.
    def __init__(self):
        self.q = []
        self.past = []
    def next(self):
        if len(self.q) == 0:
            raise RuntimeError, 'Queue empty.'
        item = self.q.pop(0)
        self.past.append(item)
        return item
    def add(self, url):
        if url not in self.q + self.past:
            self.q.append(url)

class Crawler:
    def __init__(self, docstr):
        self.rootfile = docstr
        self.queue = UrlQueue()
        self.initqueue()
    def initqueue(self):
        # Seed the queue with every URL found in the root document.
        doc = UrlsDocument(self.rootfile)
        url = doc.nextUrl()
        while url != None:
            self.queue.add(url)
            url = doc.nextUrl()
    def crawlOneDoc(self):
        # Pop the next URL, fetch it, and queue any URLs found in the result.
        try:
            url = None
            url = self.queue.next()
            str = smrtWget(url)
        except RuntimeError, msg:
            print "Error: %s" % msg
        except:
            print "No good: %s" % url
        else:
            print "Crawling %s" % url
            if str != None:
                doc = UrlsDocument(str)
                url = doc.nextUrl()
                while url != None:
                    self.queue.add(url)
                    url = doc.nextUrl()
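Similarly, here is a sketch of a driver loop for the crawler. The seed URL, the module name crawler.py, and the cap of ten documents are all assumptions made for the example, not part of the project:

# Hypothetical driver: seed the crawler with one page, then crawl a handful of documents.
from smrtWget import smrtWget
from crawler import Crawler   # assumes the classes above are saved as crawler.py

seed = smrtWget("http://www.example.com/")   # example seed URL
if seed != None:
    c = Crawler(seed)
    for i in range(10):   # arbitrary cap for the sketch
        c.crawlOneDoc()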
a note on Scheme wget
It's definitely possible. It'd be a lot more practical if we could either figure out the PLT/DrScheme libraries or write some of our own to make up for the lack of text-processing utilities (the regexp libraries I've tried to look at are not easy to figure out). I spent quite a bit of time trying to work out how to pull the words I wanted out of a given string, and it's still not easy; I suspect my approach is wrong.
Needless to say, writing this stuff in Scheme appears to be significantly more time-consuming than in Python or Java (though take that with a gigantic hunk of salt: my limited experience with the language was probably fouling things up, too). Many things may already exist that I just didn't know how to look for, so I ended up writing them myself. That's not necessarily a bad thing, but it does take time, which should be taken into account when summing up hours and whatnot.
Bits of code I wrote:
- basic wget driven by an interactive prompt (rather than command-line arguments)
- reading a tag from an input port
- separating tags from regular text (a Python sketch of roughly this idea appears after the list)
- reading a word from an input port (used Dybvig as a starting point)
- reading a line from a string
- reading a word from a string
- stacks (used SICP as a starting point)
- recognizing the presence of a robots.txt file at the server root (parsing it isn't handled yet)
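For comparison with how much had to be hand-rolled in Scheme, here is roughly what the tag-vs-text separation looks like in Python with the re module. This is just an illustrative sketch (the function name and sample string are invented), not a port of the Scheme code:

import re

def separate_tags(html):
    # Split an HTML string on '<...>' tags; the capturing group keeps the
    # tags themselves in the result, so we get alternating text and tags.
    pieces = re.split("(<[^>]*>)", html)
    tags = [p for p in pieces if p.startswith("<")]
    text = [p for p in pieces if p and not p.startswith("<")]
    return tags, text

tags, text = separate_tags("<p>Hello <b>world</b></p>")
print tags   # ['<p>', '<b>', '</b>', '</p>']
print text   # ['Hello ', 'world']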