wget Web Crawler Thingie
Project of KatieRivard, listed in ProjectIdeas
"Smart" wget (python)
from time import sleep
from urlparse import urlparse
import httplib
import sys
import re

def getDisallows(conn):
    # Fetch /robots.txt over the given connection and collect the
    # Disallow entries.
    conn.connect()
    conn.request('GET', "/robots.txt")
    disallowtable = []
    try:
        resp = conn.getresponse()
        tmp = resp.read()
        if resp.status != 200:
            # expect 404 (no robots.txt); anything else is some weird problem
            if resp.status != 404: print resp.status, resp.reason
        else:
            lines = re.split("\n", tmp)
            for i in lines:
                i = i.strip()
                if re.match("^#", i) or i == '' or ":" not in i:
                    # ignore comments, blank lines, and malformed lines
                    pass
                else:
                    [tag, value] = re.split(":", i, 1)
                    if tag.lower() == "disallow":
                        disallowtable.append(value.strip())
    except httplib.ResponseNotReady, msg:
        print "Response not ready. ", msg
    return disallowtable
def getfile(conn, path, count=5):
    # GET the given path, following redirects a few times and retrying
    # briefly if the response isn't ready.
    conn.request('GET', path)
    while count > 0:
        try:
            resp = conn.getresponse()
            tmp = resp.read()
            if resp.status == 200:
                return tmp
            elif resp.status == 302:
                newpath = resp.getheader("Location", None)
                if newpath != None:
                    # re-request the new location on the same connection
                    count = count - 1
                    conn.request('GET', newpath)
                else:
                    print "Document moved, no forwarding address. :("
                    return None
            elif resp.status in [404, 400]:
                print resp.status, resp.reason
                return None
        except httplib.ResponseNotReady, msg:
            count = count - 2
            sleep(1)
            print "Trial count: ", count
    print "Response not ready or Document moved too many times."
    return None
def smrtWget(url):
    # Fetch a URL, but only if the server's robots.txt doesn't disallow
    # the path.  Each Disallow entry is treated as a regular expression.
    (scheme, loc, path, pram, query, frag) = urlparse(url)
    conn = httplib.HTTPConnection(loc)
    dis = getDisallows(conn)
    go = True
    for i in dis:
        if re.search(i, path):
            go = False
            print "Matched %s on entry \"%s\" in robots.txt" % (path, i)
    if go:
        return getfile(conn, path)
    else:
        print "Robot traversal not allowed on "+loc
        return None
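A minimal usage sketch, assuming the above is saved as smrtWget.py (the name the crawler below imports it under); the URL is just a placeholder:

if __name__ == '__main__':
    # Fetch a single page, respecting robots.txt, and show the first
    # couple hundred characters of what came back.
    content = smrtWget("http://example.com/index.html")
    if content != None:
        print content[:200]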
Crawler (python)
import sys
sys.path.append("/home/krivard/prog/wget/python")
from smrtWget import smrtWget
from re import split

class UrlsDocument:
    # Wraps a fetched HTML document and hands out the absolute http
    # links it contains, one at a time.
    def __init__(self, docstr):
        self.docstr = docstr
    def nextUrl(self):
        if self.docstr.find("<a href=\"http:") >= 0:
            self.docstr = split("<a href=\"http:", self.docstr, 1)[1]
            [url, self.docstr] = split("\"", self.docstr, 1)
            return "http:"+url
        return None
class UrlQueue:
    # FIFO of URLs left to visit; remembers what has already been handed
    # out so nothing is queued twice.
    def __init__(self):
        self.q = []
        self.past = []
    def next(self):
        if len(self.q) == 0:
            raise RuntimeError, 'Queue empty.'
        item = self.q.pop(0)
        self.past.append(item)
        return item
    def add(self, url):
        if url not in self.q + self.past:
            self.q.append(url)
class Crawler:
    def __init__(self, docstr):
        self.rootfile = docstr
        self.queue = UrlQueue()
        self.initqueue()
    def initqueue(self):
        # Seed the queue with every link found in the root document.
        doc = UrlsDocument(self.rootfile)
        url = doc.nextUrl()
        while url != None:
            self.queue.add(url)
            url = doc.nextUrl()
    def crawlOneDoc(self):
        # Pop the next URL, fetch it politely, and queue any links found
        # in the result.
        try:
            url = None
            url = self.queue.next()
            content = smrtWget(url)
        except RuntimeError, msg:
            print "Error: %s" % msg
        except:
            print "No good: %s" % url
        else:
            print "Crawling %s" % url
            if content != None:
                doc = UrlsDocument(content)
                url = doc.nextUrl()
                while url != None:
                    self.queue.add(url)
                    url = doc.nextUrl()
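A rough driver sketch (the starting URL and the document count are placeholders, not part of the project): fetch one page with smrtWget, seed a Crawler with it, and crawl a handful of documents off the queue.

if __name__ == '__main__':
    # Seed the crawler with one fetched page and crawl a few documents;
    # an empty queue is reported by crawlOneDoc rather than raised here.
    root = smrtWget("http://example.com/")
    if root != None:
        crawler = Crawler(root)
        for i in range(5):
            crawler.crawlOneDoc()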
A note on Scheme wget
It's definitely possible. It'd be even more possible if we could either figure out the PLT/DrScheme libraries or write some of our own to even out the lack of text-processing utilities (the regexp libraries I've tried to look at are not easy to figure out). I spent quite a bit of time trying to figure out how to get the words I wanted out of such-and-such a string, and it's still not easy; I think my approach is wrong.
Needless to say, writing this stuff in Scheme appears to be significantly more time-consuming than in Python or Java (though take that with a gigantic hunk of salt: my limited experience with the language was probably fouling things up, too). Many of the things I ended up writing myself may already exist; I just didn't know how to look for them. That's not necessarily a bad thing, but it does take time, so it should be taken into account when summing up hours and whatnot.
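For comparison, that sort of word extraction is close to a one-liner with Python's re module (a rough sketch, not a translation of any of the Scheme code):

import re

# Pull the whitespace/punctuation-delimited words out of an arbitrary string.
words = re.findall(r"\w+", "get the words I wanted out of such-and-such a string")
print words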
Bits of code I wrote:
- basic wget using a CLI (not arguments)
- reading a tag from an input port
- separating tags from regular text
- reading a word from an input port (used Dybvig as a starting point)
- reading a line from a string
- reading a word from a string
- stacks (used SICP as a starting point)
- recognizing presence of a robots.txt file at root on the server (haven't dealt with parsing it yet)