wget Web Crawler Thingie
Project of KatieRivard, listed in ProjectIdeas
"Smart" wget (python)
from time import sleep
from urlparse import urlparse
import httplib
import sys
import re

def getDisallows(conn):
    # Fetch /robots.txt over the given connection and collect the
    # Disallow entries.
    conn.connect()
    conn.request('GET', "/robots.txt")
    disallowtable = []
    try:
        resp = conn.getresponse()
        tmp = resp.read()
        if resp.status != 200:
            # expect 404 (no robots.txt); anything else is some weird problem
            if resp.status != 404: print resp.status, resp.reason
        else:
            lines = re.split("\n", tmp)
            for i in lines:
                i = i.strip()
                if re.match("^#", i) or i == '' or ":" not in i:
                    # ignore comments, blank lines, and malformed lines
                    pass
                else:
                    [tag, value] = re.split(":", i, 1)
                    if tag.lower() == "disallow":
                        disallowtable.append(value.strip())
    except httplib.ResponseNotReady, msg:
        print "Response not ready. ", msg
    return disallowtable
def getfile(conn, path, count=5):
    # GET the given path, following redirects a few times and retrying
    # briefly if the response isn't ready.
    conn.request('GET', path)
    while count > 0:
        try:
            resp = conn.getresponse()
            tmp = resp.read()
            if resp.status == 200:
                return tmp
            elif resp.status == 302:
                newpath = resp.getheader("Location", None)
                if newpath != None:
                    # re-request the new location on the same connection
                    count = count - 1
                    conn.request('GET', newpath)
                else:
                    print "Document moved, no forwarding address. :("
                    return None
            elif resp.status in [404, 400]:
                print resp.status, resp.reason
                return None
        except httplib.ResponseNotReady, msg:
            count = count - 2
            sleep(1)
            print "Trial count: ", count
    print "Response not ready or Document moved too many times."
    return None
def smrtWget(url):
    # Fetch a URL, but only if the server's robots.txt doesn't disallow
    # the path.  Each Disallow entry is treated as a regular expression.
    (scheme, loc, path, pram, query, frag) = urlparse(url)
    conn = httplib.HTTPConnection(loc)
    dis = getDisallows(conn)
    go = True
    for i in dis:
        if re.search(i, path):
            go = False
            print "Matched %s on entry \"%s\" in robots.txt" % (path, i)
    if go:
        return getfile(conn, path)
    else:
        print "Robot traversal not allowed on "+loc
        return None
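A minimal usage sketch, assuming the above is saved as smrtWget.py (the name the crawler below imports it under); the URL is just a placeholder:

if __name__ == '__main__':
    # Fetch a single page, respecting robots.txt, and show the first
    # couple hundred characters of what came back.
    content = smrtWget("http://example.com/index.html")
    if content != None:
        print content[:200]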
Crawler (python)
import sys
sys.path.append("/home/krivard/prog/wget/python")
from smrtWget import smrtWget
from re import split

class UrlsDocument:
    # Wraps a fetched HTML document and hands out the absolute http
    # links it contains, one at a time.
    def __init__(self, docstr):
        self.docstr = docstr
    def nextUrl(self):
        if self.docstr.find("<a href=\"http:") >= 0:
            self.docstr = split("<a href=\"http:", self.docstr, 1)[1]
            [url, self.docstr] = split("\"", self.docstr, 1)
            return "http:"+url
        return None
class UrlQueue:
    # FIFO of URLs left to visit; remembers what has already been handed
    # out so nothing is queued twice.
    def __init__(self):
        self.q = []
        self.past = []
    def next(self):
        if len(self.q) == 0:
            raise RuntimeError, 'Queue empty.'
        item = self.q.pop(0)
        self.past.append(item)
        return item
    def add(self, url):
        if url not in self.q + self.past:
            self.q.append(url)
class Crawler:
    def __init__(self, docstr):
        self.rootfile = docstr
        self.queue = UrlQueue()
        self.initqueue()
    def initqueue(self):
        # Seed the queue with every link found in the root document.
        doc = UrlsDocument(self.rootfile)
        url = doc.nextUrl()
        while url != None:
            self.queue.add(url)
            url = doc.nextUrl()
    def crawlOneDoc(self):
        # Pop the next URL, fetch it politely, and queue any links found
        # in the result.
        try:
            url = None
            url = self.queue.next()
            content = smrtWget(url)
        except RuntimeError, msg:
            print "Error: %s" % msg
        except:
            print "No good: %s" % url
        else:
            print "Crawling %s" % url
            if content != None:
                doc = UrlsDocument(content)
                url = doc.nextUrl()
                while url != None:
                    self.queue.add(url)
                    url = doc.nextUrl()
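A rough driver sketch (the starting URL and the document count are placeholders, not part of the project): fetch one page with smrtWget, seed a Crawler with it, and crawl a handful of documents off the queue.

if __name__ == '__main__':
    # Seed the crawler with one fetched page and crawl a few documents;
    # an empty queue is reported by crawlOneDoc rather than raised here.
    root = smrtWget("http://example.com/")
    if root != None:
        crawler = Crawler(root)
        for i in range(5):
            crawler.crawlOneDoc()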
A note on Scheme wget
It's definitely possible. It'd be even more possible if we could either figure out the PLT/DrScheme libraries or write some of our own to even out the lack of text-processing utilities (the regexp libraries I've tried to look at are not easy to figure out). I spent quite a bit of time trying to figure out how to get the words I wanted out of such-and-such a string, and it's still not easy; I think my approach is wrong.
Needless to say, writing this stuff in Scheme appears to be significantly more time-consuming than in Python or Java (though take that with a gigantic hunk of salt: my limited experience with the language was probably fouling things up, too). Many of the things I ended up writing myself may already exist; I just didn't know how to look for them. That's not necessarily a bad thing, but it does take time, so it should be taken into account when summing up hours and whatnot.
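For comparison, that sort of word extraction is close to a one-liner with Python's re module (a rough sketch, not a translation of any of the Scheme code):

import re

# Pull the whitespace/punctuation-delimited words out of an arbitrary string.
words = re.findall(r"\w+", "get the words I wanted out of such-and-such a string")
print words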
Bits of code I wrote:
- basic wget using a CLI (not arguments)
- reading a tag from an input port
- separating tags from regular text
- reading a word from an input port (used Dybvig as a starting point)
- reading a line from a string
- reading a word from a string
- stacks (used SICP as a starting point)
- recognizing presence of a robots.txt file at root on the server (haven't dealt with parsing it yet)