
wget Web Crawler Thingie

Project of KatieRivard, listed in ProjectIdeas

"Smart" wget (python)

from time import sleep
from urlparse import urlparse
import httplib
import sys
import re

def getDisallows(conn):
    conn.connect()
    conn.request('GET', "/robots.txt")
    disallowtable = []
    try:
        resp = conn.getresponse()
        tmp = resp.read()
        if resp.status != 200:
            #expect 404, otherwise some weird problem
            if resp.status != 404: print resp.status, resp.reason
        else:
            lines = re.split("\n", tmp)
            for i in lines:
                i = i.strip()
                # skip comments, blank lines, and lines without a "field: value" form
                # note: this collects Disallow rules from every User-agent section
                if re.match("^#", i) or i == '' or ':' not in i:
                    pass
                else:
                    [tag, value] = re.split(":", i, 1)
                    # an empty Disallow value means "allow everything", so skip it
                    if tag.lower() == "disallow" and value.strip() != '':
                        disallowtable.append(value.strip())
    except httplib.ResponseNotReady, msg:
        print "Response not ready. ", msg
    return disallowtable
    
def getfile(conn, path, count=5):
    conn.request('GET', path)
    while count > 0:
        try:
            resp = conn.getresponse()
            tmp = resp.read()
            if resp.status == 200:
                return tmp
            elif resp.status in [301, 302, 303]:
                # follow a redirect; the Location header is a full URL, so this
                # only works if the redirect stays on the same host
                newpath = resp.getheader("Location", None)
                if newpath != None:
                    count = count - 1
                    conn.request('GET', newpath)
                else:
                    print "Document moved, no forwarding address. :("
                    return None
            elif resp.status in [404, 400]:
                print resp.status, resp.reason
                return None
            else:
                # any other status: report it and give up rather than spinning
                print resp.status, resp.reason
                return None
        except httplib.ResponseNotReady, msg:
            count = count - 2
            sleep(1)
    print "Trial count: ",count
    print "Response not ready or Document moved too many times."
    return None
    
def smrtWget(url):
    (scheme, loc, path, params, query, frag) = urlparse(url)
    conn = httplib.HTTPConnection(loc)
    dis = getDisallows(conn)
    go = True
    for i in dis:
        # Disallow entries are literal path prefixes, not regular expressions
        if path.startswith(i):
            go = False
            print "Matched %s on entry \"%s\" in robots.txt" % (path, i)
    if go:
        return getfile(conn, path)
    else:
        print "Robot traversal not allowed on "+loc
        return None
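
A minimal usage sketch (the URL is just a placeholder; the crawler below assumes this code is saved as a module named smrtWget):

# fetch one page, respecting robots.txt (placeholder URL)
if __name__ == '__main__':
    page = smrtWget("http://www.example.com/index.html")
    if page != None:
        print page[:200]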

Crawler (python)

import sys
sys.path.append("/home/krivard/prog/wget/python")
from smrtWget import smrtWget
from re import split

class UrlsDocument:
    def __init__(self, docstr):
        self.docstr = docstr
    def nextUrl(self):
        # return the next absolute http:// link in the document, or None
        if self.docstr.find("<a href=\"http:") >= 0:
            self.docstr = split("<a href=\"http:", self.docstr, 1)[1]
            [url, self.docstr] = split("\"", self.docstr, 1)
            return "http:"+url
        return None

class UrlQueue:
    def __init__(self):
        self.q = []
        self.past = []
    def next(self):
        if len(self.q) == 0:
            raise RuntimeError, 'Queue empty.'
        item = self.q.pop(0)
        self.past.append(item)
        return item
    def add(self, url):
        if url not in self.q + self.past:
            self.q.append(url)

class Crawler:
    def __init__(self, docstr):
        self.rootfile = docstr
        self.queue = UrlQueue()
        self.initqueue()
    def initqueue(self):
        doc = UrlsDocument(self.rootfile)
        url = doc.nextUrl()
        while url != None:
            self.queue.add(url)
            url = doc.nextUrl()
    def crawlOneDoc(self):
        try:
            url = None
            url = self.queue.next()
            page = smrtWget(url)
        except RuntimeError, msg:
            print "Error: %s" % msg
        except:
            # any other failure (bad URL, connection trouble, etc.)
            print "No good: %s" % url
        else:
            print "Crawling %s" % url
            if page != None:
                doc = UrlsDocument(page)
                url = doc.nextUrl()
                while url != None:
                    self.queue.add(url)
                    url = doc.nextUrl()
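
A minimal driver sketch (the seed URL is a placeholder): fetch a starting page with smrtWget, build a Crawler from it, and crawl a few documents off the queue.

# example driver: seed the crawler from one page, then follow a handful of links
if __name__ == '__main__':
    seed = smrtWget("http://www.example.com/")
    if seed != None:
        crawler = Crawler(seed)
        for i in range(10):
            crawler.crawlOneDoc()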

A note on Scheme wget

It's definitely possible. It'd be even more feasible if we could either figure out the PLT/DrScheme libraries or write some of our own to make up for the lack of text-processing utilities (the regexp libraries I've tried to look at are not easy to figure out)... I spent quite a bit of time trying to figure out how to pull the words I wanted out of such-and-such a string, and it's still not easy. I think my approach is wrong.

Needless to say, writing this stuff in Scheme appears to be significantly more time-consuming than in Python or Java (though take that with a gigantic hunk of salt: my limited experience with the language was probably fouling things up too). Many utilities may already exist that I just didn't know how to look for, so I ended up writing them myself. That isn't necessarily a bad thing, but it does take time, which should be taken into account when summing up hours and whatnot.

Bits of code I wrote:

