HTML Organizer

Inputs a URL,
reads the file,
recursively loads the tag hierarchy into some data structure.

This went through at least three wildly different revisions, including a rather cute little state machine (HtmlParserFsm) that might've worked if doing lambda-like things in Java were easier, for managing the side effects of state changes. (No, it wouldn't have -- it needs to know where it's been, not just where it is.) I could probably take the solution below and work it backwards towards something prettier, and get rid of the rather horrid while/if construct I ended up using instead, but I don't know that that's wholly useful for just playing around with stuff.

The other bit of it is that I'm finally getting the hang of hash tables, so I've been tending to use them, lots. Java has TreeMap which lets you do ordered tables, which is nice. Python dictionaries aren't ordered, but I faked the effect by using lists of (key, item) tuples. Anyhoo, yay for source code.

Contents

HTML Organizer

Java

=== Main class: HtmlOrganizer ===

{{{
import java.net.URL;

import java.io.*; import java.util.*;

public class HtmlOrganizer {

public static void main(String[] args) {
- if(args.length == 0) {
  - System.out.println("Usage:\n\t$ java HtmlOrganizer <url>"); System.exit(0);
  }
  HtmlOrganizer c = new HtmlOrganizer(); String s = c.getPage(args[0]); c.organize(s);
} public String getPage(String urlstr) {
- String s = ""; try {
  - URL url = new URL(urlstr);
    BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream())); while(reader.ready()) {
    - s += reader.readLine(); /** there should be a '+"\n"' here, but newlines are making my parser output hard to read **/
    }
  } catch(java.io.IOException ioe) {
  - System.out.println(ioe.getStackTrace()); System.out.println("Houston? Problem: "+ioe.getMessage());
  } return s;
} public void organize(String s) {
- //this is a hack -- tabulate() should handle the initial bit if possible Map h = new Hashtable(); h.put(new Key("html"),tabulate(s.substring(6),"html").table); printRMap(h, "");
} public void printRMap(Map h, String lineprefix) {
- Iterator it = h.keySet().iterator(); //System.out.println(lineprefix+"MAP SIZE: "+h.size()); while(it.hasNext()) {
  - Object i = it.next(); Object o = h.get(i); System.out.print(lineprefix+i.toString()+": "); if(o instanceof Map) {
    - System.out.println(); printRMap((Map)o, lineprefix+" ");
    } else {
    - System.out.println(o);
    }
  }
} public class Dual {
- public Map table; public String string; public Dual(Map h, String s) {
  - this.table = h; this.string = s;
  }
} public Dual tabulate(String s, String name) {
- String[] spec = {"br","p","li","hr"}; final Vector specials = new Vector();
  for(int i=0;i<spec.length; i++) {specials.add(spec[i]);}
  Map h = new TreeMap(new Comparator() {
  - public int compare(Object i, Object j) {
    - if(i instanceof Key && j instanceof Key) {
      - Key a = (Key)i; Key b = (Key)j; return (int)(a.tstamp - b.tstamp); //this is probably bad.
      } else throw new ClassCastException("Key objects must be of class Key");
    } });
  int index = 0;
  while(s.length()>0) {
  - /** grab the first "<" **/ int i=s.indexOf("<", index); /** if it's first, then deal with the tag **/ if(i==0) {
    - /** it's a close tag **/ if(s.charAt(1) == '/') {
      - /** it'd better be our close tag; clean up and return **/
        String tagname = s.split("[<> ]")[1]; if (tagname.substring(1).equals(name)) {
        return new Dual(h, s.substring(s.indexOf(">")+1));
        } else if (specials.contains(tagname.substring(1).toLowerCase())) {
        /** do nothing; probably a stray </p> or </li> **/
        } else {
        /** it's not our close tag -- situation fubar **/ System.out.println("Badly formed. Want: "+name+"\n\tHave: "+s+"\nReturning anyway..."); return new Dual(h,s);
        }
      } else {
      - /** it's a regular tag -- get the name **/
        String inner = s.split("[<> ]")[1]; /** if it's not a special tag, recurse, add the new map, continue with remaining string **/ if(! specials.contains(inner.toLowerCase())) {
        Dual d = tabulate(s.substring(s.indexOf(">")+1).trim(), inner); h.put(new Key(inner), d.table); s = d.string.trim(); index = 0;
        } else {
        /** it's a special tag -- put our "lookfor" index after it; it'll get cycled in with "text" **/
        index = s.indexOf(">")+1;
        }
      }
    } else {
    - /** put in all text up to the next "<" as "text" in our map **/ h.put(new Key("text"),s.substring(0,i)); s = s.substring(i); index = 0;
    }
  } /** this shouldn't happen, but might: **/ System.out.println("Encountered end of string before tags properly closed. Returning anyway..."); return new Dual(h,s);
}

}

}}}

And because inner classes aren't allowed to have static components :angry:,

=== the Key class ===

/**
 * Map key that carries a tag name plus the order in which it was created.
 * Sorting keys by {@code tstamp} keeps map entries in the order they were
 * added, instead of whatever order the map implementation would choose.
 */
public class Key {
    /** Number of keys created so far; supplies the next creation stamp. */
    private static int created = 0;

    /** Name returned by {@link #toString()}. */
    private final String label;

    /** Creation order of this key (0 for the first key, 1 for the next, ...). */
    public long tstamp;

    /**
     * Creates a key named {@code s} and stamps it with the next sequence number.
     */
    public Key(String s) {
        label = s;
        tstamp = created;
        created = created + 1;
    }

    /** Returns the name this key was created with. */
    public String toString() {
        return label;
    }
}

Python (all code)

This is basically just a port of the Java version, but it looks nicer because Python has tuples.

import urllib
import sys
import re

def tabulate(htmlstr, name=""):
    """Parse HTML text into an ordered, nested list of (key, value) pairs.

    Each entry of the returned table is either ``("text", stripped_text)``
    or ``(tagname, subtable)`` where *subtable* is the table of that tag's
    contents.  Tags listed in ``specials`` are routinely left unclosed, so
    they are never recursed into; their markup stays inside the following
    "text" entry.

    htmlstr -- the text to parse (for a nested call: the text after the
               open tag of *name*)
    name    -- the tag whose contents are being parsed; "" for the top level

    Returns the table alone when *name* is "" (top-level call), otherwise a
    ``(table, remaining_text)`` tuple so the caller can continue parsing
    after this element.  Problems are reported with print and parsing
    carries on as well as it can.
    """
    specials = ["br", "hr", "p", "li", "link"]
    table = []
    index = 0
    while len(htmlstr) > 0:
        i = htmlstr.find("<", index)
        if i == -1:
            # No further tags: whatever is left is trailing text.
            text = htmlstr.strip()
            if text:
                table.append(("text", text))
            htmlstr = ""
            break
        if i != 0:
            # Text (possibly including skipped special tags) precedes the tag.
            (text, htmlstr) = (htmlstr[0:i], htmlstr[i:])
            table.append(("text", text.strip()))
            index = 0
            continue
        # First thing in the string is a tag.  `rest` is everything after its
        # ">" and `tag_end` the offset where `rest` starts; the offset is
        # computed from lengths because searching for `rest` can find an
        # earlier occurrence (or 0 when `rest` is empty) and stall the loop.
        (head, _, rest) = htmlstr.partition(">")
        tag_end = len(htmlstr) - len(rest)
        if htmlstr[1:2] == "/":
            # it's a close tag
            tagname = (head[2:].split() or [""])[0]
            if name != "" and tagname == name:
                return (table, rest.strip())
            elif tagname in specials:
                # stray </p>, </li>, ...: step over it
                index = tag_end
            else:
                print("Badly formed.  Want: " + name + "\n\tHave: " + tagname)
                if name == "":
                    # the top-level call always returns a bare table
                    return table
                return (table, htmlstr.strip())
        else:
            # it's a regular tag
            tagname = (head[1:].split() or [""])[0]
            if tagname in specials or tagname == "":
                # special (or nameless) tag: step over it; it gets cycled
                # in with the next "text" entry
                index = tag_end
            else:
                (tagtable, htmlstr) = tabulate(rest, tagname)
                table.append((tagname, tagtable))
                index = 0
    if name != "":
        print("Encountered end of string before tags properly closed at " + name + ".  Returning anyway...")
        return (table, htmlstr.strip())
    return table

def printRTable(table, prefix=""):
    for (i,j) in table:
        print "\n"+prefix+i+": ",
        if isinstance(j, list):
            printRTable(j, prefix+"  ")
        else:
            print j,

# Script body: fetch the URL named on the command line and print its outline.
if len(sys.argv) < 2:
    print("Usage: \n\t$ htmlorganizer.py <url>")
    sys.exit()

# NOTE: urllib.urlopen is the Python 2 spelling; this script targets Python 2.
f = urllib.urlopen(sys.argv[1])
try:
    s = f.read()
finally:
    # release the connection even if the read fails
    f.close()

outline = tabulate(s)
if isinstance(outline, tuple):
    # tabulate() hands back (table, remainder) when it bails out early;
    # only the table part can be printed
    outline = outline[0]
printRTable(outline)

Sample Output

krivard@localhost python $ ./htmlorganizer.py http://faclin01.olin.edu/~krivard/index.html
Badly formed.  Want: blockquote
        Have: body
 
html:
  text:
  head:
    title:
      text:  krivard
  body:
    text:
    h3:
      text:  krivard on faclin01
    text:  <hr>
    text:  <br>
    b:
      text:  New and improved Clicker:
    blockquote:
      text:  The server runs under
      a:
        text:  this jar
      text:  .  The client is
      a:
        text:  here
      text:  .  Run
      tt:
        text:  java -jar questn.jar
      text:  , then load the client in your browser, then enter a question into 
the server and hit "Send," and the client should register it.  You can also hit 
"Send" from the little Server Control widget(small) and it'll send the client a 
timestamp.  Also recommended is running the client using JDK's
      tt:
        text:  appletviewer
      text:  in another thread from the command line. To do:
      ol:
        text:
        text:  <li>Finish the client UI so you can see all the bits of the question(answers too).
        text:  <li>Make the client active so you can send back your response.
        text:  <li>Make the server care about responses.
        text:  <li>Make a
        i:
          text:  simple
        text:  display UI.
        text:  <li>Make the server UI less hacked. (Well, make everything less hacked)
        text:  <li>Revise as necessary.  Specifically, this app is much huger than 
I initially thought it would be, which may not be necessary.
        text:  <li>Finish support for different question types(radio vs checkbox) 
and different response types(final vs write-in); figure out a way to munge write-ins 
into the display UI.
        text:  <li>Add support for multiple questions.
      i:
        text:  ...Just kidding.  To do first: fix the AccessControl bug.  Eww....

Lex/YACC Parser

This doesn't build a parse tree, but it does tell you what tags the text it's looking at is nested in, and a recursive treebuilder wouldn't be too hard to implement. Plus, this uses Lex/YACC, which is just cool.

I used PLY, Python Lex - Yacc. It's not too bad once you start to get to know it.

S statement; T tagexp; Ot opentagexp; E expression; Ct closetagexp; X text

S	=>	T
T	=>	Ot E Ct
E	=>	E E	(this doesn't look happy, but...)
	|	X
	|	T

import re
import sys

# Names of every token type the lexer can produce.
tokens = ('TEXT', 'OPENTAG', 'CLOSETAG')

# tokens:

# TEXT: a run of one or more characters containing neither '<' nor '>'.
t_TEXT = r'[^<>]+'

def t_OPENTAG(t):
    r'<[a-zA-Z][a-zA-Z0-9]*[^<>]*>'
    # The raw string above is this token's regular expression, not prose:
    # "<", a tag name (letter, then letters/digits), any attributes, ">".
    # It contains no literal whitespace, so it matches the same way whether
    # or not the lexer compiles patterns in re.VERBOSE mode.
    # The token's value becomes the bare tag name, digits included, so that
    # "<h3 ...>" yields "h3" and pairs up with the value of "</h3>".
    t.value = re.match(r'<([a-zA-Z][a-zA-Z0-9]*)', t.value).group(1)
    return t

def t_CLOSETAG(t):
    r'</[a-zA-Z][a-zA-Z0-9]*\s*>'
    # The raw string above is this token's regular expression: "</", a tag
    # name (letter, then letters/digits), optional whitespace, ">".
    # The token's value becomes the bare tag name.
    t.value = t.value[2:-1].strip()
    return t

# Characters the lexer skips between tokens: only the newline character.
t_ignore = '\n'

def t_error(t):
    # Lexer error hook: report the offending character, then skip past it so
    # scanning can continue with the next character.
    print("Illegal character '%s'" % t.value[0])
    # Older PLY releases expose skip() on the token itself; later ones expose
    # it on t.lexer.  Use whichever this token provides.
    skip = getattr(t, "skip", None)
    if skip is None:
        skip = t.lexer.skip
    skip(1)

# build the lexer
# lex.lex() is called with no arguments here.
# NOTE(review): presumably it picks up `tokens` and the t_* rules from this
# module, and `import lex` is the flat-module spelling of an older PLY
# release -- confirm against the installed PLY version.
import lex
lex.lex()

# test data: a small built-in document, replaced by the contents of the file
# named as the first command-line argument when one is given
data = '''<html>
<head>
<title>My Test</title></head>
<body bgcolor="#ffffff">
<i>this</i> is a <b>test.</b>
</body></html>'''
if len(sys.argv) > 1:
    f = open(sys.argv[1], 'r')
    try:
        data = f.read()
    finally:
        # close the file even if reading it fails
        f.close()

# i need to figure out how this actually works, and not just guess
# NOTE(review): 'expression' and 'tagexp' are grammar nonterminals, not
# tokens; yacc precedence tables normally list tokens only -- confirm
# whether these two entries have any effect.
precedence = (('left', 'TEXT', 'expression', 'tagexp'),)

# parse "tree" (not really): the names of the tags currently open,
# outermost first, maintained by the open/close tag rules
tagstack = list()

def p_statement_expr(t):
    'statement : tagexp'
    # The string above is the grammar rule for this handler, not prose.
    # Prints the value carried by the tagexp symbol.
    tagexp_value = t[1]
    print(tagexp_value)

def p_expression_xfirst(t):
    '''expression : TEXT
                  | TEXT expression'''
    # The string above is the grammar rule for this handler, not prose.
    # In both alternatives the TEXT token is the first symbol; report it.
    leading_text = t[1]
    textadded(leading_text)
    
def textadded(s):
    # Report a piece of text together with the current contents of the
    # module-level tagstack list.
    report = "Added '%s' in %s" % (s, tagstack)
    print(report)

def p_expression_xsecond(t):
    'expression : expression TEXT'
    # The string above is the grammar rule for this handler, not prose.
    # Here the TEXT token is the second symbol of the rule; report it.
    trailing_text = t[2]
    textadded(trailing_text)
    
def p_expression_tagexp(t):
    '''expression : tagexp
                  | expression expression'''
    # The string above is the grammar rule for this handler, not prose.
    # Purely structural rule: no action is taken and no value is produced.
    return None

def p_tagexp(t):
    'tagexp : opentagexp expression closetagexp'
    # The string above is the grammar rule for this handler, not prose.
    # Purely structural rule: an open tag, its contents, and a close tag.
    # No action is taken and no value is produced.
    return None

def p_opentagexp(t):
    'opentagexp : OPENTAG'
    # The string above is the grammar rule for this handler, not prose.
    # Record the newly opened tag at the end of the module-level tagstack.
    opened = t[1]
    tagstack.append(opened)

def p_closetagexp(t):
    'closetagexp : CLOSETAG'
    # The string above is the grammar rule for this handler, not prose.
    # Drop the closed tag from the module-level tagstack.  The search runs
    # from the end so the innermost open tag of that name is removed;
    # list.remove() would delete the outermost one and scramble the order
    # when a tag is nested inside another of the same name.
    closed = t[1]
    for position in range(len(tagstack) - 1, -1, -1):
        if tagstack[position] == closed:
            del tagstack[position]
            return
    # A close tag with no matching open tag is reported instead of raising
    # ValueError out of the parser.
    print("Unmatched close tag '%s'" % (closed,))
    
def p_error(t):
    # Parser error hook.  t is the offending token, or None when the error
    # is detected at the end of the input.
    if t is None:
        print("Syntax error at end of input")
        return
    # wrap in a 1-tuple so a tuple-valued t cannot break the formatting
    print("Syntax error at '%s'" % (t,))

import yacc
# build the parser, then run it over the document held in `data`
yacc.yacc()

result = yacc.parse(data)
# show what the parse returned ...
print(result)

# ... and whatever is left on the tag stack afterwards
print(tagstack)