HTML Organizer
(Clickback to KatieRivard)
- Inputs a URL,
- reads the file,
- recursively loads the tag hierarchy into some data structure.
This went through at least three wildly different revisions, including a rather cute little state machine (HtmlParserFsm) that might've worked if doing lambda-like things in Java were easier (for managing the side effects of state changes). (No, it wouldn't've -- it needs to know where it's been, not just where it is.) I could probably take the solution below and work it backwards towards something prettier, and get rid of the rather horrid while/if construct I ended up using instead, but I don't know that that's wholly useful for just playing around with stuff.
The other bit of it is that I'm finally getting the hang of hash tables, so I've been tending to use them, lots. Java has TreeMap which lets you do ordered tables, which is nice. Python dictionaries aren't ordered, but I faked the effect by using lists of (key, item) tuples. Anyhoo, yay for source code.
Java
=== Main class: HtmlOrganizer === {{{import java.net.URL;
import java.io.*; import java.util.*;
public class HtmlOrganizer {
- public static void main(String[] args) {
- if(args.length == 0) {
System.out.println("Usage:\n\t$ java HtmlOrganizer <url>"); System.exit(0);
HtmlOrganizer c = new HtmlOrganizer(); String s = c.getPage(args[0]); c.organize(s);
- String s = ""; try {
- URL url = new URL(urlstr);
BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream())); while(reader.ready()) {
- s += reader.readLine(); /** there should be a '+"\n"' here, but newlines are making my parser output hard to read **/
- System.out.println(ioe.getStackTrace()); System.out.println("Houston? Problem: "+ioe.getMessage());
- URL url = new URL(urlstr);
- //this is a hack -- tabulate() should handle the initial bit if possible Map h = new Hashtable(); h.put(new Key("html"),tabulate(s.substring(6),"html").table); printRMap(h, "");
- Iterator it = h.keySet().iterator(); //System.out.println(lineprefix+"MAP SIZE: "+h.size()); while(it.hasNext()) {
- Object i = it.next(); Object o = h.get(i); System.out.print(lineprefix+i.toString()+": "); if(o instanceof Map) {
- System.out.println(); printRMap((Map)o, lineprefix+" ");
- System.out.println(o);
- Object i = it.next(); Object o = h.get(i); System.out.print(lineprefix+i.toString()+": "); if(o instanceof Map) {
- public Map table; public String string; public Dual(Map h, String s) {
- this.table = h; this.string = s;
- String[] spec = {"br","p","li","hr"}; final Vector specials = new Vector();
for(int i=0;i<spec.length; i++) {specials.add(spec[i]);}
Map h = new TreeMap(new Comparator() {
- public int compare(Object i, Object j) {
if(i instanceof Key && j instanceof Key) {
- Key a = (Key)i; Key b = (Key)j; return (int)(a.tstamp - b.tstamp); //this is probably bad.
} else throw new ClassCastException("Key objects must be of class Key");
while(s.length()>0) {
/** grab the first "<" **/ int i=s.indexOf("<", index); /** if it's first, then deal with the tag **/ if(i==0) {
- /** it's a close tag **/ if(s.charAt(1) == '/') {
- /** it'd better be our close tag; clean up and return **/
String tagname = s.split("[<> ]")[1]; if (tagname.substring(1).equals(name)) {
return new Dual(h, s.substring(s.indexOf(">")+1));
/** do nothing; probably a stray </p> or </li> **/
- /** it's not our close tag -- situation fubar **/ System.out.println("Badly formed. Want: "+name+"\n\tHave: "+s+"\nReturning anyway..."); return new Dual(h,s);
- /** it's a regular tag -- get the name **/
String inner = s.split("[<> ]")[1]; /** if it's not a special tag, recurse, add the new map, continue with remaining string **/ if(! specials.contains(inner.toLowerCase())) {
Dual d = tabulate(s.substring(s.indexOf(">")+1).trim(), inner); h.put(new Key(inner), d.table); s = d.string.trim(); index = 0;
- /** it's a special tag -- put our "lookfor" index after it; it'll get cycled in with "text" **/
index = s.indexOf(">")+1;
- /** it'd better be our close tag; clean up and return **/
/** put in all text up to the next "<" as "text" in our map **/ h.put(new Key("text"),s.substring(0,i)); s = s.substring(i); index = 0;
- /** it's a close tag **/ if(s.charAt(1) == '/') {
- public int compare(Object i, Object j) {
- if(args.length == 0) {
}
- }}} And because inner classes aren't allowed to have static components:angry:, === the Key class ===
/**
 * Map key that remembers the order in which it was created.
 *
 * Lets us keep the tag modules in the order we add them to the Maps;
 * otherwise Java puts them in a box and shakes them, and the result is
 * hard to read.
 */
public class Key {
    /** Number of keys created so far; the next key takes this value as its stamp. */
    private static int tally = 0;

    /** Label returned by {@link #toString()}. */
    private String string;

    /** Creation-order stamp, taken from the shared counter at construction. */
    public long tstamp;

    /**
     * Creates a key labelled {@code s} and stamps it with the current
     * counter value, then advances the counter.
     */
    public Key(String s) {
        string = s;
        tstamp = tally;
        tally = tally + 1;
    }

    /** Returns the label this key was created with. */
    public String toString() {
        return string;
    }
}
Python (all code)
- This is basically just a port of the Java version, but it looks nicer because Python has tuples.
import urllib
import sys
import re

try:
    from urllib.request import urlopen  # Python 3 location
except ImportError:
    urlopen = urllib.urlopen  # Python 2 location

# Tags that are not expected to have a matching close tag. They are not
# recursed into; instead they get folded into the neighbouring "text" entry.
_SPECIALS = ("br", "hr", "p", "li", "link")


def _parse(htmlstr, name):
    """Parse htmlstr as the contents of the tag called name.

    Always returns a (table, rest) pair: table is a list of (key, item)
    tuples in document order, where item is either a text string (key
    "text") or a nested table (key is the tag name); rest is the stripped,
    unconsumed remainder of the input.
    """
    table = []
    index = 0  # where to start looking for the next "<"
    while len(htmlstr) > 0:
        i = htmlstr.find("<", index)
        if i == -1:
            # No tag left: everything remaining is text. (str.index used to
            # raise ValueError here.)
            table.append(("text", htmlstr.strip()))
            htmlstr = ""
            break
        if i == 0:
            # first thing in string is a tag
            if htmlstr[1:2] == "/":
                # it's a close tag
                parts = re.split("[/> ]", htmlstr, 2)
                tagname = parts[1]
                rest = parts[2] if len(parts) > 2 else ""
                if tagname == name:
                    return (table, rest.strip())
                elif tagname.lower() in _SPECIALS:
                    # Stray </p>, </li>, ...: skip past it so it is cycled
                    # in with the following text. rest is a suffix of
                    # htmlstr, so its offset is computed from the lengths;
                    # searching for it could find an earlier match (or 0
                    # for an empty rest) and loop forever.
                    index = len(htmlstr) - len(rest)
                else:
                    # Not our close tag: report it and hand the string,
                    # close tag included, back to the caller.
                    print("Badly formed. Want: " + name + "\n\tHave: " + tagname)
                    return (table, htmlstr.strip())
            else:
                # it's a regular tag
                tagname = re.split("[<> ]", htmlstr)[1]
                # partition yields "" when the tag is never terminated,
                # where indexing the result of re.split raised IndexError.
                rest = htmlstr.partition(">")[2]
                if tagname.lower() in _SPECIALS:
                    index = len(htmlstr) - len(rest)
                else:
                    (tagtable, htmlstr) = _parse(rest, tagname)
                    table.append((tagname, tagtable))
                    index = 0
        else:
            # Everything before the next "<" (including any special tags
            # skipped above) becomes one text entry.
            (text, htmlstr) = (htmlstr[0:i], htmlstr[i:])
            table.append(("text", text.strip()))
            index = 0
    if name != "":
        print("Encountered end of string before tags properly closed at "
              + name + ". Returning anyway...")
    return (table, htmlstr.strip())


def tabulate(htmlstr, name=""):
    """Load the tag hierarchy of htmlstr into nested lists of tuples.

    htmlstr -- the HTML text to parse.
    name    -- name of the enclosing tag whose close tag ends the parse;
               leave it as "" for a whole document.

    With name == "" the result is the table itself: a list of (key, item)
    tuples in document order, where item is a text string for key "text"
    and a nested table for a tag name. With any other name the result is a
    (table, rest) pair, rest being the stripped text after the close tag.

    Malformed input is reported on stdout and parsing carries on with what
    it has.
    """
    (table, rest) = _parse(htmlstr, name)
    if name != "":
        return (table, rest)
    return table


def printRTable(table, prefix=""):
    """Write table to stdout, one "key: item" entry per line.

    Nested tables are printed recursively with prefix grown by one space
    per level. Each entry starts with a newline rather than ending with
    one, so the caller supplies the final newline.
    """
    for (i, j) in table:
        sys.stdout.write("\n" + prefix + i + ": ")
        if isinstance(j, list):
            printRTable(j, prefix + " ")
        else:
            sys.stdout.write(j)


def main(argv=None):
    """Fetch the URL given as the first argument and print its tag table."""
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print("Usage: \n\t$ htmlorganizer.py <url>")
        sys.exit()
    f = urlopen(argv[1])
    try:
        s = f.read()
    finally:
        f.close()
    if not isinstance(s, str):
        # Python 3 hands back bytes; the parser works on text.
        s = s.decode("utf-8", "replace")
    printRTable(tabulate(s))
    sys.stdout.write("\n")


if __name__ == "__main__":
    main()
Sample Output
krivard@localhost python $ ./htmlorganizer.py http://faclin01.olin.edu/~krivard/index.html Badly formed. Want: blockquote Have: body html: text: head: title: text: krivard body: text: h3: text: krivard on faclin01 text: <hr> text: <br> b: text: New and improved Clicker: blockquote: text: The server runs under a: text: this jar text: . The client is a: text: here text: . Run tt: text: java -jar questn.jar text: , then load the client in your browser, then enter a question into the server and hit "Send," and the client should register it. You can also hit "Send" from the little Server Control widget(small) and it'll send the client a timestamp. Also recommended is running the client using JDK's tt: text: appletviewer text: in another thread from the command line. To do: ol: text: text: <li>Finish the client UI so you can see all the bits of the question(answers too). text: <li>Make the client active so you can send back your response. text: <li>Make the server care about responses. text: <li>Make a i: text: simple text: display UI. text: <li>Make the server UI less hacked. (Well, make everything less hacked) text: <li>Revise as necessary. Specifically, this app is much huger than I initially thought it would be, which may not be necessary. text: <li>Finish support for different question types(radio vs checkbox) and different response types(final vs write-in); figure out a way to munge write-ins into the display UI. text: <li>Add support for multiple questions. i: text: ...Just kidding. To do first: fix the AccessControl bug. Eww....
Lex/YACC Parser
This doesn't build a parse tree, but it does tell you what tags the text it's looking at is nested in, and a recursive treebuilder wouldn't be too hard to implement. Plus, this uses Lex/YACC, which is just cool.
I used PLY, Python Lex - Yacc. It's not too bad once you start to get to know it.
S statement; T tagexp; Ot opentagexp; E expression; Ct closetagexp; X text
S => T
T => Ot E Ct
E => E E   (this doesn't look happy, but..)
   | X
   | T
import re
import sys

tokens = (
    'TEXT', 'OPENTAG', 'CLOSETAG',
)

# tokens:
t_TEXT = r'[^<>]+'


# NOTE: in the t_* and p_* functions below the docstring IS the rule (a
# regex for tokens, a grammar production for the parser), so explanatory
# notes go in comments, never in the docstring.

def t_OPENTAG(t):
    r'<[a-zA-Z][a-zA-Z0-9]*( [^><]*)?>'
    # The optional group accepts a whole attribute string, so tags such as
    # <body bgcolor="#ffffff"> in the test data below are recognised, and
    # digits are allowed after the first letter (h3).
    # The token's value is reduced to the bare tag name.
    bits = re.split('[^a-zA-Z0-9]', t.value)
    t.value = bits[1]
    return t


def t_CLOSETAG(t):
    r'</[a-zA-Z][a-zA-Z0-9]*>'
    # Strip the leading "</" and trailing ">" to leave the tag name.
    t.value = t.value[2:-1]
    return t


t_ignore = "\n"


def t_error(t):
    print("Illegal character '%s'" % t.value[0])
    # NOTE(review): t.skip() is kept from the original; newer PLY releases
    # may expect t.lexer.skip(1) -- confirm against the installed version.
    t.skip(1)


# build the lexer
import lex
lex.lex()

# test data
data = '''<html> <head> <title>My Test</title></head> <body bgcolor="#ffffff"> <i>this</i> is a <b>test.</b> </body></html>'''
if len(sys.argv) > 1:
    f = open(sys.argv[1], 'r')
    try:
        data = f.read()
    finally:
        # close the file even if the read fails
        f.close()

# i need to figure out how this actually works, and not just guess
precedence = (
    ('left', 'TEXT', 'expression', 'tagexp'),
)

# parse "tree" (not really): the names of the tags currently open
tagstack = []


def p_statement_expr(t):
    'statement : tagexp'
    print(t[1])


def p_expression_xfirst(t):
    '''expression : TEXT
                  | TEXT expression'''
    textadded(t[1])


def textadded(s):
    """Report a piece of text together with the current tag stack."""
    print("Added '%s' in %s" % (s, tagstack))


def p_expression_xsecond(t):
    'expression : expression TEXT'
    textadded(t[2])


def p_expression_tagexp(t):
    '''expression : tagexp
                  | expression expression'''


def p_tagexp(t):
    'tagexp : opentagexp expression closetagexp'


def p_opentagexp(t):
    'opentagexp : OPENTAG'
    tagstack.append(t[1])


def p_closetagexp(t):
    'closetagexp : CLOSETAG'
    # Close the innermost open tag of this name. list.remove() would drop
    # the outermost one and raise ValueError when the name is not open.
    for pos in range(len(tagstack) - 1, -1, -1):
        if tagstack[pos] == t[1]:
            del tagstack[pos]
            break
    else:
        print("Unmatched close tag '%s'" % t[1])


def p_error(t):
    print("Syntax error at '%s'" % t)


import yacc
yacc.yacc()
result = yacc.parse(data)
print(result)
print(tagstack)