My local about.com page posts health department ratings every so often, but I dislike the format, so I wrote this to parse the pages into something I can use. The page URLs are hard-coded, but it is a decent example of working with tables and XPath in HtmlUnit:
/**
 * Evaluates an XPath expression against a page and returns the single node it
 * selects, cast to {@link HtmlElement}.
 *
 * @param exp  the XPath expression to compile
 * @param page the page used as the evaluation context
 * @return the selected node as an {@code HtmlElement}; presumably {@code null}
 *         when nothing matches (Jaxen convention -- confirm against the
 *         library version in use)
 * @throws JaxenException if the expression cannot be compiled or evaluated
 */
private HtmlElement getHtmlElementByXPath(final String exp, final HtmlPage page)
        throws JaxenException {
    return (HtmlElement) new HtmlUnitXPath(exp).selectSingleNode(page);
}
/**
 * Renders one table row as a single CSV line.
 *
 * Every cell's text is wrapped in double quotes and cells are separated by
 * commas; the line is terminated with {@code '\n'}. Double quotes embedded in a
 * cell's text are doubled so the quoted field stays well-formed.
 *
 * @param row the table row to convert
 * @return the CSV line for the row, or an empty string if the row has no cells
 */
private String rowToCSV(final HtmlTableRow row) {
    final StringBuilder line = new StringBuilder();
    final Iterator cellIterator = row.getCells().iterator();
    while (cellIterator.hasNext()) {
        final String cellText = ((HtmlTableCell) cellIterator.next()).asText();
        line.append('"');
        // Escape embedded quotes by doubling them; otherwise a cell containing
        // a '"' would terminate the quoted field early and corrupt the line.
        line.append(cellText.replace("\"", "\"\""));
        line.append('"');
        // Comma between cells, newline after the last one.
        line.append(cellIterator.hasNext() ? ',' : '\n');
    }
    return line.toString();
}
/**
 * Converts an HTML table to CSV text, one line per table row.
 *
 * The first row of the table is treated as the header row. It is always
 * consumed, and is only written to the output when {@code skipheader} is
 * {@code false}.
 *
 * @param table      the table to convert
 * @param skipheader {@code true} to omit the first (header) row from the output
 * @return the CSV text for the table; an empty string if the table has no rows
 */
private String tableToCSV(final HtmlTable table, final boolean skipheader) {
    final StringBuilder csv = new StringBuilder();
    final Iterator rowIterator = table.getRows().iterator();
    // A table without any rows has no header to consume; bail out instead of
    // letting next() throw NoSuchElementException.
    if (!rowIterator.hasNext()) {
        return csv.toString();
    }
    final HtmlTableRow headerRow = (HtmlTableRow) rowIterator.next();
    if (!skipheader) {
        csv.append(rowToCSV(headerRow));
    }
    while (rowIterator.hasNext()) {
        csv.append(rowToCSV((HtmlTableRow) rowIterator.next()));
    }
    return csv.toString();
}
/**
 * Fetches the ten hard-coded ratings pages, extracts the table selected by
 * {@code //table[@height=145]} from each one, and prints all of them as a
 * single CSV document on standard output.
 *
 * The header row is emitted only for the first page so the combined output
 * has exactly one header line.
 *
 * @throws IllegalStateException if a page does not contain the expected table
 * @throws Exception on any URL, network, or XPath failure
 */
public void testScores() throws Exception {
    // '#' is a placeholder replaced by the page number (1 through 10).
    final String urlBase = "http://huntsville.about.com/library/blhealthdept031805#.htm";
    final List urlList = new ArrayList();
    for (int i = 1; i < 11; i++) {
        urlList.add(new URL(urlBase.replace("#", Integer.toString(i))));
    }

    final WebClient client = new WebClient();
    client.setJavaScriptEnabled(false);

    final StringBuilder csv = new StringBuilder();
    boolean skipheader = false;
    final Iterator iter = urlList.iterator();
    while (iter.hasNext()) {
        final URL url = (URL) iter.next();
        final HtmlPage page = (HtmlPage) client.getPage(url);
        final HtmlTable table = (HtmlTable) getHtmlElementByXPath("//table[@height=145]", page);
        // Fail with a message naming the page rather than an anonymous
        // NullPointerException further down when the XPath selects nothing.
        if (table == null) {
            throw new IllegalStateException("No table matching //table[@height=145] on " + url);
        }
        csv.append(tableToCSV(table, skipheader));
        // Only the first page contributes its header row.
        skipheader = true;
    }
    System.out.println(csv);
}