I had played with Python before to scrape data from the web and thought I would give it a go again. Here is what I used to get the desired content and create HTML files that I could use on my own website:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib | |
import re | |
def find_between(s, first, last):
    """Return the part of *s* found between two marker substrings.

    The search for *last* begins immediately after the end of the first
    occurrence of *first*, so the result is the shortest span enclosed by
    the two markers. Neither marker is included in the result.

    Args:
        s: The text to search.
        first: Marker that precedes the wanted text.
        last: Marker that follows the wanted text.

    Returns:
        The text between the markers, or an empty string when *first* is not
        in *s* or *last* does not occur after it.
    """
    try:
        # index() raises ValueError when a marker is absent; only these two
        # lookups can raise it, so the try body is kept to just them.
        start = s.index(first) + len(first)
        end = s.index(last, start)
    except ValueError:
        return ""
    return s[start:end]
# Markers that bracket the wanted content in each downloaded page. They are the
# same for every page, so they are built once rather than on every iteration.
titleStart = '<font color="#660000" size="2" face="Verdana">'
tableStart = '<table border="0" cellpadding="0" cellspacing="0" style="border-collapse: collapse" bordercolor="#111111" width="97%" id="AutoNumber4">'
contentEnd = '<td width="98%"><p align="right"><font color="#000000" size="2" face="Verdana">By More stuff here</font><br>'
tableEnd = "</table></body></html>"

# Output files are numbered 1, 2, 3, ... so that a Rails action can later
# access them by the numbers 1..endNumber.
fileName = 1
for n in range(1, 50):  # requests look_up.php?1 .. look_up.php?49 (end is exclusive)
    # Download the full page source.
    # NOTE: urllib.urlopen is the Python 2 API. Its response object is not a
    # context manager, so it is closed explicitly even if read() fails.
    response = urllib.urlopen("http://TheURLHere/look_up.php?" + str(n))
    try:
        content = response.read()
    finally:
        response.close()

    # The page title sits between this <font> tag and the next <br>.
    title = find_between(content, titleStart, '<br>')

    # The content that is actually wanted: everything after the opening
    # <table> tag up to the "By ..." footer cell. find_between returns "" when
    # a marker is missing, in which case the file gets an empty table.
    scrapedText = find_between(content, tableStart, contentEnd)

    # Wrap the scraped fragment in a minimal HTML document, re-adding the
    # opening <table> tag that find_between strips off.
    htmlStart = "<!DOCTYPE html><html><head><title>" + title + "</title></head><body>"

    # 'with' guarantees the file is closed even if the write raises.
    with open("temp/" + str(fileName) + ".html", "wb") as fo:
        fo.write(htmlStart + tableStart + scrapedText + tableEnd)
    fileName += 1