#!/usr/bin/python
# matt.joyce@gmail.com
# April 2007
#
# fetch a web page and extract any amazon asin numbers
# store asin in a list and pickle them to a file.
#
# based on work by William K Turkel http://digitalhistoryhacks.blogspot.com

import urllib
import re
import pickle


# The page we want to scrape.
URL = 'http://digitalhistoryhacks.blogspot.com/2007/01/readings-for-field-in-digital-history.html'

# The pattern we want to look for: the link fragment "dp/" followed by a
# run of digits and any trailing "X" characters; the captured group is the
# ASIN. Written as a raw string so the backslash reaches the regex engine
# untouched instead of relying on an unrecognised string escape.
PAT = r"dp\/([0-9]+[X]*)"

# The prefix for the output filename.
filename = 'dhh'

# Download the web page.
# NOTE: urllib.urlopen is the Python 2 API; this script targets Python 2.
page = urllib.urlopen(URL)
pattern = re.compile(PAT, re.IGNORECASE)
scrapelist = []

# Test each line of HTML, looking for our pattern. The connection is
# closed explicitly, even if reading fails part-way through.
try:
    for line in page.readlines():
        scrapelist.extend(pattern.findall(line))
finally:
    page.close()

# Use a set to remove duplicates, then report how many unique ASINs we found.
initial_asins = set(scrapelist)
print(len(initial_asins))

# Save results. The file is opened in binary mode so the pickle stream is
# written without newline translation, and closed explicitly so the data
# is flushed to disk rather than left to garbage collection.
outfile = open(filename + '_asins.pik', 'wb')
try:
    pickle.dump(initial_asins, outfile)
finally:
    outfile.close()
