BeautifulSoup
BeautifulSoup is an excellent Python library for parsing HTML content. It is invaluable when you are trying to extract content from web pages, and it can parse even badly formed HTML files without trouble.
Example usage
import re
import urllib2

from BeautifulSoup import BeautifulSoup


class FlickrPickr(object):
    """Derive the static image URLs of a Flickr photo from its page URL.

    The constructor downloads the photo's square-size page
    (``<url>/sizes/sq/``), takes the first element whose ``src`` ends in
    ``jpg`` and splits that address into the parts that the ``get_*``
    methods recombine.

    Attributes:
        url: the photo page URL exactly as passed to the constructor.
        baseurl: the image address up to (not including) the file name.
        id: first ``_``-separated field of the image file name
            (presumably the photo id).
        extra: second ``_``-separated field of the image file name.
    """

    def __init__(self, url):
        """Fetch the square-size page for *url* and parse its image address.

        Args:
            url: photo page URL, with or without a trailing slash.

        Raises:
            ValueError: if the page contains no element whose ``src``
                ends in ``jpg``.
            IndexError: if the image file name has no ``_`` separator.
        """
        self.url = url
        # Normalise the trailing slash so 'http://.../123' and
        # 'http://.../123/' both yield a valid sizes URL.
        sizes_url = '%s/sizes/sq/' % (url.rstrip('/'), )

        page = urllib2.urlopen(sizes_url)
        try:
            # BeautifulSoup consumes the response here, so the connection
            # can be released as soon as the soup is built.
            soup = BeautifulSoup(page)
        finally:
            page.close()

        matches = soup.findAll(src=re.compile(r'jpg$'))
        if not matches:
            # Fail now with a clear message instead of leaving the object
            # half-initialised and failing later inside a getter.
            raise ValueError('no JPEG image found on %s' % (sizes_url, ))

        # Split '<baseurl>/<id>_<extra>_<size>.jpg' into its parts.
        self.baseurl, _, filename = matches[0]['src'].rpartition('/')
        fields = filename.split('.')[0].split('_')
        self.id = fields[0]
        self.extra = fields[1]

    def get_any(self, endbit):
        """Return ``<baseurl>/<id>_<extra>_<endbit>.jpg``.

        Args:
            endbit: size suffix placed before the ``.jpg`` extension.
        """
        filename = '_'.join([self.id, self.extra, endbit + '.jpg'])
        return '/'.join([self.baseurl, filename])

    def get_square(self):
        """Return the URL with the ``s`` size suffix."""
        return self.get_any('s')

    def get_thumb(self):
        """Return the URL with the ``t`` size suffix."""
        return self.get_any('t')

    def get_small(self):
        """Return the URL with the ``m`` size suffix."""
        return self.get_any('m')

    # NOTE: there is deliberately no get_original(); building the URL with
    # an 'o' suffix via get_any() was tried and does not work.

    def get_medium(self):
        """Return ``<baseurl>/<id>_<extra>.jpg`` (no size suffix)."""
        filename = '_'.join([self.id, self.extra + '.jpg'])
        return '/'.join([self.baseurl, filename])


def main():
    """Print the derived image URLs for a sample photo."""
    fp = FlickrPickr('http://flickr.com/photos/btbytes/2818112372/')
    # Single-argument print(...) behaves identically on Python 2.
    print(fp.get_square())
    print(fp.get_thumb())
    print(fp.get_small())
    print(fp.get_medium())


if __name__ == '__main__':
    main()
§