web mining exercise 2 - crawler template

Michael Scholz 2013-05-13 22:58:35 +02:00
parent 2f35427371
commit 3b3a106b6e


@@ -0,0 +1,53 @@
import urllib
import random
from sgmllib import SGMLParser
from urlparse import urlparse

'''
TODO:
- canonize urls -> canonize? slides? (a possible sketch follows below)
- server timeout -> save crawled host, set timeout for crawled host (see the timeout sketch below)
- statistics -> http://www.ke.tu-darmstadt.de/lehre/ss13/web-mining/uebung2.html (see the host_counts sketch at the end)
'''
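
# One possible reading of the "canonize urls" TODO above (a sketch, not the
# definition from the slides): resolve relative links, lower-case scheme and
# host, drop the fragment and a default :80 port. canonize() is a hypothetical
# helper name, not part of the original template.
from urlparse import urljoin, urlunparse

def canonize(base_url, href):
    absolute = urljoin(base_url, href)      # resolve relative links against the current page
    o = urlparse(absolute)
    netloc = o.netloc.lower()
    if netloc.endswith(':80'):
        netloc = netloc[:-3]                # drop the default http port
    path = o.path or '/'                    # treat an empty path as "/"
    return urlunparse((o.scheme.lower(), netloc, path, o.params, o.query, ''))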

class URLLister(SGMLParser):
    def reset(self):
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        href = [v for k, v in attrs if k == 'href']
        if href:
            # canonize url
            o = urlparse(href[0])
            # only use absolute http urls that are not already queued and do not point to a pdf
            if o.scheme == 'http' and (o.geturl() not in self.urls) and "pdf" not in o.path:
                self.urls.append(o.geturl())
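
# One possible reading of the "server timeout" TODO (a sketch, assuming a simple
# global socket timeout is enough for the exercise; a real crawler would rather
# remember each crawled host and throttle/timeout per host):
import socket
socket.setdefaulttimeout(5)     # every urlopen() below then gives up after 5 seconds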

startsite = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
page = urllib.urlopen(startsite)
print "currently visited url: " + startsite
extractor = URLLister()
extractor.feed(page.read())

i = 1
# crawl 100 sites...
while i <= 100:
    url = random.choice(extractor.urls)
    ## remove url from queue
    extractor.urls.remove(url)
    print "currently visited url: " + url
    page = urllib.urlopen(url)
    extractor.feed(page.read())
    i = i + 1
extractor.close()

print "\n \n ==== url queue ===="
for u in extractor.urls:
    print u
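
# One possible starting point for the "statistics" TODO (a sketch; the exact
# figures that are asked for are listed on the uebung2 page linked above).
# host_counts and count_host() are hypothetical names, not part of the template.
host_counts = {}

def count_host(visited_url):
    host = urlparse(visited_url).netloc.lower()
    host_counts[host] = host_counts.get(host, 0) + 1

# calling count_host(url) after each urlopen() and printing host_counts at the
# end would give the distribution of the 100 fetched pages over hosts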