web mining exercise 2 - crawler template
This commit is contained in:
parent 2f35427371
commit 3b3a106b6e

ss2013/1_Web Mining/Uebungen/2_Uebung/crawler.py  53  Normal file
@@ -0,0 +1,53 @@
import urllib
import random
from sgmllib import SGMLParser
from urlparse import urlparse

'''
TODO:
- canonize urls -> canonize? slides?
- server timeout -> save crawled host, set timeout for crawled host
- statistics -> http://www.ke.tu-darmstadt.de/lehre/ss13/web-mining/uebung2.html
'''
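
# The two sketches below are not part of the original exercise file; they are
# hedged suggestions for the first two TODO items above. The helper name
# canonize_url and the 10-second timeout are assumptions, not given by the exercise.

# Possible canonization helper: resolve relative links against the current page,
# lowercase scheme and host, drop the fragment and a default :80 port.
from urlparse import urlparse, urljoin, urlunparse

def canonize_url(base, href):
    o = urlparse(urljoin(base, href))      # make relative hrefs absolute
    netloc = o.netloc.lower()
    if netloc.endswith(':80'):
        netloc = netloc[:-3]               # strip the default HTTP port
    path = o.path if o.path else '/'       # normalize an empty path to '/'
    return urlunparse((o.scheme.lower(), netloc, path, o.params, o.query, ''))

# Possible timeout handling: urllib.urlopen in Python 2 takes no timeout argument,
# so a process-wide default socket timeout is one way to avoid hanging on slow hosts.
import socket
socket.setdefaulttimeout(10)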

class URLLister(SGMLParser):
    def reset(self):
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        # collect the href attribute of every <a> tag
        href = [v for k, v in attrs if k == 'href']
        if href:
            # canonize url
            o = urlparse(href[0])

            if o.scheme == 'http' and (o.geturl() not in self.urls) and "pdf" not in o.path:  # only use absolute urls....
                self.urls.append(o.geturl())

startsite = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
page = urllib.urlopen(startsite)
print "currently visited url: " + startsite
extractor = URLLister()
extractor.feed(page.read())

i = 1
# crawl 100 sites...
while i <= 100:
    if not extractor.urls:
        # stop early if the queue runs dry
        break
    url = random.choice(extractor.urls)
    ## remove url from queue
    extractor.urls.remove(url)
    print "currently visited url: " + url
    try:
        page = urllib.urlopen(url)
        extractor.feed(page.read())
    except IOError:
        # skip unreachable pages (see TODO: server timeout)
        continue
    i = i + 1

extractor.close()

# print the remaining queue once, then each url on its own line
print "\n \n ==== url queue ===="
for u in extractor.urls:
    print u