From 3b3a106b6e2708dc7acb824bf2d739101a196c7e Mon Sep 17 00:00:00 2001 From: Michael Scholz Date: Mon, 13 May 2013 22:58:35 +0200 Subject: [PATCH] web mining exercise 2 - crawler template --- .../1_Web Mining/Uebungen/2_Uebung/crawler.py | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 ss2013/1_Web Mining/Uebungen/2_Uebung/crawler.py diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler.py b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler.py new file mode 100644 index 00000000..b5fef232 --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler.py @@ -0,0 +1,53 @@ +import urllib +import random +from sgmllib import SGMLParser +from urlparse import urlparse + +''' +TODO: + - canonize urls -> canonize? slides? + - server timeout -> safe crawled host, set timeout for crawled host + - statistics -> http://www.ke.tu-darmstadt.de/lehre/ss13/web-mining/uebung2.html + +''' + +class URLLister(SGMLParser): + def reset(self): + SGMLParser.reset(self) + self.urls = [] + + def start_a(self, attrs): + + href = [v for k, v in attrs if k=='href'] + if href: + # canonize url + o = urlparse(href[0]) + + if o.scheme=='http' and (o.geturl() not in self.urls) and not "pdf" in o.path: # only use absolute urls.... + self.urls.extend([o.geturl()]) + + +startsite = "http://www.ke.tu-darmstadt.de/lehre/arbeiten" +page = urllib.urlopen(startsite) +print "currently visited url: "+startsite +extractor = URLLister() +extractor.feed(page.read()) + +i = 1 +# crawl 100 sites... +while(i <= 100): + url = random.choice(extractor.urls) + ## remove url from queue + extractor.urls.remove(url) + print "currently visited url: "+url + page = urllib.urlopen(url) + extractor.feed(page.read()) + i = i + 1 + + +extractor.close() + +for u in extractor.urls: + pass + print "\n \n ==== url queue ====" + print u \ No newline at end of file