web mining exercise 2 - crawler template

Michael Scholz 2013-05-13 22:58:35 +02:00
parent 2f35427371
commit 3b3a106b6e


@@ -0,0 +1,53 @@
import urllib
import random
from sgmllib import SGMLParser
from urlparse import urlparse

'''
TODO:
- canonize urls -> canonize? slides? (a possible sketch follows below)
- server timeout -> save crawled host, set timeout for crawled host (see the timeout sketch below)
- statistics -> http://www.ke.tu-darmstadt.de/lehre/ss13/web-mining/uebung2.html (see the host_counts sketch at the end)
'''
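
# One possible reading of the "canonize urls" TODO above (a sketch, not the
# definition from the slides): resolve relative links, lower-case scheme and
# host, drop the fragment and a default :80 port. canonize() is a hypothetical
# helper name, not part of the original template.
from urlparse import urljoin, urlunparse

def canonize(base_url, href):
    absolute = urljoin(base_url, href)      # resolve relative links against the current page
    o = urlparse(absolute)
    netloc = o.netloc.lower()
    if netloc.endswith(':80'):
        netloc = netloc[:-3]                # drop the default http port
    path = o.path or '/'                    # treat an empty path as "/"
    return urlunparse((o.scheme.lower(), netloc, path, o.params, o.query, ''))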

class URLLister(SGMLParser):
    def reset(self):
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        href = [v for k, v in attrs if k == 'href']
        if href:
            # canonize url
            o = urlparse(href[0])
            # only use absolute http urls that are not already queued and do not point to a pdf
            if o.scheme == 'http' and (o.geturl() not in self.urls) and "pdf" not in o.path:
                self.urls.append(o.geturl())
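
# One possible reading of the "server timeout" TODO (a sketch, assuming a simple
# global socket timeout is enough for the exercise; a real crawler would rather
# remember each crawled host and throttle/timeout per host):
import socket
socket.setdefaulttimeout(5)     # every urlopen() below then gives up after 5 seconds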

startsite = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
page = urllib.urlopen(startsite)
print "currently visited url: " + startsite
extractor = URLLister()
extractor.feed(page.read())

i = 1
# crawl 100 sites...
while i <= 100:
    url = random.choice(extractor.urls)
    ## remove url from queue
    extractor.urls.remove(url)
    print "currently visited url: " + url
    page = urllib.urlopen(url)
    extractor.feed(page.read())
    i = i + 1
extractor.close()

print "\n \n ==== url queue ===="
for u in extractor.urls:
    print u
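
# One possible starting point for the "statistics" TODO (a sketch; the exact
# figures that are asked for are listed on the uebung2 page linked above).
# host_counts and count_host() are hypothetical names, not part of the template.
host_counts = {}

def count_host(visited_url):
    host = urlparse(visited_url).netloc.lower()
    host_counts[host] = host_counts.get(host, 0) + 1

# calling count_host(url) after each urlopen() and printing host_counts at the
# end would give the distribution of the 100 fetched pages over hosts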