crawler: removed old imports, added timeout
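
Page fetching now goes through urllib2.urlopen with a 5-second timeout instead of
urllib.urlopen; the unused urllib and cprint imports are dropped, and canonicalUrl
is tightened to accept only .html pages and extension-less paths.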

Michael Scholz 2013-05-14 19:22:27 +02:00
parent a136dc18f5
commit 4935da85eb


@@ -1,11 +1,10 @@
-import urllib
 import urllib2
 import random
 import robotparser
 from sgmllib import SGMLParser
 from urlparse import urlparse
 import time
 import sys
-from termcolor import colored, cprint
+from termcolor import colored
 '''
 TODO:
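
Note on the swap: Python 2's urllib.urlopen() takes no timeout argument, which is
why the crawler moves to urllib2.urlopen(), whose optional timeout parameter has
existed since Python 2.6. cprint is dropped because only colored() is used below.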
@@ -61,7 +60,11 @@ def checkRobotsTxt(url):
 
 def canonicalUrl(url):
 	o = urlparse(url)
 	if o.scheme=='http' and not "pdf" in o.path and not ".." in o.geturl():
-		return True
+		if ".html" in o.path:
+			return True
+		if "." not in o.path:
+			return True
+		return False
 	else:
 		return False
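
For reference, the new filter behaves as follows; this is a standalone Python 2
sketch using the function body from this hunk, with made-up example URLs:

	from urlparse import urlparse

	def canonicalUrl(url):
		o = urlparse(url)
		if o.scheme=='http' and not "pdf" in o.path and not ".." in o.geturl():
			if ".html" in o.path:
				return True
			if "." not in o.path:
				return True
			return False
		else:
			return False

	print canonicalUrl("http://example.org/thesis.html")     # True: plain-http .html page
	print canonicalUrl("http://example.org/lehre/arbeiten")  # True: path without a file extension
	print canonicalUrl("http://example.org/thesis.pdf")      # False: pdf paths are filtered out
	print canonicalUrl("https://example.org/thesis.html")    # False: only the http scheme is crawled

The net effect of the hunk is that http URLs with any other file extension
(e.g. .php or .jpg), which previously returned True, are now rejected.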
@@ -122,7 +125,7 @@ class URLLister(SGMLParser):
 
 
 startsite = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
-page = urllib.urlopen(startsite)
+page = urllib2.urlopen(startsite, timeout = 5)
 print "currently visited url: "+startsite
 extractor = URLLister()
 extractor.feed(page.read())
@@ -135,7 +138,7 @@ while(i <= numberOfSites):
 	url = getNextUrlToVisit()
 	print colored("("+str(i)+"/"+str(numberOfSites)+") currently visiting url: "+url, "blue")
 	try:
-		page = urllib.urlopen(url)
+		page = urllib2.urlopen(url, timeout = 5)
 		extractor.feed(page.read())
 		global visitedSites
 		visitedSites += 1
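
The except clause that pairs with this try block sits outside the hunk. As a
sketch (hypothetical URL and messages), a urllib2 fetch with the new 5-second
timeout can fail in two ways worth catching:

	import socket
	import urllib2

	try:
		page = urllib2.urlopen("http://example.org/", timeout = 5)
		html = page.read()
	except urllib2.URLError as e:
		# connection errors; a timeout during connect arrives here wrapped as e.reason
		print "fetch failed: " + str(e.reason)
	except socket.timeout:
		# a timeout during read() is raised as a bare socket.timeout
		print "fetch timed out"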