diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
index 4a2f3e05..ef142433 100644
--- a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
+++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
@@ -1,11 +1,10 @@
-import urllib
+import urllib2
 import random
 import robotparser
 from sgmllib import SGMLParser
 from urlparse import urlparse
 import time
-import sys
-from termcolor import colored, cprint
+from termcolor import colored
 
 '''
 TODO:
@@ -61,7 +60,11 @@ def checkRobotsTxt(url):
 def canonicalUrl(url):
     o = urlparse(url)
     if o.scheme=='http' and not "pdf" in o.path and not ".." in o.geturl():
-        return True
+        if ".html" in o.path:
+            return True
+        if "." not in o.path:
+            return True
+        return False
     else:
         return False
 
@@ -122,7 +125,7 @@ class URLLister(SGMLParser):
 
 
 startsite = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
-page = urllib.urlopen(startsite)
+page = urllib2.urlopen(startsite, timeout = 5)
 print "currently visited url: "+startsite
 extractor = URLLister()
 extractor.feed(page.read())
@@ -135,7 +138,7 @@ while(i <= numberOfSites):
     url = getNextUrlToVisit()
     print colored("("+str(i)+"/"+str(numberOfSites)+") currently visiting url: "+url, "blue")
     try:
-        page = urllib.urlopen(url)
+        page = urllib2.urlopen(url, timeout = 5)
         extractor.feed(page.read())
         global visitedSites
         visitedSites += 1
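
The substance of this patch is twofold. First, switching from urllib to urllib2 is what makes the timeout possible: urllib.urlopen() in Python 2 accepts no timeout argument, so a single stalled host could hang the crawler indefinitely, whereas urllib2.urlopen() takes a timeout parameter (Python 2.6+). A minimal sketch of how such a fetch can be guarded, in the same Python 2 style as the crawler (the helper name fetchPage and its error handling are illustrative, not part of the patch):

import urllib2
import socket

def fetchPage(url, timeout=5):
    # urllib2.urlopen raises urllib2.URLError for unreachable hosts and
    # socket.timeout once the deadline passes; catch both and skip the URL.
    try:
        page = urllib2.urlopen(url, timeout=timeout)
        return page.read()
    except (urllib2.URLError, socket.timeout) as e:
        print "skipping %s (%s)" % (url, e)
        return None

Second, canonicalUrl() no longer accepts every non-PDF http URL: a URL now passes only if its path names an .html page or carries no file extension at all (a directory-style URL), which keeps images, archives, and other binary assets out of the crawl frontier. A few illustrative calls against the patched function (the example paths are made up; only the startsite host is real):

from urlparse import urlparse

def canonicalUrl(url):
    o = urlparse(url)
    if o.scheme=='http' and not "pdf" in o.path and not ".." in o.geturl():
        if ".html" in o.path:
            return True
        if "." not in o.path:
            return True
        return False
    else:
        return False

print canonicalUrl("http://www.ke.tu-darmstadt.de/lehre/arbeiten")    # True: extension-free path
print canonicalUrl("http://www.ke.tu-darmstadt.de/lehre/page.html")   # True: .html page
print canonicalUrl("http://www.ke.tu-darmstadt.de/lehre/slides.pdf")  # False: pdf is filtered out
print canonicalUrl("ftp://www.ke.tu-darmstadt.de/lehre")              # False: only http passes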