diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
index 11e9da43..4a2f3e05 100644
--- a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
+++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
@@ -50,8 +50,6 @@ def checkRobotsTxt(url):
     else:
         if rp.can_fetch("*", url):
             print "checking robots.txt ("+robotsUrl+") \n "+colored("-> allowed to visit :) "+url, "green")
-            global visitedSites
-            visitedSites += 1
             return True
         else:
             print "checking robots.txt ("+robotsUrl+") \n "+colored("-> not allowed to visit :( "+url, "red")
@@ -139,6 +137,8 @@ while(i <= numberOfSites):
     try:
         page = urllib.urlopen(url)
         extractor.feed(page.read())
+        global visitedSites
+        visitedSites += 1
     except:
         print colored("("+str(i)+"/"+str(numberOfSites)+") can't read url: "+url, "red")
     i += 1
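
The diff moves the visitedSites increment out of checkRobotsTxt and into the fetch loop, so the counter advances only after a page has actually been opened and read; before, it counted every URL that robots.txt merely permitted, including URLs whose fetch later failed. A minimal Python 2 sketch of the corrected counting logic follows (variable names are taken from the diff; fetchAndCount is a hypothetical helper, and the robots.txt check and the extractor are elided):

    import urllib

    visitedSites = 0

    def fetchAndCount(url):
        """Hypothetical helper: fetch url and bump the counter
        only if the read succeeds."""
        global visitedSites
        try:
            page = urllib.urlopen(url)
            data = page.read()       # urllib raises IOError on an unreachable URL
            visitedSites += 1        # counted only after a successful fetch
            return data
        except IOError:
            return None              # failed fetches are not counted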