From a136dc18f599b70458c3a22287958532062a14dd Mon Sep 17 00:00:00 2001
From: Michael Scholz
Date: Tue, 14 May 2013 18:47:34 +0200
Subject: [PATCH] last small fix for today

---
 ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
index 11e9da43..4a2f3e05 100644
--- a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
+++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
@@ -50,8 +50,6 @@ def checkRobotsTxt(url):
     else:
         if rp.can_fetch("*", url):
             print "checking robots.txt ("+robotsUrl+") \n "+colored("-> allowed to visit :) "+url, "green")
-            global visitedSites
-            visitedSites += 1
             return True
         else:
             print "checking robots.txt ("+robotsUrl+") \n "+colored("-> not allowed to visit :( "+url, "red")
@@ -139,6 +137,8 @@ while(i <= numberOfSites):
     try:
         page = urllib.urlopen(url)
         extractor.feed(page.read())
+        global visitedSites
+        visitedSites += 1
     except:
         print colored("("+str(i)+"/"+str(numberOfSites)+") can't read url: "+url, "red")
     i += 1
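
This patch moves the visitedSites increment out of checkRobotsTxt() and into the
fetch loop, so the counter tracks pages that were actually downloaded rather than
pages that robots.txt merely allowed. A minimal Python 2 sketch of the loop after
the patch, for illustration only: the seed URL list and the summary print are
hypothetical stand-ins for parts of crawler.py not shown in the diff, the
extractor call is reduced to a plain read, and the script's bare except is
narrowed to IOError (what urllib.urlopen raises on failure in Python 2).

    import urllib

    urls = ["http://example.com/", "http://example.org/"]  # hypothetical seed URLs
    numberOfSites = len(urls)
    visitedSites = 0  # after the patch: counts successful fetches only

    i = 1
    while i <= numberOfSites:
        url = urls[i - 1]
        try:
            page = urllib.urlopen(url)
            page.read()  # the real script feeds this into its extractor
            # the patch increments here, on a successful fetch; the added
            # 'global' statement is redundant at module level and is omitted
            visitedSites += 1
        except IOError:
            print "(" + str(i) + "/" + str(numberOfSites) + ") can't read url: " + url
        i += 1

    print "visited " + str(visitedSites) + " of " + str(numberOfSites) + " sites"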