From 2e6037954bb22c0dd2e9f1f8c2565bf6d55bd336 Mon Sep 17 00:00:00 2001
From: Michael Scholz
Date: Tue, 14 May 2013 18:39:03 +0200
Subject: [PATCH] crawler update: first statistics + some fixes

---
 .../Uebungen/2_Uebung/crawler/crawler.py | 58 ++++++++++++++++---
 1 file changed, 51 insertions(+), 7 deletions(-)

diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
index 2fadb4e4..11e9da43 100644
--- a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
+++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
@@ -22,6 +22,9 @@ visitedSites = 0
 prohibitedSites = 0
 
 visitedUrls = [] # safe already visited urls, so no url will be visited more than once
+numberHyperlink = {} # store how often each url has been drawn from the url queue
+numberHyperlinksPerPage = {} # store the number of hyperlinks pointing to each host
+
 visitedHostsWithTimestamp = {} # safe visited hosts with timestamp
 robotsTxtResults = {} # safe robots.txt
 
@@ -46,12 +49,12 @@ def checkRobotsTxt(url):
         return True # return true if robots.txt doesn't exist
     else:
         if rp.can_fetch("*", url):
-            print "Checking robots.txt ("+robotsUrl+") \n "+colored("-> Allowed to visit :) "+url, "green")
+            print "checking robots.txt ("+robotsUrl+") \n "+colored("-> allowed to visit :) "+url, "green")
             global visitedSites
             visitedSites += 1
             return True
         else:
-            print "Checking robots.txt ("+robotsUrl+") \n "+colored("-> Not allowed to visit :( "+url, "red")
+            print "checking robots.txt ("+robotsUrl+") \n "+colored("-> not allowed to visit :( "+url, "red")
             global prohibitedSites
             prohibitedSites += 1
             return False
@@ -67,6 +70,13 @@ def canonicalUrl(url):
 
 def getNextUrlToVisit():
     url = random.choice(extractor.urls)
+
+    # count how often this url has been drawn from the queue
+    if url in numberHyperlink:
+        numberHyperlink[url] += 1
+    else:
+        numberHyperlink[url] = 1
+
     host = urlparse(url).netloc
 
     ## check if url is blocked by robots.txt or was already visited ##
@@ -77,12 +87,14 @@ def getNextUrlToVisit():
     ## check if host got a timeout (2 seconds)
     if host in visitedHostsWithTimestamp:
         timestamp = visitedHostsWithTimestamp[host]
-        if (int(time.time()) - timestamp) < timeBetweenSameHost:
+        secondsSinceLastVisit = int(time.time()) - timestamp
+        if secondsSinceLastVisit >= timeBetweenSameHost:
             visitedHostsWithTimestamp[host] = int(time.time())
             visitedUrls.append(url)
             return url
         else:
-            print colored(" -> give Host ("+host+") a break", "red")
+            secondsToWait = timeBetweenSameHost - secondsSinceLastVisit
+            print colored(" -> give host ("+host+") a break (wait at least: "+str(secondsToWait)+" seconds)", "red")
             return getNextUrlToVisit()
     else:
         visitedHostsWithTimestamp[host] = int(time.time())
@@ -100,7 +112,15 @@ class URLLister(SGMLParser):
         href = [v for k, v in attrs if k=='href']
         if href:
             if canonicalUrl(href[0]):
-                self.urls.append(href[0])
+                self.urls.append(href[0])
+
+                # count how many extracted hyperlinks point to this host
+                if urlparse(href[0]).netloc in numberHyperlinksPerPage:
+                    numberHyperlinksPerPage[urlparse(href[0]).netloc] += 1
+                else:
+                    numberHyperlinksPerPage[urlparse(href[0]).netloc] = 1
+
+
 
 startsite = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
 
@@ -116,8 +136,11 @@ numberOfSites = 1000
 while(i <= numberOfSites):
     url = getNextUrlToVisit()
     print colored("("+str(i)+"/"+str(numberOfSites)+") currently visiting url: "+url, "blue")
-    page = urllib.urlopen(url)
-    extractor.feed(page.read())
+    try:
+        page = urllib.urlopen(url)
+        extractor.feed(page.read())
+    except Exception:
+        print colored("("+str(i)+"/"+str(numberOfSites)+") can't read url: "+url, "red")
     i += 1
 
 
@@ -127,6 +150,27 @@ print "\n \n ==== robots.txt ===="
 print "Visited Sites: "+str(visitedSites)
 print "Prohibited by robots.txt: "+str(prohibitedSites)
 
+
+## print table: number of hyperlinks per website ##
+print "\n \n ==== numberHyperlink ===="
+print "#Hyperlinks \t Website"
+keys = numberHyperlink.keys()
+keys.sort( lambda x,y: cmp(numberHyperlink[y], numberHyperlink[x]) ) # sort keys descending by count
+
+for u in keys:
+    print str(numberHyperlink[u])+"\t \t \t"+u
+
+
+## print table: number of hyperlinks pointing to each page ##
+print "\n \n ==== numberHyperlinksPerPage ===="
+print "#HyperlinksToPage \t Website"
+keys = numberHyperlinksPerPage.keys()
+keys.sort( lambda x,y: cmp(numberHyperlinksPerPage[y], numberHyperlinksPerPage[x]) ) # sort keys descending by count
+
+for u in keys:
+    print str(numberHyperlinksPerPage[u])+"\t \t \t"+u
+
+
 print "\n \n ==== url queue ===="
 for u in extractor.urls:
     pass
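
Note on the politeness fix above: getNextUrlToVisit now returns a url only
once timeBetweenSameHost seconds have passed since its host was last
contacted, and otherwise reports how long the host still has to rest. A
minimal standalone sketch of that check; hostMayBeVisited is a hypothetical
helper written for illustration, not a function in crawler.py:

    import time

    timeBetweenSameHost = 2          # politeness interval in seconds, as in crawler.py
    visitedHostsWithTimestamp = {}   # host -> unix timestamp of the last visit

    def hostMayBeVisited(host):
        # hypothetical helper restating the patched comparison: allow a new
        # request only if the politeness interval has fully elapsed
        now = int(time.time())
        last = visitedHostsWithTimestamp.get(host)
        if last is None or now - last >= timeBetweenSameHost:
            visitedHostsWithTimestamp[host] = now
            return True
        return False

    print(hostMayBeVisited("www.ke.tu-darmstadt.de"))   # True on first contact
    print(hostMayBeVisited("www.ke.tu-darmstadt.de"))   # False inside the 2-second window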
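Note on the statistics hunks: both tables are built with a hand-rolled
increment-or-initialise branch and sorted through a cmp callback. A minimal
sketch of the same per-host counting and descending printout using
collections.Counter (available since Python 2.7); the sample urls list and
the name linksPerHost are illustrative only, not part of crawler.py:

    try:
        from urlparse import urlparse        # Python 2, which crawler.py targets
    except ImportError:
        from urllib.parse import urlparse    # Python 3
    from collections import Counter

    # illustrative stand-in for the hrefs the crawler extracts
    urls = [
        "http://www.ke.tu-darmstadt.de/lehre/arbeiten",
        "http://www.ke.tu-darmstadt.de/lehre",
        "http://www.tu-darmstadt.de/",
    ]

    # count hyperlinks per target host, like numberHyperlinksPerPage does
    linksPerHost = Counter(urlparse(u).netloc for u in urls)

    # most_common() yields (host, count) pairs in descending order,
    # replacing the explicit cmp sort in the patch
    print("#HyperlinksToPage \t Website")
    for host, n in linksPerHost.most_common():
        print(str(n) + "\t \t \t" + host)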