crawler update: first statistics + some fixes

Michael Scholz 2013-05-14 18:39:03 +02:00
parent c95757f693
commit 2e6037954b


@@ -22,6 +22,9 @@ visitedSites = 0
prohibitedSites = 0
visitedUrls = [] # save already visited urls, so no url will be visited more than once
numberHyperlink = {} # save number of hyperlinks...
numberHyperlinksPerPage = {} # save number of hyperlinks per page
visitedHostsWithTimestamp = {} # save visited hosts with timestamp
robotsTxtResults = {} # save robots.txt
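The new counters are plain dicts updated with an explicit if/else in the hunks below; a minimal sketch of the same bookkeeping using collections.defaultdict, which starts every missing key at 0 (an alternative style, not what this commit uses; the countUrl helper is illustrative):

    # sketch only: defaultdict(int) removes the need for an 'if key in dict' branch
    from collections import defaultdict

    numberHyperlink = defaultdict(int)          # times a url was drawn from the queue
    numberHyperlinksPerPage = defaultdict(int)  # hyperlinks pointing to each host
    visitedHostsWithTimestamp = {}              # host -> unix timestamp of the last visit

    def countUrl(url):
        numberHyperlink[url] += 1               # missing keys start at 0 automatically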
@@ -46,12 +49,12 @@ def checkRobotsTxt(url):
return True # return true if robots.txt doesn't exist
else:
if rp.can_fetch("*", url):
print "Checking robots.txt ("+robotsUrl+") \n "+colored("-> Allowed to visit :) "+url, "green")
print "checking robots.txt ("+robotsUrl+") \n "+colored("-> allowed to visit :) "+url, "green")
global visitedSites
visitedSites += 1
return True
else:
print "Checking robots.txt ("+robotsUrl+") \n "+colored("-> Not allowed to visit :( "+url, "red")
print "checking robots.txt ("+robotsUrl+") \n "+colored("-> not allowed to visit :( "+url, "red")
global prohibitedSites
prohibitedSites += 1
return False
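checkRobotsTxt builds on the stdlib robotparser module; for context, a self-contained Python 2 sketch of the same idea, caching one parser per host in a dict like robotsTxtResults (the helper name and the exact cache layout are assumptions, not taken from the diff):

    import robotparser
    from urlparse import urlparse, urljoin

    robotsTxtCache = {}  # host -> RobotFileParser (assumed cache layout)

    def allowedByRobots(url, userAgent="*"):
        host = urlparse(url).netloc
        if host not in robotsTxtCache:
            rp = robotparser.RobotFileParser()
            rp.set_url(urljoin("http://" + host, "/robots.txt"))
            try:
                rp.read()                    # fetch and parse robots.txt
            except IOError:
                return True                  # unreadable robots.txt -> treat as allowed
            robotsTxtCache[host] = rp
        return robotsTxtCache[host].can_fetch(userAgent, url)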
@@ -67,6 +70,13 @@ def canonicalUrl(url):
def getNextUrlToVisit():
url = random.choice(extractor.urls)
if url in numberHyperlink:
numberHyperlink[url] += 1
else:
numberHyperlink[url] = 1
host = urlparse(url).netloc
## check if url is blocked by robots.txt or was already visited ##
@@ -77,12 +87,14 @@ def getNextUrlToVisit():
## enforce a minimum delay between visits to the same host (2 seconds)
if host in visitedHostsWithTimestamp:
timestamp = visitedHostsWithTimestamp[host]
if (int(time.time()) - timestamp) < timeBetweenSameHost:
secondsSinceLastVisit = int(time.time()) - timestamp
if secondsSinceLastVisit > timeBetweenSameHost:
visitedHostsWithTimestamp[host] = int(time.time())
visitedUrls.append(url)
return url
else:
print colored(" -> give Host ("+host+") a break", "red")
secondsToWait = timeBetweenSameHost - secondsSinceLastVisit
print colored(" -> give host ("+host+") a break (wait at least: "+str(secondsToWait)+" seconds)", "red")
return getNextUrlToVisit()
else:
visitedHostsWithTimestamp[host] = int(time.time())
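The fixed delay check compares the seconds since the last visit against timeBetweenSameHost; isolated as a standalone helper, the logic looks like this (function name and signature are illustrative, not part of the commit):

    import time

    timeBetweenSameHost = 2  # seconds, as in the comment above

    def hostReady(host, lastVisitByHost, minDelay=timeBetweenSameHost):
        # return True and record the visit if the host was not contacted
        # within the last minDelay seconds
        now = int(time.time())
        last = lastVisitByHost.get(host)
        if last is None or now - last > minDelay:
            lastVisitByHost[host] = now
            return True
        return False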
@@ -100,7 +112,15 @@ class URLLister(SGMLParser):
href = [v for k, v in attrs if k=='href']
if href:
if canonicalUrl(href[0]):
self.urls.append(href[0])
self.urls.append(href[0])
# count number of hyperlinks pointing to each host
if urlparse(href[0]).netloc in numberHyperlinksPerPage:
numberHyperlinksPerPage[urlparse(href[0]).netloc] += 1
else:
numberHyperlinksPerPage[urlparse(href[0]).netloc] = 1
startsite = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
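URLLister follows the classic sgmllib pattern: reset initializes the url list and start_a fires on every <a> tag. A stripped-down Python 2 sketch of such a link collector, without the statistics bookkeeping added in this commit:

    from sgmllib import SGMLParser

    class LinkLister(SGMLParser):
        # collects the href attribute of every <a> tag fed to the parser
        def reset(self):
            SGMLParser.reset(self)
            self.urls = []

        def start_a(self, attrs):
            href = [v for k, v in attrs if k == 'href']
            if href:
                self.urls.append(href[0])

    # usage: lister = LinkLister(); lister.feed(html); lister.close(); print lister.urls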
@@ -116,8 +136,11 @@ numberOfSites = 1000
while(i <= numberOfSites):
url = getNextUrlToVisit()
print colored("("+str(i)+"/"+str(numberOfSites)+") currently visiting url: "+url, "blue")
page = urllib.urlopen(url)
extractor.feed(page.read())
try:
page = urllib.urlopen(url)
extractor.feed(page.read())
except:
print colored("("+str(i)+"/"+str(numberOfSites)+") can't read url: "+url, "red")
i += 1
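The new try/except keeps the loop alive when a page cannot be fetched; a sketch of the same guard that narrows the bare except to IOError, which urllib.urlopen raises on connection failures (catching only IOError is a design choice of this sketch, not what the commit does):

    import urllib

    def fetchPage(url):
        # return the page body, or None if the url cannot be read
        try:
            page = urllib.urlopen(url)
            return page.read()
        except IOError, e:       # urllib signals connection errors as IOError
            print "can't read url: " + url + " (" + str(e) + ")"
            return None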
@@ -127,6 +150,29 @@ print "\n \n ==== robots.txt ===="
print "Visited Sites: "+str(visitedSites)
print "Prohibited by robots.txt: "+str(prohibitedSites)
## print table: number of hyperlinks per website ##
print "\n \n ==== numberHyperlink ===="
print "#Hyperlinks \t Website"
keys = numberHyperlink.keys()
keys.sort( lambda x,y: cmp(numberHyperlink[y], numberHyperlink[x]) ) # sort keys descending by count
for u in keys:
print str(numberHyperlink[u])+"\t \t \t"+u
## print table: number of hyperlinks to each page ##
print "\n \n ==== numberHyperlinksPerPage ===="
print "#HyperlinksToPage \t Website"
keys = numberHyperlinksPerPage.keys()
keys.sort( lambda x,y: cmp(numberHyperlinksPerPage[y], numberHyperlinksPerPage[x]) ) # sort keys descending by count
for u in keys:
print str(numberHyperlinksPerPage[u])+"\t \t \t"+u
print "\n \n ==== url queue ===="
for u in extractor.urls:
pass
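The cmp-based keys.sort calls above can also be written with sorted() and a key function; a sketch of the same descending order as a small helper that works for either counter dict (the helper name is illustrative):

    def printCounterTable(counter, headline):
        # print a counter dict as a two-column table, largest count first
        print "\n \n ==== " + headline + " ===="
        print "#Count \t Website"
        for site, count in sorted(counter.items(), key=lambda item: item[1], reverse=True):
            print str(count) + "\t \t \t" + site

    # usage: printCounterTable(numberHyperlinksPerPage, "numberHyperlinksPerPage")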