update crawler

Michael Scholz 2013-05-15 15:27:09 +02:00
parent 4935da85eb
commit a64659eb1b


@@ -1,4 +1,5 @@
import urllib2
import urllib, urllib2
import sys
import random
import robotparser
from sgmllib import SGMLParser
@@ -15,12 +16,20 @@ TODO:
'''
# crawler attributes
entrypoint = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
entrypoint = "http://www.spiegel.de"
#entrypoint = "http://www.buchaktuell.de/"
numberOfPagesToCrawl = 1000
timeBetweenSameHost = 5 # 5 sec
#some variables
timeBetweenSameHost = 2 # 2 sec
visitedSites = 0
prohibitedSites = 0
visitedUrls = [] # save already visited URLs, so no URL will be visited more than once
pages = {} # downloaded pages
numberHyperlink = {} # save number of hyperlinks...
numberHyperlinksPerPage = {} # save number of hyperlinks per page
@@ -28,7 +37,7 @@ visitedHostsWithTimestamp = {} # save visited hosts with timestamp
robotsTxtResults = {} # save robots.txt results
def checkRobotsTxt(url):
def blockedByRobotsTxt(url):
o = urlparse(url)
robotsUrl = o.scheme+"://"+o.netloc+"/robots.txt"
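
For context, the hunk above renames checkRobotsTxt to blockedByRobotsTxt and builds the robots.txt URL from the target's scheme and host. Below is a minimal, self-contained sketch of that check using the stdlib robotparser module the script imports; the cache dict and helper name are illustrative, not part of the commit.

import robotparser
from urlparse import urlparse

_robots_cache = {}  # illustrative stand-in for robotsTxtResults

def blocked_by_robots(url, agent="*"):
    # build http://host/robots.txt from the target URL, as the hunk above does
    o = urlparse(url)
    robots_url = o.scheme + "://" + o.netloc + "/robots.txt"
    if robots_url not in _robots_cache:
        rp = robotparser.RobotFileParser()
        rp.set_url(robots_url)
        try:
            rp.read()                         # fetch and parse robots.txt
            _robots_cache[robots_url] = rp
        except IOError:
            _robots_cache[robots_url] = None  # unreachable robots.txt: treat as not blocked
    rp = _robots_cache[robots_url]
    if rp is None:
        return False
    return not rp.can_fetch(agent, url)       # True means: skip this URL
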
@@ -45,28 +54,29 @@ def checkRobotsTxt(url):
robotsTxtResults[robotsUrl] = None # robots.txt doesn't exist
if robotsTxtResults[robotsUrl] == None:
return True # return true if robots.txt doesn't exist
return False # return false if robots.txt doesn't exist
else:
if rp.can_fetch("*", url):
print "checking robots.txt ("+robotsUrl+") \n "+colored("-> allowed to visit :) "+url, "green")
return True
return False
else:
print "checking robots.txt ("+robotsUrl+") \n "+colored("-> not allowed to visit :( "+url, "red")
print colored("-> not allowed to visit :( "+url, "red")
global prohibitedSites
prohibitedSites += 1
return False
return True
## TODO: canonicalUrl should not only check whether the URL is valid, but also convert relative URLs to absolute ones (see the urljoin sketch after this hunk)
def canonicalUrl(url):
url = url.lower().replace(" ", "")
o = urlparse(url)
if o.scheme=='http' and not "pdf" in o.path and not ".." in o.geturl():
if ".html" in o.path:
return True
return [url]
if "." not in o.path:
return True
return False
return [url]
return []
else:
return False
return []
def getNextUrlToVisit():
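
The TODO above asks canonicalUrl to also turn relative links into absolute ones. A minimal sketch of one way to do that with urlparse.urljoin; the helper name and the base_url parameter are illustrative and not part of the commit.

from urlparse import urljoin, urlparse

def to_absolute(base_url, href):
    # resolve a (possibly relative) href against the page it was found on
    absolute = urljoin(base_url, href.strip())
    o = urlparse(absolute)
    if o.scheme == 'http':
        return [absolute]  # same convention as canonicalUrl: list with one URL
    return []              # unsupported scheme: drop the link

# e.g. to_absolute("http://www.ke.tu-darmstadt.de/lehre/arbeiten", "../bib/index.html")
# -> ["http://www.ke.tu-darmstadt.de/bib/index.html"]
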
@@ -81,25 +91,28 @@ def getNextUrlToVisit():
host = urlparse(url).netloc
## check if url is blocked by robots.txt or was already visited ##
if not checkRobotsTxt(url) or url in visitedUrls:
if blockedByRobotsTxt(url) or url in visitedUrls:
print str(len(extractor.urls))
extractor.urls.remove(url)
return getNextUrlToVisit()
## check if host got a timeout (2 seconds)
## check if host got a timeout
if host in visitedHostsWithTimestamp:
timestamp = visitedHostsWithTimestamp[host]
secondsSinceLastVisit = int(time.time()) - timestamp
if secondsSinceLastVisit > timeBetweenSameHost:
if secondsSinceLastVisit >= timeBetweenSameHost:
visitedHostsWithTimestamp[host] = int(time.time())
visitedUrls.append(url)
extractor.urls.remove(url)
return url
else:
secondsToWait = timeBetweenSameHost - secondsSinceLastVisit
print colored(" -> give host ("+host+") a break (wait at least: "+str(secondsToWait)+" seconds)", "red")
print colored(" -> give host ("+host+") a break (wait at least: "+str(secondsToWait)+" seconds)", "red")
return getNextUrlToVisit()
else:
visitedHostsWithTimestamp[host] = int(time.time())
visitedUrls.append(url)
extractor.urls.remove(url)
return url
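
A compact sketch of the per-host politeness rule this hunk adjusts (the comparison is now >=, so a host may be revisited once at least timeBetweenSameHost seconds have passed); the dict and function names below are illustrative, not part of the commit.

import time

TIME_BETWEEN_SAME_HOST = 2   # seconds, mirrors timeBetweenSameHost above
_last_visit = {}             # host -> unix timestamp of the last request

def host_ready(host):
    # True if the host was never visited or its cool-down has expired
    now = int(time.time())
    last = _last_visit.get(host)
    if last is None or now - last >= TIME_BETWEEN_SAME_HOST:
        _last_visit[host] = now  # record this visit
        return True
    return False                 # caller should pick another URL and retry later
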
@@ -112,71 +125,83 @@ class URLLister(SGMLParser):
href = [v for k, v in attrs if k=='href']
if href:
if canonicalUrl(href[0]):
self.urls.append(href[0])
# count how many hyperlinks point to each host
if urlparse(href[0]).netloc in numberHyperlinksPerPage:
numberHyperlinksPerPage[urlparse(href[0]).netloc] += 1
else:
numberHyperlinksPerPage[urlparse(href[0]).netloc] = 1
url = canonicalUrl(href[0])
self.urls.extend(url)
# count how many hyperlinks point to each URL
if href[0] in numberHyperlinksPerPage:
numberHyperlinksPerPage[href[0]] += 1
else:
numberHyperlinksPerPage[href[0]] = 1
if __name__ == "__main__":
startsite = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
page = urllib2.urlopen(startsite, timeout = 5)
print "currently visited url: "+startsite
extractor = URLLister()
extractor.feed(page.read())
page = urllib2.urlopen(entrypoint, timeout = 5)
print "currently visited url: "+entrypoint
extractor = URLLister()
extractor.feed(page.read())
page.close()
i = 1
while(i <= numberOfPagesToCrawl):
url = getNextUrlToVisit()
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") currently visiting url: "+url, "blue")
try:
page = urllib2.urlopen(url, timeout = 6)
pageContent = page.read()
page.close()
extractor.feed(pageContent)
pages[url] = pageContent
i += 1
except urllib2.HTTPError, err:
if err.code == 404:
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: not found: "+url, "red")
if err.code == 400:
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: bad request: "+url, "red")
if err.code == 403:
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: forbidden: "+url, "red")
except urllib2.URLError:
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: urllib2.URLError: "+url, "red")
except:
print "Unexpected error:", sys.exc_info()[0]
i = 1010
i = 1
numberOfSites = 1000
# crawl numberOfSites sites...
while(i <= numberOfSites):
url = getNextUrlToVisit()
print colored("("+str(i)+"/"+str(numberOfSites)+") currently visiting url: "+url, "blue")
try:
page = urllib2.urlopen(url, timeout = 5)
extractor.feed(page.read())
global visitedSites
visitedSites += 1
except:
print colored("("+str(i)+"/"+str(numberOfSites)+") can't read url: "+url, "red")
i += 1
extractor.close()
print "\n \n ==== robots.txt ===="
print "Visited Sites: "+str(visitedSites)
print "Prohibited by robots.txt: "+str(prohibitedSites)
## print table number hyperlinks per website ##
print "\n \n ==== numberHyperlink ===="
print "#Hyperlinks \t Website"
keys = numberHyperlink.keys()
keys.sort( lambda x,y: cmp(numberHyperlink[y], numberHyperlink[x]) ) # sort keys
for u in keys:
pass
print str(numberHyperlink[u])+"\t \t \t"+u
## print table number hyperlinks to page ##
print "\n \n ==== numberHyperlinksPerPage ===="
print "#HyperlinksToPage \t Website"
keys = numberHyperlinksPerPage.keys()
keys.sort( lambda x,y: cmp(numberHyperlinksPerPage[y], numberHyperlinksPerPage[x]) ) # sort keys
for u in keys:
pass
print str(numberHyperlinksPerPage[u])+"\t \t \t"+u
print "\n \n ==== url queue ===="
for u in extractor.urls:
pass
#print u
extractor.close()
print "\n \n ==== robots.txt ===="
print "prohibit by robots.txt: "+str(prohibitedSites)
## print table number hyperlinks per website ##
print "\n \n ==== numberHyperlink ===="
print "#Hyperlinks \t Website"
keys = numberHyperlink.keys()
keys.sort( lambda x,y: cmp(numberHyperlink[y], numberHyperlink[x]) ) # sort keys
i = 0
for u in keys:
pass
if i < 50:
print str(numberHyperlink[u])+"\t \t \t"+u
i += 1
## print table number hyperlinks to page ##
print "\n \n ==== numberHyperlinksPerPage ===="
print "#HyperlinksToPage \t Website"
keys = numberHyperlinksPerPage.keys()
keys.sort( lambda x,y: cmp(numberHyperlinksPerPage[y], numberHyperlinksPerPage[x]) ) # sort keys
i = 0
for u in keys:
pass
if i < 50:
print str(numberHyperlinksPerPage[u])+"\t \t \t"+u
i += 1
print "\n \n ==== url queue ===="
for u in extractor.urls:
pass
#print u
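
As a closing note, the top-50 tables above sort dictionary keys by value with a cmp lambda; an equivalent Python 2 formulation uses sorted() with a key function. print_top is an illustrative helper, not part of the commit.

def print_top(counts, label, limit=50):
    # counts: dict mapping a URL or host to its hyperlink count
    print "#" + label + " \t Website"
    for u in sorted(counts, key=counts.get, reverse=True)[:limit]:
        print str(counts[u]) + "\t \t \t" + u

# e.g. print_top(numberHyperlinksPerPage, "HyperlinksToPage")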