crawler update: robots.txt, give each host a 2-second break
This commit is contained in:
parent
a7a937d205
commit
c95757f693
@@ -3,60 +3,92 @@ import random
 import robotparser
 from sgmllib import SGMLParser
 from urlparse import urlparse
 import time
 import sys
 from termcolor import colored, cprint

 '''
 TODO:
 - canonize urls -> canonize? slides?
-- server timeout -> save crawled host, set timeout for crawled host
+- DONE with getNextUrlToVisit():
+  server timeout -> save crawled host, set timeout for crawled host
 - statistics -> http://www.ke.tu-darmstadt.de/lehre/ss13/web-mining/uebung2.html

 '''

 # some variables
+timeBetweenSameHost = 2  # 2 sec
 visitedSites = 0
 prohibitedSites = 0
 visitedUrls = []  # save already visited urls, so no url will be visited more than once

-robotsTxtResults = {}
+visitedHostsWithTimestamp = {}  # save visited hosts with timestamp
+robotsTxtResults = {}  # save fetched robots.txt parsers

 def checkRobotsTxt(url):

     o = urlparse(url)
     robotsUrl = o.scheme+"://"+o.netloc+"/robots.txt"
-    rp = robotparser.RobotFileParser()
-    rp.set_url(robotsUrl)

+    # reuse the cached parser if this host's robots.txt was already fetched
+    if robotsUrl in robotsTxtResults:
+        rp = robotsTxtResults[robotsUrl]
+    else:
+        rp = robotparser.RobotFileParser()
+        rp.set_url(robotsUrl)

-    try:
-        rp.read()
-        deadLink = 0
-    except:
-        deadLink = 1
-    if deadLink:
-        return 1 # return true if robots.txt doesn't exist
+        try:
+            rp.read()
+            robotsTxtResults[robotsUrl] = rp
+        except:
+            robotsTxtResults[robotsUrl] = None # robots.txt doesn't exist

+    if robotsTxtResults[robotsUrl] == None:
+        return True # allow the visit if robots.txt doesn't exist
+    else:
         if rp.can_fetch("*", url):
             print "Checking robots.txt ("+robotsUrl+") \n "+colored("-> Allowed to visit :) "+url, "green")
             global visitedSites
             visitedSites += 1
-            return 1
+            return True
         else:
             print "Checking robots.txt ("+robotsUrl+") \n "+colored("-> Not allowed to visit :( "+url, "red")
             global prohibitedSites
             prohibitedSites += 1
-            return 0
+            return False

 ## TODO: canonicalUrl should not only check whether the url is valid, but also turn relative urls into absolute ones
 def canonicalUrl(url):
     o = urlparse(url)
-    if o.scheme=='http' and (o.geturl() not in extractor.urls) and not "pdf" in o.path:
-        return 1
+    if o.scheme=='http' and not "pdf" in o.path and not ".." in o.geturl():
+        return True
     else:
-        return 0
+        return False


+def getNextUrlToVisit():
+    url = random.choice(extractor.urls)
+    host = urlparse(url).netloc
+
+    ## check if url is blocked by robots.txt or was already visited ##
+    if not checkRobotsTxt(url) or url in visitedUrls:
+        extractor.urls.remove(url)
+        return getNextUrlToVisit()
+
+    ## check if the host was visited less than timeBetweenSameHost (2 seconds) ago
+    if host in visitedHostsWithTimestamp:
+        timestamp = visitedHostsWithTimestamp[host]
+        # only revisit the host once at least timeBetweenSameHost seconds have passed
+        if (int(time.time()) - timestamp) >= timeBetweenSameHost:
+            visitedHostsWithTimestamp[host] = int(time.time())
+            visitedUrls.append(url)
+            return url
+        else:
+            print colored(" -> give Host ("+host+") a break", "red")
+            return getNextUrlToVisit()
+    else:
+        visitedHostsWithTimestamp[host] = int(time.time())
+        visitedUrls.append(url)
+        return url

 class URLLister(SGMLParser):
     def reset(self):
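A minimal standalone sketch of the robots.txt handling that checkRobotsTxt implements above: one RobotFileParser per host, cached in a dict, with an unreadable robots.txt treated as "allowed". The names ROBOTS_CACHE and allowed_by_robots are hypothetical and not part of this commit; Python 2 is assumed, like the rest of the file.

import robotparser
from urlparse import urlparse

ROBOTS_CACHE = {}  # robots.txt URL -> RobotFileParser, or None if it could not be read

def allowed_by_robots(url, agent="*"):
    # build the robots.txt URL for the host of `url`
    parts = urlparse(url)
    robots_url = parts.scheme + "://" + parts.netloc + "/robots.txt"

    if robots_url not in ROBOTS_CACHE:
        rp = robotparser.RobotFileParser()
        rp.set_url(robots_url)
        try:
            rp.read()                        # fetch and parse robots.txt once per host
            ROBOTS_CACHE[robots_url] = rp
        except IOError:
            ROBOTS_CACHE[robots_url] = None  # could not be fetched -> treat as allowed

    rp = ROBOTS_CACHE[robots_url]
    return True if rp is None else rp.can_fetch(agent, url)

Compared to checkRobotsTxt, this sketch keys the cache on the robots.txt URL in both the lookup and the store, and it leaves out the visited/prohibited counters.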
@@ -80,26 +112,15 @@ extractor.feed(page.read())

 i = 1
 numberOfSites = 1000
-lastHost = ""
-visitedHosts = []
 # crawl numberOfSites sites...
 while(i <= numberOfSites):
-    # get random url from queue
-    url = random.choice(extractor.urls)
-
-    # check if lastHost == currentHost && robots.txt && already visited
-    if urlparse(url).netloc != lastHost and checkRobotsTxt(url) and url not in visitedUrls:
-        ## remove url from queue
-        extractor.urls.remove(url)
-        print colored("("+str(i)+"/"+str(numberOfSites)+") currently visiting url: "+url, "blue")
-        page = urllib.urlopen(url)
-        visitedUrls.append(url)
-        extractor.feed(page.read())
-        i = i + 1
-        lastHost = urlparse(url).netloc
-        #visitedHosts[urlparse(url).netloc] = 5
+    url = getNextUrlToVisit()
+    print colored("("+str(i)+"/"+str(numberOfSites)+") currently visiting url: "+url, "blue")
+    page = urllib.urlopen(url)
+    extractor.feed(page.read())
+    i += 1


 extractor.close()

 print "\n \n ==== robots.txt ===="
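The per-host pause that getNextUrlToVisit enforces by re-drawing a different URL can also be sketched as a blocking helper that simply sleeps until timeBetweenSameHost seconds have passed for that host. This is an alternative illustration, not the approach taken in this commit; wait_for_host and lastVisitPerHost are hypothetical names, and Python 2 is assumed as above.

import time
from urlparse import urlparse

timeBetweenSameHost = 2   # seconds between two requests to the same host
lastVisitPerHost = {}     # host -> timestamp of the last request

def wait_for_host(url):
    # block until at least timeBetweenSameHost seconds have passed since the last visit to this host
    host = urlparse(url).netloc
    last = lastVisitPerHost.get(host)
    if last is not None:
        elapsed = time.time() - last
        if elapsed < timeBetweenSameHost:
            time.sleep(timeBetweenSameHost - elapsed)  # give the host a break
    lastVisitPerHost[host] = time.time()

In the main loop such a helper would be called right before urllib.urlopen(url), so every host is hit at most once every two seconds regardless of how the next URL is chosen.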