diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
index 96075f75..2fadb4e4 100644
--- a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
+++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
@@ -3,60 +3,92 @@ import random
 import robotparser
 from sgmllib import SGMLParser
 from urlparse import urlparse
+import time
 import sys
 from termcolor import colored, cprint
 
 '''
 TODO:
 - canonize urls -> canonize? slides?
-- server timeout -> safe crawled host, set timeout for crawled host
+- DONE with getNextUrlToVisit():
+  server timeout -> save crawled host, set timeout for crawled host
 - statistics -> http://www.ke.tu-darmstadt.de/lehre/ss13/web-mining/uebung2.html
 '''
 
 #some variables
+timeBetweenSameHost = 2 # minimum delay between two requests to the same host, in seconds
 visitedSites = 0
 prohibitedSites = 0
 visitedUrls = [] # safe already visited urls, so no url will be visited more than once
-robotsTxtResults = {}
+visitedHostsWithTimestamp = {} # save visited hosts with the timestamp of the last request
+robotsTxtResults = {} # cache robots.txt results (RobotFileParser or None) per robots.txt url
 
 def checkRobotsTxt(url):
     o = urlparse(url)
     robotsUrl = o.scheme+"://"+o.netloc+"/robots.txt"
-    rp = robotparser.RobotFileParser()
-    rp.set_url(robotsUrl)
+
+    if robotsUrl in robotsTxtResults:
+        rp = robotsTxtResults[robotsUrl]
+    else:
+        rp = robotparser.RobotFileParser()
+        rp.set_url(robotsUrl)
 
-    try:
-        rp.read()
-        deadLink = 0
-    except:
-        deadLink = 1
-    if deadLink:
-        return 1 # return true if robots.txt doesn't exist
+        try:
+            rp.read()
+            robotsTxtResults[robotsUrl] = rp
+        except:
+            robotsTxtResults[robotsUrl] = None # robots.txt doesn't exist
+
+    if robotsTxtResults[robotsUrl] == None:
+        return True # return true if robots.txt doesn't exist
     else:
         if rp.can_fetch("*", url):
             print "Checking robots.txt ("+robotsUrl+") \n "+colored("-> Allowed to visit :) "+url, "green")
             global visitedSites
             visitedSites += 1
-            return 1
+            return True
         else:
             print "Checking robots.txt ("+robotsUrl+") \n "+colored("-> Not allowed to visit :( "+url, "red")
             global prohibitedSites
             prohibitedSites += 1
-            return 0
+            return False
 
 ## TODO: canonical url not only check if url is valid. Transfer relative url to absolute one
 def canonicalUrl(url):
     o = urlparse(url)
-    if o.scheme=='http' and (o.geturl() not in extractor.urls) and not "pdf" in o.path:
-        return 1
+    if o.scheme=='http' and not "pdf" in o.path and not ".." in o.geturl():
+        return True
     else:
-        return 0
+        return False
 
+def getNextUrlToVisit():
+    url = random.choice(extractor.urls)
+    host = urlparse(url).netloc
+
+    ## check if url is blocked by robots.txt or was already visited ##
+    if not checkRobotsTxt(url) or url in visitedUrls:
+        extractor.urls.remove(url)
+        return getNextUrlToVisit()
+
+    ## enforce the timeout between two requests to the same host (2 seconds)
+    if host in visitedHostsWithTimestamp:
+        timestamp = visitedHostsWithTimestamp[host]
+        if (int(time.time()) - timestamp) >= timeBetweenSameHost:
+            visitedHostsWithTimestamp[host] = int(time.time())
+            visitedUrls.append(url)
+            return url
+        else:
+            print colored(" -> give Host ("+host+") a break", "red")
+            return getNextUrlToVisit()
+    else:
+        visitedHostsWithTimestamp[host] = int(time.time())
+        visitedUrls.append(url)
+        return url
+
 class URLLister(SGMLParser):
     def reset(self):
@@ -80,26 +112,15 @@ extractor.feed(page.read())
 i = 1
 numberOfSites = 1000
-lastHost = ""
-visitedHosts = []
 
 # crawl 100 sites...
 while(i <= numberOfSites):
-    # get random url from queue
-    url = random.choice(extractor.urls)
-
-    # check if lastHost == currentHost && robots.txt && already visited
-    if urlparse(url).netloc != lastHost and checkRobotsTxt(url) and url not in visitedUrls:
-        ## remove url from queue
-        extractor.urls.remove(url)
-        print colored("("+str(i)+"/"+str(numberOfSites)+") currently visiting url: "+url, "blue")
-        page = urllib.urlopen(url)
-        visitedUrls.append(url)
-        extractor.feed(page.read())
-        i = i + 1
-        lastHost = urlparse(url).netloc
-        #visitedHosts[urlparse(url).netloc] = 5
-
+    url = getNextUrlToVisit()
+    print colored("("+str(i)+"/"+str(numberOfSites)+") currently visiting url: "+url, "blue")
+    page = urllib.urlopen(url)
+    extractor.feed(page.read())
+    i += 1
+
 
 extractor.close()
 
 print "\n \n ==== robots.txt ===="
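
Note on the patch: when the chosen host is still inside its 2-second window, getNextUrlToVisit() recursively draws another random URL, which can recurse many times on a queue dominated by a single host. Below is a minimal, self-contained sketch of a blocking alternative that reuses the names from the commit; the helper waitForHost is hypothetical and not part of this change.

import time

timeBetweenSameHost = 2          # same politeness delay as in the commit, in seconds
visitedHostsWithTimestamp = {}   # host -> unix timestamp of the last request

# Hypothetical helper: block until `host` may be fetched again,
# then record the new request timestamp.
def waitForHost(host):
    if host in visitedHostsWithTimestamp:
        elapsed = time.time() - visitedHostsWithTimestamp[host]
        if elapsed < timeBetweenSameHost:
            time.sleep(timeBetweenSameHost - elapsed)  # wait out the remaining interval
    visitedHostsWithTimestamp[host] = int(time.time())

Calling waitForHost(urlparse(url).netloc) right before urllib.urlopen(url) would enforce the per-host delay without the recursion, at the price of the crawler idling instead of trying a different host in the meantime.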