From a64659eb1b2d9270b3c66af2123d60ca892135f5 Mon Sep 17 00:00:00 2001
From: Michael Scholz
Date: Wed, 15 May 2013 15:27:09 +0200
Subject: [PATCH] update crawler

---
 .../Uebungen/2_Uebung/crawler/crawler.py | 185 ++++++++++--------
 1 file changed, 105 insertions(+), 80 deletions(-)

diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
index ef142433..9cfc0967 100644
--- a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
+++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
@@ -1,4 +1,5 @@
-import urllib2
+import urllib, urllib2
+import sys
 import random
 import robotparser
 from sgmllib import SGMLParser
@@ -15,12 +16,20 @@ TODO:
 '''
 
+# crawler attributes
+entrypoint = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
+entrypoint = "http://www.spiegel.de"
+#entrypoint = "http://www.buchaktuell.de/"
+numberOfPagesToCrawl = 1000
+timeBetweenSameHost = 5 # 5 sec
+
+
 #some variables
-timeBetweenSameHost = 2 # 2 sec
-visitedSites = 0
 prohibitedSites = 0
 visitedUrls = [] # safe already visited urls, so no url will be visited more than once
 
+pages = {} # downloaded pages
+
 numberHyperlink = {} # safe number of hyperlinks...
 numberHyperlinksPerPage = {} # safe number of hyperlinks per page
 
@@ -28,7 +37,7 @@ visitedHostsWithTimestamp = {} # safe visited hosts with timestamp
 robotsTxtResults = {} # safe robots.txt
 
 
-def checkRobotsTxt(url):
+def blockedByRobotsTxt(url):
 
     o = urlparse(url)
     robotsUrl = o.scheme+"://"+o.netloc+"/robots.txt"
@@ -45,28 +54,29 @@
             robotsTxtResults[robotsUrl] = None # robots.txt doesn't exist
 
     if robotsTxtResults[robotsUrl] == None:
-        return True # return true if robots.txt doesn't exist
+        return False # return false if robots.txt doesn't exist
     else:
         if rp.can_fetch("*", url):
-            print "checking robots.txt ("+robotsUrl+") \n "+colored("-> allowed to visit :) "+url, "green")
-            return True
+            return False
         else:
-            print "checking robots.txt ("+robotsUrl+") \n "+colored("-> not allowed to visit :( "+url, "red")
+            print colored("-> not allowed to visit :( "+url, "red")
             global prohibitedSites
            prohibitedSites += 1
-            return False
+            return True
 
 
 ## TODO: canonical url not only check if url is valid. Transfer relative url to absolute one
 def canonicalUrl(url):
+    url = url.lower().replace(" ", "")
+
     o = urlparse(url)
     if o.scheme=='http' and not "pdf" in o.path and not ".." in o.geturl():
         if ".html" in o.path:
-            return True
+            return [url]
         if "." not in o.path:
-            return True
-        return False
+            return [url]
+        return []
     else:
-        return False
+        return []
 
 
 def getNextUrlToVisit():
@@ -81,25 +91,28 @@
     host = urlparse(url).netloc
 
     ## check if url is blocked by robots.txt or was already visited ##
-    if not checkRobotsTxt(url) or url in visitedUrls:
+    if blockedByRobotsTxt(url) or url in visitedUrls:
+        print str(len(extractor.urls))
         extractor.urls.remove(url)
         return getNextUrlToVisit()
 
-    ## check if host got a timeout (2 seconds)
+    ## check if host got a timeout
     if host in visitedHostsWithTimestamp:
         timestamp = visitedHostsWithTimestamp[host]
         secondsSinceLastVisit = int(time.time()) - timestamp
-        if secondsSinceLastVisit > timeBetweenSameHost:
+        if secondsSinceLastVisit >= timeBetweenSameHost:
             visitedHostsWithTimestamp[host] = int(time.time())
             visitedUrls.append(url)
+            extractor.urls.remove(url)
             return url
         else:
            secondsToWait = timeBetweenSameHost - secondsSinceLastVisit
-            print colored(" -> give host ("+host+") a break (wait at least: "+str(secondsToWait)+" seconds)", "red")
+            print colored(" -> give host ("+host+") a break (wait at least: "+str(secondsToWait)+" seconds)", "red")
             return getNextUrlToVisit()
     else:
         visitedHostsWithTimestamp[host] = int(time.time())
         visitedUrls.append(url)
+        extractor.urls.remove(url)
         return url
@@ -112,71 +125,83 @@ class URLLister(SGMLParser):
         href = [v for k, v in attrs if k=='href']
 
         if href:
-            if canonicalUrl(href[0]):
-                self.urls.append(href[0])
-
-                # count number of links on actual site
-                if urlparse(href[0]).netloc in numberHyperlinksPerPage:
-                    numberHyperlinksPerPage[urlparse(href[0]).netloc] += 1
-                else:
-                    numberHyperlinksPerPage[urlparse(href[0]).netloc] = 1
+            url = canonicalUrl(href[0])
+            self.urls.extend(url)
+
+            # count number of links on actual site
+            if href[0] in numberHyperlinksPerPage:
+                numberHyperlinksPerPage[href[0]] += 1
+            else:
+                numberHyperlinksPerPage[href[0]] = 1
 
 
+if __name__ == "__main__":
 
-startsite = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
-page = urllib2.urlopen(startsite, timeout = 5)
-print "currently visited url: "+startsite
-extractor = URLLister()
-extractor.feed(page.read())
+    page = urllib2.urlopen(entrypoint, timeout = 5)
+    print "currently visited url: "+entrypoint
+    extractor = URLLister()
+    extractor.feed(page.read())
+    page.close()
+
+
+    i = 1
+    while(i <= numberOfPagesToCrawl):
+        url = getNextUrlToVisit()
+        print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") currently visiting url: "+url, "blue")
+        try:
+            page = urllib2.urlopen(url, timeout = 6)
+            pageContent = page.read()
+            page.close()
+            extractor.feed(pageContent)
+            pages[url] = pageContent
+            i += 1
+        except urllib2.HTTPError, err:
+            if err.code == 404:
+                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: not found: "+url, "red")
+            if err.code == 400:
+                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: bad request: "+url, "red")
+            if err.code == 403:
+                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: forbidden: "+url, "red")
+        except urllib2.URLError:
+            print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: urllib2.URLError: "+url, "red")
+        except:
+            print "Unexpected error:", sys.exc_info()[0]
+
+            i = 1010
 
-
-i = 1
-numberOfSites = 1000
-# crawl 100 sites...
-while(i <= numberOfSites):
-    url = getNextUrlToVisit()
-    print colored("("+str(i)+"/"+str(numberOfSites)+") currently visiting url: "+url, "blue")
-    try:
-        page = urllib2.urlopen(url, timeout = 5)
-        extractor.feed(page.read())
-        global visitedSites
-        visitedSites += 1
-    except:
-        print colored("("+str(i)+"/"+str(numberOfSites)+") can't read url: "+url, "red")
-    i += 1
-
-
-extractor.close()
-
-print "\n \n ==== robots.txt ===="
-print "Visited Sites: "+str(visitedSites)
-print "Prohibited by robots.txt: "+str(prohibitedSites)
-
-
-## print table number hyperlinks per website ##
-print "\n \n ==== numberHyperlink ===="
-print "#Hyperlinks \t Website"
-keys = numberHyperlink.keys()
-keys.sort( lambda x,y: cmp(numberHyperlink[y], numberHyperlink[x]) ) # sort keys
-
-for u in keys:
-    pass
-    print str(numberHyperlink[u])+"\t \t \t"+u
-
-
-## print table number hyperlinks to page ##
-print "\n \n ==== numberHyperlinksPerPage ===="
-print "#HyperlinksToPage \t Website"
-keys = numberHyperlinksPerPage.keys()
-keys.sort( lambda x,y: cmp(numberHyperlinksPerPage[y], numberHyperlinksPerPage[x]) ) # sort keys
-
-for u in keys:
-    pass
-    print str(numberHyperlinksPerPage[u])+"\t \t \t"+u
-
-
-print "\n \n ==== url queue ===="
-for u in extractor.urls:
-    pass
-    #print u
\ No newline at end of file
+
+    extractor.close()
+
+    print "\n \n ==== robots.txt ===="
+    print "prohibit by robots.txt: "+str(prohibitedSites)
+
+
+    ## print table number hyperlinks per website ##
+    print "\n \n ==== numberHyperlink ===="
+    print "#Hyperlinks \t Website"
+    keys = numberHyperlink.keys()
+    keys.sort( lambda x,y: cmp(numberHyperlink[y], numberHyperlink[x]) ) # sort keys
+    i = 0
+    for u in keys:
+        pass
+        if i < 50:
+            print str(numberHyperlink[u])+"\t \t \t"+u
+            i += 1
+
+    ## print table number hyperlinks to page ##
+    print "\n \n ==== numberHyperlinksPerPage ===="
+    print "#HyperlinksToPage \t Website"
+    keys = numberHyperlinksPerPage.keys()
+    keys.sort( lambda x,y: cmp(numberHyperlinksPerPage[y], numberHyperlinksPerPage[x]) ) # sort keys
+    i = 0
+    for u in keys:
+        pass
+        if i < 50:
+            print str(numberHyperlinksPerPage[u])+"\t \t \t"+u
+            i += 1
+
+    print "\n \n ==== url queue ===="
+    for u in extractor.urls:
+        pass
+        #print u
\ No newline at end of file
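
Note on the gating logic this patch introduces: a URL is only handed to the fetch loop when the cached robots.txt result for its host does not forbid it (blockedByRobotsTxt now answers "is this URL blocked?", so True means skip) and when at least timeBetweenSameHost seconds have passed since the last request to that host. The following is a minimal, self-contained Python 2 sketch of just that gate, kept apart from the crawler's globals; the names POLITENESS_DELAY, robots_cache, last_visit, is_blocked and may_visit_now are illustrative and do not occur in the patch.

    # Illustrative sketch only (not part of the patch): cached robots.txt check
    # plus a per-host politeness delay, mirroring blockedByRobotsTxt() and the
    # timestamp check in getNextUrlToVisit().
    import time
    import robotparser
    from urlparse import urlparse

    POLITENESS_DELAY = 5   # assumed value, mirrors timeBetweenSameHost
    robots_cache = {}      # one RobotFileParser (or None) per host
    last_visit = {}        # host -> unix timestamp of the last request

    def is_blocked(url):
        # True if robots.txt forbids fetching url for user agent "*"
        host = urlparse(url).netloc
        if host not in robots_cache:
            rp = robotparser.RobotFileParser()
            rp.set_url("http://" + host + "/robots.txt")
            try:
                rp.read()
            except Exception:
                rp = None  # unreachable robots.txt -> treat as "allow"
            robots_cache[host] = rp
        rp = robots_cache[host]
        return rp is not None and not rp.can_fetch("*", url)

    def may_visit_now(url):
        # True once POLITENESS_DELAY seconds have passed for this host;
        # records the visit so the next call starts a fresh waiting period
        host = urlparse(url).netloc
        now = int(time.time())
        if now - last_visit.get(host, 0) >= POLITENESS_DELAY:
            last_visit[host] = now
            return True
        return False

    if __name__ == "__main__":
        url = "http://www.spiegel.de/"
        if not is_blocked(url) and may_visit_now(url):
            print "would fetch: " + url

Caching per host rather than per robots.txt URL, as the patch does, is only to keep the sketch short; either way the point is to avoid re-downloading robots.txt for every candidate URL.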