crawler update: first statistics + some fixes

Michael Scholz 2013-05-14 18:39:03 +02:00
parent c95757f693
commit 2e6037954b


@@ -22,6 +22,9 @@ visitedSites = 0
prohibitedSites = 0
visitedUrls = [] # save already visited urls, so no url will be visited more than once
numberHyperlink = {} # save number of hyperlinks...
numberHyperlinksPerPage = {} # save number of hyperlinks per page
visitedHostsWithTimestamp = {} # save visited hosts with timestamp
robotsTxtResults = {} # save robots.txt
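The new counters are plain dicts updated with an explicit if/else in the hunks below; a minimal sketch of the same bookkeeping using collections.defaultdict, which starts every missing key at 0 (an alternative style, not what this commit uses; the countUrl helper is illustrative):

    # sketch only: defaultdict(int) removes the need for an 'if key in dict' branch
    from collections import defaultdict

    numberHyperlink = defaultdict(int)          # times a url was drawn from the queue
    numberHyperlinksPerPage = defaultdict(int)  # hyperlinks pointing to each host
    visitedHostsWithTimestamp = {}              # host -> unix timestamp of the last visit

    def countUrl(url):
        numberHyperlink[url] += 1               # missing keys start at 0 automatically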
@@ -46,12 +49,12 @@ def checkRobotsTxt(url):
return True # return true if robots.txt doesn't exist
else:
if rp.can_fetch("*", url):
print "Checking robots.txt ("+robotsUrl+") \n "+colored("-> Allowed to visit :) "+url, "green")
print "checking robots.txt ("+robotsUrl+") \n "+colored("-> allowed to visit :) "+url, "green")
global visitedSites
visitedSites += 1
return True
else:
print "Checking robots.txt ("+robotsUrl+") \n "+colored("-> Not allowed to visit :( "+url, "red")
print "checking robots.txt ("+robotsUrl+") \n "+colored("-> not allowed to visit :( "+url, "red")
global prohibitedSites
prohibitedSites += 1
return False
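checkRobotsTxt builds on the stdlib robotparser module; for context, a self-contained Python 2 sketch of the same idea, caching one parser per host in a dict like robotsTxtResults (the helper name and the exact cache layout are assumptions, not taken from the diff):

    import robotparser
    from urlparse import urlparse, urljoin

    robotsTxtCache = {}  # host -> RobotFileParser (assumed cache layout)

    def allowedByRobots(url, userAgent="*"):
        host = urlparse(url).netloc
        if host not in robotsTxtCache:
            rp = robotparser.RobotFileParser()
            rp.set_url(urljoin("http://" + host, "/robots.txt"))
            try:
                rp.read()                    # fetch and parse robots.txt
            except IOError:
                return True                  # unreadable robots.txt -> treat as allowed
            robotsTxtCache[host] = rp
        return robotsTxtCache[host].can_fetch(userAgent, url)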
@@ -67,6 +70,13 @@ def canonicalUrl(url):
def getNextUrlToVisit():
url = random.choice(extractor.urls)
if url in numberHyperlink:
numberHyperlink[url] += 1
else:
numberHyperlink[url] = 1
host = urlparse(url).netloc
## check if url is blocked by robots.txt or was already visited ##
@@ -77,12 +87,14 @@ def getNextUrlToVisit():
## enforce a minimum delay between visits to the same host (2 seconds)
if host in visitedHostsWithTimestamp:
timestamp = visitedHostsWithTimestamp[host]
if (int(time.time()) - timestamp) < timeBetweenSameHost:
secondsSinceLastVisit = int(time.time()) - timestamp
if secondsSinceLastVisit > timeBetweenSameHost:
visitedHostsWithTimestamp[host] = int(time.time())
visitedUrls.append(url)
return url
else:
print colored(" -> give Host ("+host+") a break", "red")
secondsToWait = timeBetweenSameHost - secondsSinceLastVisit
print colored(" -> give host ("+host+") a break (wait at least: "+str(secondsToWait)+" seconds)", "red")
return getNextUrlToVisit()
else:
visitedHostsWithTimestamp[host] = int(time.time())
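The fixed delay check compares the seconds since the last visit against timeBetweenSameHost; isolated as a standalone helper, the logic looks like this (function name and signature are illustrative, not part of the commit):

    import time

    timeBetweenSameHost = 2  # seconds, as in the comment above

    def hostReady(host, lastVisitByHost, minDelay=timeBetweenSameHost):
        # return True and record the visit if the host was not contacted
        # within the last minDelay seconds
        now = int(time.time())
        last = lastVisitByHost.get(host)
        if last is None or now - last > minDelay:
            lastVisitByHost[host] = now
            return True
        return False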
@@ -100,7 +112,15 @@ class URLLister(SGMLParser):
href = [v for k, v in attrs if k=='href']
if href:
if canonicalUrl(href[0]):
self.urls.append(href[0])
self.urls.append(href[0])
# count number of hyperlinks pointing to each host
if urlparse(href[0]).netloc in numberHyperlinksPerPage:
numberHyperlinksPerPage[urlparse(href[0]).netloc] += 1
else:
numberHyperlinksPerPage[urlparse(href[0]).netloc] = 1
startsite = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
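URLLister follows the classic sgmllib pattern: reset initializes the url list and start_a fires on every <a> tag. A stripped-down Python 2 sketch of such a link collector, without the statistics bookkeeping added in this commit:

    from sgmllib import SGMLParser

    class LinkLister(SGMLParser):
        # collects the href attribute of every <a> tag fed to the parser
        def reset(self):
            SGMLParser.reset(self)
            self.urls = []

        def start_a(self, attrs):
            href = [v for k, v in attrs if k == 'href']
            if href:
                self.urls.append(href[0])

    # usage: lister = LinkLister(); lister.feed(html); lister.close(); print lister.urls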
@@ -116,8 +136,11 @@ numberOfSites = 1000
while(i <= numberOfSites):
url = getNextUrlToVisit()
print colored("("+str(i)+"/"+str(numberOfSites)+") currently visiting url: "+url, "blue")
page = urllib.urlopen(url)
extractor.feed(page.read())
try:
page = urllib.urlopen(url)
extractor.feed(page.read())
except:
print colored("("+str(i)+"/"+str(numberOfSites)+") can't read url: "+url, "red")
i += 1
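The new try/except keeps the loop alive when a page cannot be fetched; a sketch of the same guard that narrows the bare except to IOError, which urllib.urlopen raises on connection failures (catching only IOError is a design choice of this sketch, not what the commit does):

    import urllib

    def fetchPage(url):
        # return the page body, or None if the url cannot be read
        try:
            page = urllib.urlopen(url)
            return page.read()
        except IOError, e:       # urllib signals connection errors as IOError
            print "can't read url: " + url + " (" + str(e) + ")"
            return None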
@@ -127,6 +150,29 @@ print "\n \n ==== robots.txt ===="
print "Visited Sites: "+str(visitedSites)
print "Prohibited by robots.txt: "+str(prohibitedSites)
## print table: number of hyperlinks per website ##
print "\n \n ==== numberHyperlink ===="
print "#Hyperlinks \t Website"
keys = numberHyperlink.keys()
keys.sort( lambda x,y: cmp(numberHyperlink[y], numberHyperlink[x]) ) # sort keys descending by count
for u in keys:
print str(numberHyperlink[u])+"\t \t \t"+u
## print table: number of hyperlinks to each page ##
print "\n \n ==== numberHyperlinksPerPage ===="
print "#HyperlinksToPage \t Website"
keys = numberHyperlinksPerPage.keys()
keys.sort( lambda x,y: cmp(numberHyperlinksPerPage[y], numberHyperlinksPerPage[x]) ) # sort keys descending by count
for u in keys:
print str(numberHyperlinksPerPage[u])+"\t \t \t"+u
print "\n \n ==== url queue ===="
for u in extractor.urls:
pass
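The cmp-based keys.sort calls above can also be written with sorted() and a key function; a sketch of the same descending order as a small helper that works for either counter dict (the helper name is illustrative):

    def printCounterTable(counter, headline):
        # print a counter dict as a two-column table, largest count first
        print "\n \n ==== " + headline + " ===="
        print "#Count \t Website"
        for site, count in sorted(counter.items(), key=lambda item: item[1], reverse=True):
            print str(count) + "\t \t \t" + site

    # usage: printCounterTable(numberHyperlinksPerPage, "numberHyperlinksPerPage")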