webmining u2: url kanonisierung -> erkennen relative urls

2013-05-19 04:31:16 +02:00 · 2013-05-19 04:31:16 +02:00 · a363992a15
commit a363992a15
parent 6511d9a07b
2 changed files with 12 additions and 1 deletions
--- a/Mining/Uebungen/2_Uebung/crawler/crawler.py
+++ b/Mining/Uebungen/2_Uebung/crawler/crawler.py
@ -8,6 +8,7 @@ import math
 from sgmllib import SGMLParser
 import sgmllib
 from urlparse import urlparse
+from urlparse import urljoin
 import time
 from termcolor import colored
 from collections import Counter
@ -47,6 +48,8 @@ numberHyperlinksPerPage = {} # safe number of hyperlinks per page
 visitedHostsWithTimestamp = {} # safe visited hosts with timestamp
 robotsTxtResults = {} # safe robots.txt

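+lasthost = '' # scheme://netloc of the most recent URL with a host seen by canonicalUrl; used as the base when joining relative URLs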
+
 def normalizeMap(m):
    s = sum(m.values())
 	
@ -105,9 +108,14 @@ def blockedByRobotsTxt(url):

 ## TODO: canonical url not only check if url is valid. Transfer relative url to absolute one
 def canonicalUrl(url):
+    global lasthost
    url = url.lower().replace(" ", "")

    o = urlparse(url)
+    
+    if o.netloc != '':
+        lasthost = o.scheme + '://' + o.netloc
+        
    if o.scheme=='http' and not "pdf" in o.path and not ".." in o.geturl():
        if ".html" in o.path:
            return [url]
@ -115,7 +123,10 @@ def canonicalUrl(url):
            return [url]
        return []
    else:
-        return []
+        if o.scheme=='':
+            return [urljoin(lasthost,o.path)]
+        else:
+            return []
        
    
 def getNextUrlToVisit():
--- a/Mining/Uebungen/2_Uebung/crawler/termcolor.pyc
+++ b/Mining/Uebungen/2_Uebung/crawler/termcolor.pyc