diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py index 8f36cd28..5a664099 100644 --- a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py +++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py @@ -8,6 +8,7 @@ import math from sgmllib import SGMLParser import sgmllib from urlparse import urlparse +from urlparse import urljoin import time from termcolor import colored from collections import Counter @@ -47,6 +48,8 @@ numberHyperlinksPerPage = {} # safe number of hyperlinks per page visitedHostsWithTimestamp = {} # safe visited hosts with timestamp robotsTxtResults = {} # safe robots.txt +lasthost = '' #last host + def normalizeMap(m): s = sum(m.values()) @@ -105,9 +108,14 @@ def blockedByRobotsTxt(url): ## TODO: canonical url not only check if url is valid. Transfer relative url to absolute one def canonicalUrl(url): + global lasthost url = url.lower().replace(" ", "") o = urlparse(url) + + if o.netloc != '': + lasthost = o.scheme + '://' + o.netloc + if o.scheme=='http' and not "pdf" in o.path and not ".." in o.geturl(): if ".html" in o.path: return [url] @@ -115,7 +123,10 @@ def canonicalUrl(url): return [url] return [] else: - return [] + if o.scheme=='': + return [urljoin(lasthost,o.path)] + else: + return [] def getNextUrlToVisit(): diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.pyc b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.pyc index 6a83f8f3..505116bb 100644 Binary files a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.pyc and b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.pyc differ