From a363992a150abe79275c3d226938952c21d95ddf Mon Sep 17 00:00:00 2001 From: rylon Date: Sun, 19 May 2013 04:31:16 +0200 Subject: [PATCH] webmining u2: url kanonisierung -> erkennen relative urls --- .../Uebungen/2_Uebung/crawler/crawler.py | 13 ++++++++++++- .../Uebungen/2_Uebung/crawler/termcolor.pyc | Bin 3777 -> 3753 bytes 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py index 8f36cd28..5a664099 100644 --- a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py +++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py @@ -8,6 +8,7 @@ import math from sgmllib import SGMLParser import sgmllib from urlparse import urlparse +from urlparse import urljoin import time from termcolor import colored from collections import Counter @@ -47,6 +48,8 @@ numberHyperlinksPerPage = {} # safe number of hyperlinks per page visitedHostsWithTimestamp = {} # safe visited hosts with timestamp robotsTxtResults = {} # safe robots.txt +lasthost = '' #last host + def normalizeMap(m): s = sum(m.values()) @@ -105,9 +108,14 @@ def blockedByRobotsTxt(url): ## TODO: canonical url not only check if url is valid. Transfer relative url to absolute one def canonicalUrl(url): + global lasthost url = url.lower().replace(" ", "") o = urlparse(url) + + if o.netloc != '': + lasthost = o.scheme + '://' + o.netloc + if o.scheme=='http' and not "pdf" in o.path and not ".." in o.geturl(): if ".html" in o.path: return [url] @@ -115,7 +123,10 @@ def canonicalUrl(url): return [url] return [] else: - return [] + if o.scheme=='': + return [urljoin(lasthost,o.path)] + else: + return [] def getNextUrlToVisit(): diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.pyc b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.pyc index 6a83f8f3d7730ff4d6a8bf53115e9a5797bb6700..505116bbafe7c43c31e913dcc60a2e54400ba669 100644 GIT binary patch delta 98 zcmX>oyHb{&`7xkI9tVp7N-^!$Ao1jm*f}4gyy6vxM!A3-plDS ac`wIXd}=oD<$T19PXW*n=gD7rw*Ub5peq&t