webmining u2: url kanonisierung -> erkennen relative urls

This commit is contained in:
Ulf Gebhardt 2013-05-19 04:31:16 +02:00
parent 6511d9a07b
commit a363992a15
2 changed files with 12 additions and 1 deletions

View File

@ -8,6 +8,7 @@ import math
from sgmllib import SGMLParser
import sgmllib
from urlparse import urlparse
from urlparse import urljoin
import time
from termcolor import colored
from collections import Counter
@ -47,6 +48,8 @@ numberHyperlinksPerPage = {} # safe number of hyperlinks per page
visitedHostsWithTimestamp = {} # safe visited hosts with timestamp
robotsTxtResults = {} # safe robots.txt
lasthost = '' #last host
def normalizeMap(m):
s = sum(m.values())
@ -105,9 +108,14 @@ def blockedByRobotsTxt(url):
## TODO: canonicalize the URL instead of only checking whether it is valid; convert relative URLs to absolute ones
def canonicalUrl(url):
global lasthost
url = url.lower().replace(" ", "")
o = urlparse(url)
if o.netloc != '':
lasthost = o.scheme + '://' + o.netloc
if o.scheme=='http' and not "pdf" in o.path and not ".." in o.geturl():
if ".html" in o.path:
return [url]
@ -115,7 +123,10 @@ def canonicalUrl(url):
return [url]
return []
else:
return []
if o.scheme=='':
return [urljoin(lasthost,o.path)]
else:
return []
def getNextUrlToVisit():