webmining u2: url kanonisierung -> erkennen relative urls
This commit is contained in:
parent
6511d9a07b
commit
a363992a15
@ -8,6 +8,7 @@ import math
|
||||
from sgmllib import SGMLParser
|
||||
import sgmllib
|
||||
from urlparse import urlparse
|
||||
from urlparse import urljoin
|
||||
import time
|
||||
from termcolor import colored
|
||||
from collections import Counter
|
||||
@ -47,6 +48,8 @@ numberHyperlinksPerPage = {} # safe number of hyperlinks per page
|
||||
visitedHostsWithTimestamp = {} # save visited hosts with timestamp
|
||||
robotsTxtResults = {} # save robots.txt
|
||||
|
||||
lasthost = '' #last host
|
||||
|
||||
def normalizeMap(m):
|
||||
s = sum(m.values())
|
||||
|
||||
@ -105,9 +108,14 @@ def blockedByRobotsTxt(url):
|
||||
|
||||
## TODO: canonicalUrl should not only check whether the url is valid; it should also convert a relative url into an absolute one
|
||||
def canonicalUrl(url):
|
||||
global lasthost
|
||||
url = url.lower().replace(" ", "")
|
||||
|
||||
o = urlparse(url)
|
||||
|
||||
if o.netloc != '':
|
||||
lasthost = o.scheme + '://' + o.netloc
|
||||
|
||||
if o.scheme=='http' and not "pdf" in o.path and not ".." in o.geturl():
|
||||
if ".html" in o.path:
|
||||
return [url]
|
||||
@ -115,7 +123,10 @@ def canonicalUrl(url):
|
||||
return [url]
|
||||
return []
|
||||
else:
|
||||
return []
|
||||
if o.scheme=='':
|
||||
return [urljoin(lasthost,o.path)]
|
||||
else:
|
||||
return []
|
||||
|
||||
|
||||
def getNextUrlToVisit():
|
||||
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user