From 55935e7b22176aa48be12a850afa1210865a213d Mon Sep 17 00:00:00 2001 From: rylon Date: Sun, 19 May 2013 04:57:37 +0200 Subject: [PATCH] missing comment --- ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py index 5a664099..fdf63ed7 100644 --- a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py +++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py @@ -19,7 +19,7 @@ VN: - Sprachprüfer fertig TODO: - - canonize urls -> canonize? slides? + - DONE canonize urls -> canonize? slides? -> remember last host -> no magic here -> even using ugly global - DONE with getNextUrlToVisit(): server timeout -> safe crawled host, set timeout for crawled host - statistics -> http://www.ke.tu-darmstadt.de/lehre/ss13/web-mining/uebung2.html @@ -106,7 +106,6 @@ def blockedByRobotsTxt(url): prohibitedSites += 1 return True -## TODO: canonical url not only check if url is valid. Transfer relative url to absolute one def canonicalUrl(url): global lasthost url = url.lower().replace(" ", "")