diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler.py b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler.py
index 2faa2d2a..939d4f3f 100644
--- a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler.py
+++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler.py
@@ -34,15 +34,22 @@
 extractor = URLLister()
 extractor.feed(page.read())
 
 i = 1
+numberOfSites = 1000
+lastHost = ""
-# crawl 100 sites...
-while(i <= 100):
+# crawl 1000 sites...
+while(i <= numberOfSites):
+    # get random url from queue
     url = random.choice(extractor.urls)
-    ## remove url from queue
-    extractor.urls.remove(url)
-    print "currently visited url: "+url
-    page = urllib.urlopen(url)
-    extractor.feed(page.read())
-    i = i + 1
+
+    # check if lastHost == currentHost
+    if urlparse(url).netloc != urlparse(lastHost).netloc:
+        ## remove url from queue
+        extractor.urls.remove(url)
+        print "("+str(i)+"/"+str(numberOfSites)+") currently visited url: "+url
+        page = urllib.urlopen(url)
+        extractor.feed(page.read())
+        i = i + 1
+        lastHost = url
 
 extractor.close()
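
For context, here is a minimal, self-contained sketch of the same crawling loop ported to Python 3. This is not the exercise's actual code: the course's URLLister class (presumably built on the Python 2 sgmllib module) is not shown in the diff, so the html.parser reimplementation below, the feed_page helper, and the seed URL are assumptions made for illustration. The sketch also stores the netloc in last_host directly instead of re-parsing the previously visited URL on every comparison, which is equivalent to the patch's urlparse(lastHost).netloc.

import random
import urllib.request
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse

class URLLister(HTMLParser):
    """Hypothetical stand-in: collects absolute link targets from <a> tags."""
    def __init__(self):
        super().__init__()
        self.urls = []
        self._base = ""

    def feed_page(self, url):
        # fetch a page and extract its links, resolved against that page's URL
        self._base = url
        with urllib.request.urlopen(url) as page:
            self.feed(page.read().decode("utf-8", errors="replace"))

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.urls.append(urljoin(self._base, value))

start_url = "http://example.com/"  # assumption: any seed page with outgoing links
number_of_sites = 1000
last_host = ""

extractor = URLLister()
extractor.feed_page(start_url)

i = 1
while i <= number_of_sites and extractor.urls:
    # get random url from queue
    url = random.choice(extractor.urls)

    # only visit if it points to a different host than the previous visit
    if urlparse(url).netloc != last_host:
        extractor.urls.remove(url)  # remove url from queue
        print("(%d/%d) currently visited url: %s" % (i, number_of_sites, url))
        try:
            extractor.feed_page(url)
        except (OSError, ValueError):
            continue  # skip unreachable pages and non-HTTP schemes
        i += 1
        last_host = urlparse(url).netloc

extractor.close()

One caveat carried over from the patch: a URL whose host matches the previous visit is left in the queue rather than discarded, so if only same-host links remain the loop keeps redrawing them without making progress; a visited-set or per-host queue would make termination more robust.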