From e81ddb5c3030c8e12aefc5462901031857803e17 Mon Sep 17 00:00:00 2001
From: Michael Scholz
Date: Mon, 13 May 2013 23:16:38 +0200
Subject: [PATCH] crawler update

---
 .../1_Web Mining/Uebungen/2_Uebung/crawler.py | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler.py b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler.py
index 2faa2d2a..939d4f3f 100644
--- a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler.py
+++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler.py
@@ -34,15 +34,22 @@ extractor = URLLister()
 extractor.feed(page.read())
 
 i = 1
+numberOfSites = 1000
+lastHost = ""
 
 # crawl 100 sites...
-while(i <= 100):
+while(i <= numberOfSites):
+    # get random url from queue
     url = random.choice(extractor.urls)
-    ## remove url from queue
-    extractor.urls.remove(url)
-    print "currently visited url: "+url
-    page = urllib.urlopen(url)
-    extractor.feed(page.read())
-    i = i + 1
+
+    # check if lastHost == currentHost
+    if urlparse(url).netloc != urlparse(lastHost).netloc:
+        ## remove url from queue
+        extractor.urls.remove(url)
+        print "("+str(i)+"/"+str(numberOfSites)+") currently visited url: "+url
+        page = urllib.urlopen(url)
+        extractor.feed(page.read())
+        i = i + 1
+        lastHost = url
 
 extractor.close()
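
Note on the change: the patch makes the crawler skip any candidate link whose host matches the host of the previously visited page, and raises the crawl target from 100 to 1000 (the "# crawl 100 sites..." comment is now stale). One caveat: when a same-host link is drawn, the patched loop removes nothing from the queue and simply retries, so it can spin indefinitely once only same-host links remain. Below is a minimal, self-contained sketch of the same host-alternation idea in Python 3, not the exercise's original code: this URLLister is a re-implementation on html.parser (the 2013 original presumably used Python 2's sgmllib), the seed URL is hypothetical, and the sketch deliberately removes every drawn link from the queue and stops on an empty queue to avoid the retry-forever case.

# Sketch of the patched crawl loop, under the assumptions named above.
import random
import urllib.request
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse


class URLLister(HTMLParser):
    """Stand-in for the exercise's URLLister: collects absolute
    http(s) targets of <a href="..."> tags into self.urls."""

    def __init__(self):
        super().__init__()
        self.urls = []
        self.base = ""  # resolved against relative hrefs

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    url = urljoin(self.base, value)
                    if urlparse(url).scheme in ("http", "https"):
                        self.urls.append(url)


def crawl(seed, number_of_sites=1000):
    extractor = URLLister()
    extractor.base = seed
    with urllib.request.urlopen(seed) as page:
        extractor.feed(page.read().decode("utf-8", errors="replace"))

    i = 1
    last_host = ""
    while i <= number_of_sites and extractor.urls:
        # get random url from queue
        url = random.choice(extractor.urls)
        # remove it unconditionally, so a same-host pick cannot be
        # drawn over and over (the patch removes only inside the if)
        extractor.urls.remove(url)

        # only visit if the host differs from the last visited one
        if urlparse(url).netloc != urlparse(last_host).netloc:
            print("(%d/%d) currently visited url: %s"
                  % (i, number_of_sites, url))
            try:
                with urllib.request.urlopen(url) as page:
                    extractor.base = url
                    extractor.feed(page.read().decode("utf-8",
                                                      errors="replace"))
            except (OSError, ValueError):
                continue  # unreachable or malformed url: draw another
            i = i + 1
            last_host = url

    extractor.close()


if __name__ == "__main__":
    crawl("http://example.com/")  # hypothetical seed url

Drawing a random link from the frontier and refusing two consecutive hits on the same host is only a crude politeness heuristic, which is presumably the point of the exercise; a production crawler would instead keep per-host queues with request delays and honor robots.txt.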