From a7a937d205540fb49b37841513a872f9ec817b92 Mon Sep 17 00:00:00 2001 From: Michael Scholz Date: Tue, 14 May 2013 11:24:22 +0200 Subject: [PATCH] crawler update --- .../1_Web Mining/Uebungen/2_Uebung/crawler.py | 60 ------- .../Uebungen/2_Uebung/crawler/crawler.py | 112 ++++++++++++ .../Uebungen/2_Uebung/crawler/termcolor.py | 168 ++++++++++++++++++ .../Uebungen/2_Uebung/crawler/termcolor.pyc | Bin 0 -> 3720 bytes 4 files changed, 280 insertions(+), 60 deletions(-) delete mode 100644 ss2013/1_Web Mining/Uebungen/2_Uebung/crawler.py create mode 100644 ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py create mode 100644 ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.py create mode 100644 ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.pyc diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler.py b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler.py deleted file mode 100644 index 939d4f3f..00000000 --- a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler.py +++ /dev/null @@ -1,60 +0,0 @@ -import urllib -import random -from sgmllib import SGMLParser -from urlparse import urlparse - -''' -TODO: - - canonize urls -> canonize? slides? - - server timeout -> safe crawled host, set timeout for crawled host - - statistics -> http://www.ke.tu-darmstadt.de/lehre/ss13/web-mining/uebung2.html - -''' - -class URLLister(SGMLParser): - def reset(self): - SGMLParser.reset(self) - self.urls = [] - - def start_a(self, attrs): - - href = [v for k, v in attrs if k=='href'] - if href: - # canonize url - o = urlparse(href[0]) - - if o.scheme=='http' and (o.geturl() not in self.urls) and not "pdf" in o.path: # only use absolute urls.... - self.urls.extend([o.geturl()]) - - -startsite = "http://www.ke.tu-darmstadt.de/lehre/arbeiten" -page = urllib.urlopen(startsite) -print "currently visited url: "+startsite -extractor = URLLister() -extractor.feed(page.read()) - -i = 1 -numberOfSites = 1000 -lastHost = "" -# crawl 100 sites... -while(i <= numberOfSites): - # get random url from queue - url = random.choice(extractor.urls) - - # check if lastHost == currentHost - if urlparse(url).netloc != urlparse(lastHost).netloc: - ## remove url from queue - extractor.urls.remove(url) - print "("+str(i)+"/"+str(numberOfSites)+") currently visited url: "+url - page = urllib.urlopen(url) - extractor.feed(page.read()) - i = i + 1 - lastHost = url - - -extractor.close() - -print "\n \n ==== url queue ====" -for u in extractor.urls: - pass - print u \ No newline at end of file diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py new file mode 100644 index 00000000..96075f75 --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py @@ -0,0 +1,112 @@ +import urllib +import random +import robotparser +from sgmllib import SGMLParser +from urlparse import urlparse +import sys +from termcolor import colored, cprint + +''' +TODO: + - canonize urls -> canonize? slides? + - server timeout -> safe crawled host, set timeout for crawled host + - statistics -> http://www.ke.tu-darmstadt.de/lehre/ss13/web-mining/uebung2.html + +''' + +#some variables +visitedSites = 0 +prohibitedSites = 0 +visitedUrls = [] # safe already visited urls, so no url will be visited more than once + +robotsTxtResults = {} + + +def checkRobotsTxt(url): + + o = urlparse(url) + robotsUrl = o.scheme+"://"+o.netloc+"/robots.txt" + rp = robotparser.RobotFileParser() + rp.set_url(robotsUrl) + + try: + rp.read() + deadLink = 0 + except: + deadLink = 1 + if deadLink: + return 1 # return true if robots.txt doesn't exist + else: + if rp.can_fetch("*", url): + print "Checking robots.txt ("+robotsUrl+") \n "+colored("-> Allowed to visit :) "+url, "green") + global visitedSites + visitedSites += 1 + return 1 + else: + print "Checking robots.txt ("+robotsUrl+") \n "+colored("-> Not allowed to visit :( "+url, "red") + global prohibitedSites + prohibitedSites += 1 + return 0 + +## TODO: canonical url not only check if url is valid. Transfer relative url to absolute one +def canonicalUrl(url): + o = urlparse(url) + if o.scheme=='http' and (o.geturl() not in extractor.urls) and not "pdf" in o.path: + return 1 + else: + return 0 + + + +class URLLister(SGMLParser): + def reset(self): + SGMLParser.reset(self) + self.urls = [] + + def start_a(self, attrs): + + href = [v for k, v in attrs if k=='href'] + if href: + if canonicalUrl(href[0]): + self.urls.append(href[0]) + + +startsite = "http://www.ke.tu-darmstadt.de/lehre/arbeiten" +page = urllib.urlopen(startsite) +print "currently visited url: "+startsite +extractor = URLLister() +extractor.feed(page.read()) + + +i = 1 +numberOfSites = 1000 +lastHost = "" +visitedHosts = [] +# crawl 100 sites... +while(i <= numberOfSites): + # get random url from queue + url = random.choice(extractor.urls) + + # check if lastHost == currentHost && robots.txt && already visited + if urlparse(url).netloc != lastHost and checkRobotsTxt(url) and url not in visitedUrls: + ## remove url from queue + extractor.urls.remove(url) + print colored("("+str(i)+"/"+str(numberOfSites)+") currently visiting url: "+url, "blue") + page = urllib.urlopen(url) + visitedUrls.append(url) + extractor.feed(page.read()) + i = i + 1 + lastHost = urlparse(url).netloc + #visitedHosts[urlparse(url).netloc] = 5 + + +extractor.close() + +print "\n \n ==== robots.txt ====" +print "Visited Sites: "+str(visitedSites) +print "Prohibited by robots.txt: "+str(prohibitedSites) + +print "\n \n ==== url queue ====" +for u in extractor.urls: + pass + #print u \ No newline at end of file diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.py b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.py new file mode 100644 index 00000000..f11b824b --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.py @@ -0,0 +1,168 @@ +# coding: utf-8 +# Copyright (c) 2008-2011 Volvox Development Team +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# Author: Konstantin Lepa + +"""ANSII Color formatting for output in terminal.""" + +from __future__ import print_function +import os + + +__ALL__ = [ 'colored', 'cprint' ] + +VERSION = (1, 1, 0) + +ATTRIBUTES = dict( + list(zip([ + 'bold', + 'dark', + '', + 'underline', + 'blink', + '', + 'reverse', + 'concealed' + ], + list(range(1, 9)) + )) + ) +del ATTRIBUTES[''] + + +HIGHLIGHTS = dict( + list(zip([ + 'on_grey', + 'on_red', + 'on_green', + 'on_yellow', + 'on_blue', + 'on_magenta', + 'on_cyan', + 'on_white' + ], + list(range(40, 48)) + )) + ) + + +COLORS = dict( + list(zip([ + 'grey', + 'red', + 'green', + 'yellow', + 'blue', + 'magenta', + 'cyan', + 'white', + ], + list(range(30, 38)) + )) + ) + + +RESET = '\033[0m' + + +def colored(text, color=None, on_color=None, attrs=None): + """Colorize text. + + Available text colors: + red, green, yellow, blue, magenta, cyan, white. + + Available text highlights: + on_red, on_green, on_yellow, on_blue, on_magenta, on_cyan, on_white. + + Available attributes: + bold, dark, underline, blink, reverse, concealed. + + Example: + colored('Hello, World!', 'red', 'on_grey', ['blue', 'blink']) + colored('Hello, World!', 'green') + """ + if os.getenv('ANSI_COLORS_DISABLED') is None: + fmt_str = '\033[%dm%s' + if color is not None: + text = fmt_str % (COLORS[color], text) + + if on_color is not None: + text = fmt_str % (HIGHLIGHTS[on_color], text) + + if attrs is not None: + for attr in attrs: + text = fmt_str % (ATTRIBUTES[attr], text) + + text += RESET + return text + + +def cprint(text, color=None, on_color=None, attrs=None, **kwargs): + """Print colorize text. + + It accepts arguments of print function. + """ + + print((colored(text, color, on_color, attrs)), **kwargs) + + +if __name__ == '__main__': + print('Current terminal type: %s' % os.getenv('TERM')) + print('Test basic colors:') + cprint('Grey color', 'grey') + cprint('Red color', 'red') + cprint('Green color', 'green') + cprint('Yellow color', 'yellow') + cprint('Blue color', 'blue') + cprint('Magenta color', 'magenta') + cprint('Cyan color', 'cyan') + cprint('White color', 'white') + print(('-' * 78)) + + print('Test highlights:') + cprint('On grey color', on_color='on_grey') + cprint('On red color', on_color='on_red') + cprint('On green color', on_color='on_green') + cprint('On yellow color', on_color='on_yellow') + cprint('On blue color', on_color='on_blue') + cprint('On magenta color', on_color='on_magenta') + cprint('On cyan color', on_color='on_cyan') + cprint('On white color', color='grey', on_color='on_white') + print('-' * 78) + + print('Test attributes:') + cprint('Bold grey color', 'grey', attrs=['bold']) + cprint('Dark red color', 'red', attrs=['dark']) + cprint('Underline green color', 'green', attrs=['underline']) + cprint('Blink yellow color', 'yellow', attrs=['blink']) + cprint('Reversed blue color', 'blue', attrs=['reverse']) + cprint('Concealed Magenta color', 'magenta', attrs=['concealed']) + cprint('Bold underline reverse cyan color', 'cyan', + attrs=['bold', 'underline', 'reverse']) + cprint('Dark blink concealed white color', 'white', + attrs=['dark', 'blink', 'concealed']) + print(('-' * 78)) + + print('Test mixing:') + cprint('Underline red on grey color', 'red', 'on_grey', + ['underline']) + cprint('Reversed green on red color', 'green', 'on_red', ['reverse']) + diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.pyc b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b27a1e7ae397eeb0e31e1c705af8c821c3ac2093 GIT binary patch literal 3720 zcmcImU3c6>5-mygB%aubaT15b5ITVIune9A*k#E=7Jrd=v7J2{ha8;4nIliPC&C)Z zNIi+oah}2}kG%4*54-C2>NNnfcM zdX@Cm(nM|!hR)SOxkh@9^lPMFCw-mt8>AKK8#N1Wl78zq77+B?w1c2=&isF#e!#IW zlp?@)1eqp-2p4GIrS!-%2(?AVqi+%LyL`M^JPyNuV#zT$(R-w?u}2Zw9A7EC&ykkG zBF9_`9ge;fZt^)w;TE5$6h7dyi2#oulKzM;mHz)+`Tq;)k4aBMb|i$qiu_6?elL7`h-s*c{3k^qY8Jn?oRUtk0&5 zwhf+(Ldg0d9Evw?4{<3Ni$Y%QvWE3OAAjQht`BG z=!G}gA4f0FKwre3I8p%E&!;VX$QjuIXQQ;vpqS<@x!voSh3{aY>B^D zm?-~^2~zYeSP``&+_-<1(tj~&J{zOs9GRf>4jqq?^%%K&Otk0H>@yjrQ zM|f<_Ze9>=RaS${N(TUQADC2bFwZ?<@^WWrkTl#KSyS`FCEZqB&uvxJF&_a%+{Wpo zf^mkje2WvT9R|aL$eiS%=$+QWJucU_dYGhT-i zvibW~B*hE4roDs<XR+~2tGcYCYr-KFJ?wbhL4cJ9%<9?oZ5Ef%bcA<3l32gaKC ziL}C2f@bA%%?(IH-CMtVZv)SESHz|5ZGV0F!S-5Ldc|Msu5Gt?ix}Vn5nLiJyDk^F zPdL??w9($s_A;BwE=IX0Nrvn$W}OEaIHI!|_8$i(>a3>8!ESQc>2_~@*tykvXm-_R z7=zY256tc;M!$DHQ2DgO)FvJm4~~2$s;}^5vlwXlveR%TohxqsOgVGzlp{UL!SN}+ z-x!o3LF#7_-KbbP~26&-I)~yQqeRE(l6{LgF z5Ui4^WKRi)s*+?E<(jv|pyW=LCGr-B<-f_pTRgnY!#oDLr2VI`n*E;=$H#kuC*uh< zLva5hrMrn!zRc{hyj}FXjHJ{0qZm;>9*$mXSO=i_@kcIt{BQmDxyQujB zVB+_UE+qa%iA56^@+9Q1h3X4{cQ2z+3ULbJW={Aug%xm1A+o`TO#8WvOC@?g+`^bm zW03f2Op~#}`!RD@5uju}<2V4LYYV&lw#@aaSTE#IPDh1WVUAn7K-eob76f%#jhX7S zeW9!zk{UP(x<*?WdtL@*)x+jG$vDbYuvjhd%YYt~ujZVi3zvLf%gk^EWf$;8z<$nn zT1dRWuK-^u2~eHBgd1#%1FFcgBu(M{!h(X9L??^~u=p{MzZD84&b-{xFg!%Z-_GU$ zoE(5}B&pw*tXJ_0L9#-1b1ih*Ne@AZW1diWt^N2%%cd{*+= zaiO%du>mEf(y!M1?)v>Lsd^putxTdYvv_wCeIFi3Pp1J$%2z=AH+bNm3O*;vf1L*o z#OF>c`YaBH?3H`le+|P)lmFoqDIPx^CVCW^JFmi5_A`ctJMBz*lWxQHK$+gDPs6?L so$+3ZkG~vkcr$8d0;KCSov9Z-=iR0=ed;soFg9EJOk$lM#wTIpSNexX-T(jq literal 0 HcmV?d00001