diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler.py b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler.py
deleted file mode 100644
index 939d4f3f..00000000
--- a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import urllib
-import random
-from sgmllib import SGMLParser
-from urlparse import urlparse
-
-'''
-TODO:
-    - canonize urls -> canonize? slides?
-    - server timeout -> safe crawled host, set timeout for crawled host
-    - statistics -> http://www.ke.tu-darmstadt.de/lehre/ss13/web-mining/uebung2.html
-
-'''
-
-class URLLister(SGMLParser):
-    def reset(self):
-        SGMLParser.reset(self)
-        self.urls = []
-
-    def start_a(self, attrs):
-
-        href = [v for k, v in attrs if k=='href']
-        if href:
-            # canonize url
-            o = urlparse(href[0])
-
-            if o.scheme=='http' and (o.geturl() not in self.urls) and not "pdf" in o.path: # only use absolute urls....
-                self.urls.extend([o.geturl()])
-
-
-startsite = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
-page = urllib.urlopen(startsite)
-print "currently visited url: "+startsite
-extractor = URLLister()
-extractor.feed(page.read())
-
-i = 1
-numberOfSites = 1000
-lastHost = ""
-# crawl 100 sites...
-while(i <= numberOfSites):
-    # get random url from queue
-    url = random.choice(extractor.urls)
-
-    # check if lastHost == currentHost
-    if urlparse(url).netloc != urlparse(lastHost).netloc:
-        ## remove url from queue
-        extractor.urls.remove(url)
-        print "("+str(i)+"/"+str(numberOfSites)+") currently visited url: "+url
-        page = urllib.urlopen(url)
-        extractor.feed(page.read())
-        i = i + 1
-        lastHost = url
-
-
-extractor.close()
-
-print "\n \n ==== url queue ===="
-for u in extractor.urls:
-    pass
-    print u
\ No newline at end of file
diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
new file mode 100644
index 00000000..96075f75
--- /dev/null
+++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
@@ -0,0 +1,112 @@
+import urllib
+import random
+import robotparser
+from sgmllib import SGMLParser
+from urlparse import urlparse
+import sys
+from termcolor import colored, cprint
+
+'''
+TODO:
+    - canonize urls -> canonize? slides?
+    - server timeout -> save crawled host, set timeout for crawled host
+    - statistics -> http://www.ke.tu-darmstadt.de/lehre/ss13/web-mining/uebung2.html
+
+'''
+
+# some variables
+visitedSites = 0
+prohibitedSites = 0
+visitedUrls = [] # save already visited urls, so no url is visited more than once
+
+robotsTxtResults = {}
+
+
+def checkRobotsTxt(url):
+
+    o = urlparse(url)
+    robotsUrl = o.scheme+"://"+o.netloc+"/robots.txt"
+    rp = robotparser.RobotFileParser()
+    rp.set_url(robotsUrl)
+
+    try:
+        rp.read()
+        deadLink = 0
+    except IOError:
+        deadLink = 1
+    if deadLink:
+        return 1 # treat a missing or unreachable robots.txt as "allowed"
+    else:
+        if rp.can_fetch("*", url):
+            print "Checking robots.txt ("+robotsUrl+") \n "+colored("-> Allowed to visit :) "+url, "green")
+            global visitedSites
+            visitedSites += 1
+            return 1
+        else:
+            print "Checking robots.txt ("+robotsUrl+") \n "+colored("-> Not allowed to visit :( "+url, "red")
+            global prohibitedSites
+            prohibitedSites += 1
+            return 0
+
+## TODO: canonicalUrl should not only check whether the url is usable, but also turn relative urls into absolute ones
+def canonicalUrl(url):
+    o = urlparse(url)
+    if o.scheme=='http' and (o.geturl() not in extractor.urls) and not "pdf" in o.path:
+        return 1
+    else:
+        return 0
+
+
+
+class URLLister(SGMLParser):
+    def reset(self):
+        SGMLParser.reset(self)
+        self.urls = []
+
+    def start_a(self, attrs):
+
+        href = [v for k, v in attrs if k=='href']
+        if href:
+            if canonicalUrl(href[0]):
+                self.urls.append(href[0])
+
+
+startsite = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
+page = urllib.urlopen(startsite)
+print "currently visited url: "+startsite
+extractor = URLLister()
+extractor.feed(page.read())
+
+
+i = 1
+numberOfSites = 1000
+lastHost = ""
+visitedHosts = []
+# crawl 1000 sites...
+while(i <= numberOfSites):
+    # get random url from queue
+    url = random.choice(extractor.urls)
+
+    # visit only if the host differs from the last one, robots.txt allows it, and the url is new
+    if urlparse(url).netloc != lastHost and checkRobotsTxt(url) and url not in visitedUrls:
+        ## remove url from queue
+        extractor.urls.remove(url)
+        print colored("("+str(i)+"/"+str(numberOfSites)+") currently visiting url: "+url, "blue")
+        page = urllib.urlopen(url)
+        visitedUrls.append(url)
+        extractor.feed(page.read())
+        i = i + 1
+        lastHost = urlparse(url).netloc
+        #visitedHosts[urlparse(url).netloc] = 5
+
+
+extractor.close()
+
+print "\n \n ==== robots.txt ===="
+print "Visited Sites: "+str(visitedSites)
+print "Prohibited by robots.txt: "+str(prohibitedSites)
+
+print "\n \n ==== url queue ===="
+for u in extractor.urls:
+    pass
+    #print u
\ No newline at end of file
diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.py b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.py
new file mode 100644
index 00000000..f11b824b
--- /dev/null
+++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.py
@@ -0,0 +1,168 @@
+# coding: utf-8
+# Copyright (c) 2008-2011 Volvox Development Team
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+# Author: Konstantin Lepa
+
+"""ANSII Color formatting for output in terminal."""
+
+from __future__ import print_function
+import os
+
+
+__ALL__ = [ 'colored', 'cprint' ]
+
+VERSION = (1, 1, 0)
+
+ATTRIBUTES = dict(
+        list(zip([
+            'bold',
+            'dark',
+            '',
+            'underline',
+            'blink',
+            '',
+            'reverse',
+            'concealed'
+            ],
+            list(range(1, 9))
+            ))
+        )
+del ATTRIBUTES['']
+
+
+HIGHLIGHTS = dict(
+        list(zip([
+            'on_grey',
+            'on_red',
+            'on_green',
+            'on_yellow',
+            'on_blue',
+            'on_magenta',
+            'on_cyan',
+            'on_white'
+            ],
+            list(range(40, 48))
+            ))
+        )
+
+
+COLORS = dict(
+        list(zip([
+            'grey',
+            'red',
+            'green',
+            'yellow',
+            'blue',
+            'magenta',
+            'cyan',
+            'white',
+            ],
+            list(range(30, 38))
+            ))
+        )
+
+
+RESET = '\033[0m'
+
+
+def colored(text, color=None, on_color=None, attrs=None):
+    """Colorize text.
+
+    Available text colors:
+        red, green, yellow, blue, magenta, cyan, white.
+
+    Available text highlights:
+        on_red, on_green, on_yellow, on_blue, on_magenta, on_cyan, on_white.
+
+    Available attributes:
+        bold, dark, underline, blink, reverse, concealed.
+
+    Example:
+        colored('Hello, World!', 'red', 'on_grey', ['blue', 'blink'])
+        colored('Hello, World!', 'green')
+    """
+    if os.getenv('ANSI_COLORS_DISABLED') is None:
+        fmt_str = '\033[%dm%s'
+        if color is not None:
+            text = fmt_str % (COLORS[color], text)
+
+        if on_color is not None:
+            text = fmt_str % (HIGHLIGHTS[on_color], text)
+
+        if attrs is not None:
+            for attr in attrs:
+                text = fmt_str % (ATTRIBUTES[attr], text)
+
+        text += RESET
+    return text
+
+
+def cprint(text, color=None, on_color=None, attrs=None, **kwargs):
+    """Print colorize text.
+
+    It accepts arguments of print function.
+    """
+
+    print((colored(text, color, on_color, attrs)), **kwargs)
+
+
+if __name__ == '__main__':
+    print('Current terminal type: %s' % os.getenv('TERM'))
+    print('Test basic colors:')
+    cprint('Grey color', 'grey')
+    cprint('Red color', 'red')
+    cprint('Green color', 'green')
+    cprint('Yellow color', 'yellow')
+    cprint('Blue color', 'blue')
+    cprint('Magenta color', 'magenta')
+    cprint('Cyan color', 'cyan')
+    cprint('White color', 'white')
+    print(('-' * 78))
+
+    print('Test highlights:')
+    cprint('On grey color', on_color='on_grey')
+    cprint('On red color', on_color='on_red')
+    cprint('On green color', on_color='on_green')
+    cprint('On yellow color', on_color='on_yellow')
+    cprint('On blue color', on_color='on_blue')
+    cprint('On magenta color', on_color='on_magenta')
+    cprint('On cyan color', on_color='on_cyan')
+    cprint('On white color', color='grey', on_color='on_white')
+    print('-' * 78)
+
+    print('Test attributes:')
+    cprint('Bold grey color', 'grey', attrs=['bold'])
+    cprint('Dark red color', 'red', attrs=['dark'])
+    cprint('Underline green color', 'green', attrs=['underline'])
+    cprint('Blink yellow color', 'yellow', attrs=['blink'])
+    cprint('Reversed blue color', 'blue', attrs=['reverse'])
+    cprint('Concealed Magenta color', 'magenta', attrs=['concealed'])
+    cprint('Bold underline reverse cyan color', 'cyan',
+           attrs=['bold', 'underline', 'reverse'])
+    cprint('Dark blink concealed white color', 'white',
+           attrs=['dark', 'blink', 'concealed'])
+    print(('-' * 78))
+
+    print('Test mixing:')
+    cprint('Underline red on grey color', 'red', 'on_grey',
+           ['underline'])
+    cprint('Reversed green on red color', 'green', 'on_red', ['reverse'])
+
diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.pyc b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.pyc
new file mode 100644
index 00000000..b27a1e7a
Binary files /dev/null and b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.pyc differ
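
Review note: the "canonize urls" TODO in crawler.py is still open in this commit; canonicalUrl() only filters urls, it does not turn relative urls into absolute ones. A minimal sketch of the missing step, using only the urlparse module the crawler already imports. The helper name canonize_url and its base_url parameter are illustrative, not part of the commit:

    from urlparse import urljoin, urlparse, urlunparse

    def canonize_url(base_url, href):
        # Resolve a (possibly relative) href against the url of the page it
        # appeared on, e.g. urljoin("http://a.example/x/", "../y") yields
        # "http://a.example/y".
        absolute = urljoin(base_url, href)
        o = urlparse(absolute)
        # Normalise: lower-case scheme and host, drop the fragment, and give
        # an empty path the default "/" so "http://a.example" and
        # "http://a.example/" land in the queue only once.
        return urlunparse((o.scheme.lower(), o.netloc.lower(),
                           o.path or "/", o.params, o.query, ""))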
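
Review note: the "server timeout" TODO (save crawled host, set timeout for crawled host) could be served by a per-host timestamp map. A sketch under assumed names (lastVisit, politeToFetch) and an assumed 5-second delay, not part of the commit:

    import time
    from urlparse import urlparse

    lastVisit = {}  # netloc -> time of the last request to that host

    def politeToFetch(url, delay=5):
        # Allow a request only if the host has been idle for `delay` seconds;
        # on success, record the visit so the next check sees it.
        host = urlparse(url).netloc
        if time.time() - lastVisit.get(host, 0) < delay:
            return False
        lastVisit[host] = time.time()
        return True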
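
Review note: crawler.py declares robotsTxtResults = {} but never reads or writes it; presumably it was meant to cache robots.txt per host so checkRobotsTxt() does not re-fetch the file for every url of the same site. One way that cache could look (keying by netloc and the helper name cachedCanFetch are assumptions):

    import robotparser
    from urlparse import urlparse

    robotsTxtResults = {}  # netloc -> RobotFileParser, or None if unreachable

    def cachedCanFetch(url, agent="*"):
        host = urlparse(url).netloc
        if host not in robotsTxtResults:
            rp = robotparser.RobotFileParser()
            rp.set_url("http://" + host + "/robots.txt")
            try:
                rp.read()
            except IOError:
                rp = None  # missing/unreachable robots.txt: treat as allowed
            robotsTxtResults[host] = rp
        rp = robotsTxtResults[host]
        return rp is None or rp.can_fetch(agent, url)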