crawler update
This commit is contained in:
parent
e81ddb5c30
commit
a7a937d205
@ -1,60 +0,0 @@
|
||||
import urllib
|
||||
import random
|
||||
from sgmllib import SGMLParser
|
||||
from urlparse import urlparse
|
||||
|
||||
'''
|
||||
TODO:
|
||||
- canonize urls -> canonize? slides?
|
||||
- server timeout -> safe crawled host, set timeout for crawled host
|
||||
- statistics -> http://www.ke.tu-darmstadt.de/lehre/ss13/web-mining/uebung2.html
|
||||
|
||||
'''
|
||||
|
||||
class URLLister(SGMLParser):
    """SGML parser that collects absolute, non-PDF http links from <a> tags."""

    def reset(self):
        SGMLParser.reset(self)
        # Queue of discovered urls, in discovery order, without duplicates.
        self.urls = []

    def start_a(self, attrs):
        # Pull out the href attribute, if the anchor has one.
        hrefs = [value for name, value in attrs if name == 'href']
        if not hrefs:
            return
        # canonize url
        parsed = urlparse(hrefs[0])
        # only use absolute urls....
        if parsed.scheme == 'http' and parsed.geturl() not in self.urls and "pdf" not in parsed.path:
            self.urls.append(parsed.geturl())
|
||||
|
||||
|
||||
# --- crawl driver (module-level script) ---
# Seed the crawl with the department's thesis listing page.
startsite = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
page = urllib.urlopen(startsite)
print "currently visited url: "+startsite
extractor = URLLister()
extractor.feed(page.read())

i = 1
numberOfSites = 1000
lastHost = ""
# crawl 100 sites...
# NOTE(review): comment above says 100 but numberOfSites is 1000 -- confirm intent.
while(i <= numberOfSites):
    # get random url from queue
    url = random.choice(extractor.urls)

    # check if lastHost == currentHost
    # Only fetch when the host differs from the previously crawled host
    # (a crude politeness measure); otherwise re-draw on the next iteration.
    if urlparse(url).netloc != urlparse(lastHost).netloc:
        ## remove url from queue
        extractor.urls.remove(url)
        print "("+str(i)+"/"+str(numberOfSites)+") currently visited url: "+url
        page = urllib.urlopen(url)
        extractor.feed(page.read())
        i = i + 1
        lastHost = url

extractor.close()

# Dump whatever is still queued after the crawl finished.
print "\n \n ==== url queue ===="
for u in extractor.urls:
    pass
    # NOTE(review): original indentation was lost in extraction; `print u`
    # is placed inside the loop here (printing the remaining queue) --
    # confirm against the original layout.
    print u
|
||||
112
ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
Normal file
112
ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
Normal file
@ -0,0 +1,112 @@
|
||||
import urllib
|
||||
import random
|
||||
import robotparser
|
||||
from sgmllib import SGMLParser
|
||||
from urlparse import urlparse
|
||||
import sys
|
||||
from termcolor import colored, cprint
|
||||
|
||||
'''
|
||||
TODO:
|
||||
- canonize urls -> canonize? slides?
|
||||
- server timeout -> safe crawled host, set timeout for crawled host
|
||||
- statistics -> http://www.ke.tu-darmstadt.de/lehre/ss13/web-mining/uebung2.html
|
||||
|
||||
'''
|
||||
|
||||
# some variables
visitedSites = 0     # pages actually fetched (robots.txt allowed the visit)
prohibitedSites = 0  # pages skipped because robots.txt disallowed them
visitedUrls = [] # save already visited urls, so no url will be visited more than once

# NOTE(review): never read or written anywhere below -- presumably a planned
# per-host robots.txt cache; confirm before removing.
robotsTxtResults = {}
|
||||
|
||||
|
||||
def checkRobotsTxt(url):
|
||||
|
||||
o = urlparse(url)
|
||||
robotsUrl = o.scheme+"://"+o.netloc+"/robots.txt"
|
||||
rp = robotparser.RobotFileParser()
|
||||
rp.set_url(robotsUrl)
|
||||
|
||||
try:
|
||||
rp.read()
|
||||
deadLink = 0
|
||||
except:
|
||||
deadLink = 1
|
||||
if deadLink:
|
||||
return 1 # return true if robots.txt doesn't exist
|
||||
else:
|
||||
if rp.can_fetch("*", url):
|
||||
print "Checking robots.txt ("+robotsUrl+") \n "+colored("-> Allowed to visit :) "+url, "green")
|
||||
global visitedSites
|
||||
visitedSites += 1
|
||||
return 1
|
||||
else:
|
||||
print "Checking robots.txt ("+robotsUrl+") \n "+colored("-> Not allowed to visit :( "+url, "red")
|
||||
global prohibitedSites
|
||||
prohibitedSites += 1
|
||||
return 0
|
||||
|
||||
## TODO: canonical url not only check if url is valid. Transfer relative url to absolute one
|
||||
def canonicalUrl(url):
    """Return 1 if url is an absolute http link, not yet queued, and not a PDF.

    NOTE: reads the module-level `extractor` queue to reject duplicates.
    """
    parts = urlparse(url)
    is_http = parts.scheme == 'http'
    is_new = parts.geturl() not in extractor.urls
    is_not_pdf = "pdf" not in parts.path
    if is_http and is_new and is_not_pdf:
        return 1
    return 0
|
||||
|
||||
|
||||
|
||||
class URLLister(SGMLParser):
    """SGML parser that queues every canonical link found in <a href> tags."""

    def reset(self):
        SGMLParser.reset(self)
        # Queue of urls accepted by canonicalUrl(), in discovery order.
        self.urls = []

    def start_a(self, attrs):
        # Pull out the href attribute, if the anchor has one.
        hrefs = [value for name, value in attrs if name == 'href']
        if not hrefs:
            return
        if canonicalUrl(hrefs[0]):
            self.urls.append(hrefs[0])
|
||||
|
||||
|
||||
startsite = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
|
||||
page = urllib.urlopen(startsite)
|
||||
print "currently visited url: "+startsite
|
||||
extractor = URLLister()
|
||||
extractor.feed(page.read())
|
||||
|
||||
|
||||
i = 1
|
||||
numberOfSites = 1000
|
||||
lastHost = ""
|
||||
visitedHosts = []
|
||||
# crawl 100 sites...
|
||||
while(i <= numberOfSites):
|
||||
# get random url from queue
|
||||
url = random.choice(extractor.urls)
|
||||
|
||||
# check if lastHost == currentHost && robots.txt && already visited
|
||||
if urlparse(url).netloc != lastHost and checkRobotsTxt(url) and url not in visitedUrls:
|
||||
## remove url from queue
|
||||
extractor.urls.remove(url)
|
||||
print colored("("+str(i)+"/"+str(numberOfSites)+") currently visiting url: "+url, "blue")
|
||||
page = urllib.urlopen(url)
|
||||
visitedUrls.append(url)
|
||||
extractor.feed(page.read())
|
||||
i = i + 1
|
||||
lastHost = urlparse(url).netloc
|
||||
#visitedHosts[urlparse(url).netloc] = 5
|
||||
|
||||
|
||||
extractor.close()
|
||||
|
||||
print "\n \n ==== robots.txt ===="
|
||||
print "Visited Sites: "+str(visitedSites)
|
||||
print "Prohibited by robots.txt: "+str(prohibitedSites)
|
||||
|
||||
print "\n \n ==== url queue ===="
|
||||
for u in extractor.urls:
|
||||
pass
|
||||
#print u
|
||||
168
ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.py
Normal file
168
ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.py
Normal file
@ -0,0 +1,168 @@
|
||||
# coding: utf-8
|
||||
# Copyright (c) 2008-2011 Volvox Development Team
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
#
|
||||
# Author: Konstantin Lepa <konstantin.lepa@gmail.com>
|
||||
|
||||
"""ANSII Color formatting for output in terminal."""
|
||||
|
||||
from __future__ import print_function
|
||||
import os
|
||||
|
||||
|
||||
__ALL__ = ['colored', 'cprint']

VERSION = (1, 1, 0)

# Text attribute names -> ANSI SGR codes 1-8 (codes 3 and 6 are unused,
# matching the original table which deleted its '' placeholder entries).
ATTRIBUTES = {
    'bold': 1,
    'dark': 2,
    'underline': 4,
    'blink': 5,
    'reverse': 7,
    'concealed': 8,
}


# Background colour names -> ANSI SGR codes 40-47.
HIGHLIGHTS = {
    name: code
    for code, name in enumerate(
        ('on_grey', 'on_red', 'on_green', 'on_yellow',
         'on_blue', 'on_magenta', 'on_cyan', 'on_white'),
        40)
}


# Foreground colour names -> ANSI SGR codes 30-37.
COLORS = {
    name: code
    for code, name in enumerate(
        ('grey', 'red', 'green', 'yellow',
         'blue', 'magenta', 'cyan', 'white'),
        30)
}


# SGR sequence that clears all colours/attributes.
RESET = '\033[0m'
|
||||
|
||||
|
||||
def colored(text, color=None, on_color=None, attrs=None):
    """Colorize text.

    Available text colors:
        red, green, yellow, blue, magenta, cyan, white.

    Available text highlights:
        on_red, on_green, on_yellow, on_blue, on_magenta, on_cyan, on_white.

    Available attributes:
        bold, dark, underline, blink, reverse, concealed.

    Example:
        colored('Hello, World!', 'red', 'on_grey', ['blue', 'blink'])
        colored('Hello, World!', 'green')
    """
    # Honour the conventional kill-switch: return the text untouched.
    if os.getenv('ANSI_COLORS_DISABLED') is not None:
        return text

    fmt_str = '\033[%dm%s'
    # Wrap escape codes around the text, innermost first:
    # colour, then highlight, then each attribute.
    if color is not None:
        text = fmt_str % (COLORS[color], text)
    if on_color is not None:
        text = fmt_str % (HIGHLIGHTS[on_color], text)
    for attr in (attrs if attrs is not None else ()):
        text = fmt_str % (ATTRIBUTES[attr], text)
    # A trailing reset is always appended when colouring is enabled.
    return text + RESET
|
||||
|
||||
|
||||
def cprint(text, color=None, on_color=None, attrs=None, **kwargs):
    """Print colorize text.

    It accepts arguments of print function.
    """
    formatted = colored(text, color, on_color, attrs)
    print(formatted, **kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Self-test: exercise every colour, highlight, attribute and a few combos.
    print('Current terminal type: %s' % os.getenv('TERM'))

    print('Test basic colors:')
    for name in ('grey', 'red', 'green', 'yellow',
                 'blue', 'magenta', 'cyan', 'white'):
        cprint('%s color' % name.capitalize(), name)
    print(('-' * 78))

    print('Test highlights:')
    for name in ('grey', 'red', 'green', 'yellow',
                 'blue', 'magenta', 'cyan'):
        cprint('On %s color' % name, on_color='on_%s' % name)
    cprint('On white color', color='grey', on_color='on_white')
    print('-' * 78)

    print('Test attributes:')
    cprint('Bold grey color', 'grey', attrs=['bold'])
    cprint('Dark red color', 'red', attrs=['dark'])
    cprint('Underline green color', 'green', attrs=['underline'])
    cprint('Blink yellow color', 'yellow', attrs=['blink'])
    cprint('Reversed blue color', 'blue', attrs=['reverse'])
    cprint('Concealed Magenta color', 'magenta', attrs=['concealed'])
    cprint('Bold underline reverse cyan color', 'cyan',
           attrs=['bold', 'underline', 'reverse'])
    cprint('Dark blink concealed white color', 'white',
           attrs=['dark', 'blink', 'concealed'])
    print(('-' * 78))

    print('Test mixing:')
    cprint('Underline red on grey color', 'red', 'on_grey',
           ['underline'])
    cprint('Reversed green on red color', 'green', 'on_red', ['reverse'])
|
||||
|
||||
BIN
ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.pyc
Normal file
BIN
ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.pyc
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user