slides update
10
ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/challenge.txt
Normal file
@ -0,0 +1,10 @@
|
||||
01 es
|
||||
02 de
|
||||
03 en
|
||||
04 en
|
||||
05 de
|
||||
06 es
|
||||
07 es
|
||||
08 de
|
||||
09 en
|
||||
10 es
|
||||
510
ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/crawler/crawler.py
Normal file
@ -0,0 +1,510 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import urllib2
|
||||
import sys
|
||||
import random
|
||||
import robotparser
|
||||
import re
|
||||
import math
|
||||
from sgmllib import SGMLParser
|
||||
import sgmllib
|
||||
from urlparse import urlparse
|
||||
from urlparse import urljoin
|
||||
import matplotlib.pyplot as plt
|
||||
import time
|
||||
from termcolor import colored
|
||||
from collections import Counter
|
||||
|
||||
'''
|
||||
VN:
|
||||
- Plagiats-Checker fertig
|
||||
- Sprachprüfer fertig
|
||||
|
||||
TODO:
|
||||
- DONE canonize urls -> canonize? slides? -> remember last host -> no magic here -> even using ugly global
|
||||
- DONE with getNextUrlToVisit():
|
||||
server timeout -> safe crawled host, set timeout for crawled host
|
||||
- statistics -> http://www.ke.tu-darmstadt.de/lehre/ss13/web-mining/uebung2.html
|
||||
|
||||
'''
|
||||
|
||||
# crawler attributes
entrypoint = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
entrypoint = "http://www.spiegel.de" # german website  (NOTE: this overrides the assignment above)
#entrypoint = "http://www.cnn.com" # english website
#entrypoint = "http://www.red2000.com/spain/1index.html" # spanish website
#entrypoint = "https://code.google.com/p/feedparser/issues/attachmentText?id=226&aid=-1296926914212963541&name=demonstrate_issue_226.xml&token=CHtgpTsdPmWnNsvScD0yfMuBriU%3A1368702558154"
numberOfPagesToCrawl = 1000     # total crawl budget (successful downloads)
timeBetweenSameHost = 0         # politeness delay in seconds between two hits on the same host (0 = disabled)
visitOnlyTuSites = False;       # restrict the crawl to *.tu-darmstadt.de hosts


#some variables
prohibitedSites = 0             # number of URLs rejected by robots.txt
visitedUrls = []                # already visited urls, so no url will be visited more than once

pages = {}                      # downloaded pages: url -> raw HTML

numberHyperlink = {}            # url -> how often it was drawn from the frontier
numberHyperlinksPerPage = {}    # href target -> how often it appeared as a link

visitedHostsWithTimestamp = {}  # host -> unix timestamp of the last visit
robotsTxtResults = {}           # robots.txt cache: robots url -> RobotFileParser or None

lasthost = ''                   # last host seen; used to resolve relative links
|
||||
|
||||
def normalizeMap(m):
    """Normalize the values of *m* in place so that they sum to 1.0.

    FIX: the original raised ZeroDivisionError for an empty map or a map
    whose values sum to zero; those cases are now a no-op.
    """
    s = float(sum(m.values()))
    if s == 0:
        return

    for k in m:
        m[k] = float(m[k]) / s
|
||||
|
||||
def subtractDicts(dict1, dict2):
    """Per-key count difference of two word-count dicts.

    Shared keys map to max(0, dict1[k] - dict2[k]); keys present in only
    one dict keep that dict's (int-coerced) value unchanged.
    """
    result = dict()

    for key in dict1:
        if key not in dict2:
            result[key] = int(dict1[key])
        else:
            diff = int(dict1[key]) - int(dict2[key])
            result[key] = diff if diff > 0 else 0

    # Keys that appear only in dict2 are carried over as-is.
    for key in dict2:
        if key not in dict1:
            result[key] = int(dict2[key])

    return result
|
||||
|
||||
def countWords(words):
    """Return a dict mapping each word in *words* to its number of occurrences.

    Uses collections.Counter (already imported by this module) instead of
    the hand-rolled counting loop; returns a plain dict so callers see the
    same type as before.
    """
    return dict(Counter(words))
|
||||
|
||||
def blockedByRobotsTxt(url):
|
||||
o = urlparse(url)
|
||||
robotsUrl = o.scheme+"://"+o.netloc+"/robots.txt"
|
||||
|
||||
if url in robotsTxtResults:
|
||||
rp = robotsTxtResults[robotsUrl]
|
||||
else:
|
||||
rp = robotparser.RobotFileParser()
|
||||
rp.set_url(robotsUrl)
|
||||
|
||||
try:
|
||||
rp.read()
|
||||
robotsTxtResults[robotsUrl] = rp
|
||||
except:
|
||||
robotsTxtResults[robotsUrl] = None # robots.txt doesn't exist
|
||||
|
||||
if robotsTxtResults[robotsUrl] == None:
|
||||
return False # return false if robots.txt doesn't exist
|
||||
else:
|
||||
if rp.can_fetch("*", url):
|
||||
return False
|
||||
else:
|
||||
print colored("-> not allowed to visit :( "+url, "red")
|
||||
global prohibitedSites
|
||||
prohibitedSites += 1
|
||||
return True
|
||||
|
||||
def canonicalUrl(url):
    """Canonicalize a href and decide whether it is worth crawling.

    Returns a single-element list with the canonical URL, or an empty list
    when the link should be skipped (non-http scheme, PDF, '..' in the URL,
    or a path with a non-.html extension). Relative links are resolved
    against the most recently seen host (module-level ``lasthost``).
    """
    global lasthost

    cleaned = url.lower().replace(" ", "")
    parts = urlparse(cleaned)

    # Remember the host of every absolute link so that later relative
    # links can be resolved against it.
    if parts.netloc != '':
        lasthost = parts.scheme + '://' + parts.netloc

    crawlable = (parts.scheme == 'http'
                 and "pdf" not in parts.path
                 and ".." not in parts.geturl())

    if crawlable:
        # Accept .html pages and extension-less paths, reject everything else.
        if ".html" in parts.path or "." not in parts.path:
            return [cleaned]
        return []

    if parts.scheme == '':
        return [urljoin(lasthost, parts.path)]
    return []
|
||||
|
||||
|
||||
def getNextUrlToVisit():
    """Draw a random URL from the frontier (extractor.urls) and return it.

    Recurses until it finds a URL that is not filtered out, not blocked by
    robots.txt, not already visited, and whose host is not within its
    politeness delay. Side effects: updates numberHyperlink,
    visitedHostsWithTimestamp and visitedUrls, and removes the chosen URL
    from the frontier.

    NOTE(review): the recursion has no base case for an empty/exhausted
    frontier and can hit Python's recursion limit on long waits — confirm
    the frontier is always large enough in practice.
    """
    url = random.choice(extractor.urls)

    # Optionally restrict the crawl to TU Darmstadt hosts.
    if visitOnlyTuSites:
        if 'tu-darmstadt' not in urlparse(url).netloc:
            extractor.urls.remove(url)
            return getNextUrlToVisit()

    # Count how often this URL was drawn from the frontier.
    if url in numberHyperlink:
        numberHyperlink[url] += 1
    else:
        numberHyperlink[url] = 1

    host = urlparse(url).netloc

    ## check if url is blocked by robots.txt or was already visited ##
    if blockedByRobotsTxt(url) or url in visitedUrls:
        extractor.urls.remove(url)
        return getNextUrlToVisit()

    ## check if host got a timeout (politeness delay between hits on the same host)
    if host in visitedHostsWithTimestamp:
        timestamp = visitedHostsWithTimestamp[host]
        secondsSinceLastVisit = int(time.time()) - timestamp
        if secondsSinceLastVisit >= timeBetweenSameHost:
            # Delay elapsed: record the visit and hand the URL out.
            visitedHostsWithTimestamp[host] = int(time.time())
            visitedUrls.append(url)
            extractor.urls.remove(url)
            return url
        else:
            # Host still cooling down: report and pick another URL.
            secondsToWait = timeBetweenSameHost - secondsSinceLastVisit
            print colored(" -> give host ("+host+") a break (wait at least: "+str(secondsToWait)+" seconds)", "magenta")
            return getNextUrlToVisit()
    else:
        # First visit to this host.
        visitedHostsWithTimestamp[host] = int(time.time())
        visitedUrls.append(url)
        extractor.urls.remove(url)
        return url
|
||||
|
||||
|
||||
class URLLister(SGMLParser):
    """SGML parser that collects canonicalized outgoing links in self.urls.

    self.urls doubles as the crawl frontier: getNextUrlToVisit() removes
    entries from it while feed() keeps appending new ones.
    """

    ## fix SGMLParseError
    def resetParser(self):
        # Reset only the parser-internal state after a parse error;
        # deliberately does NOT clear self.urls (unlike reset()).
        SGMLParser.reset(self)

    def reset(self):
        # Called by SGMLParser.__init__; initializes the frontier.
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        # Handler for every <a ...> start tag encountered while feeding.
        href = [v for k, v in attrs if k=='href']
        if href:
            # canonicalUrl returns [] for links that should be skipped.
            url = canonicalUrl(href[0])
            self.urls.extend(url)

            # Count how often each raw href target has been seen across all
            # parsed pages (module-level statistic).
            if href[0] in numberHyperlinksPerPage:
                numberHyperlinksPerPage[href[0]] += 1
            else:
                numberHyperlinksPerPage[href[0]] = 1
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":

    # --- seed the frontier with the links on the entry page ---
    page = urllib2.urlopen(entrypoint, timeout = 5)
    print "currently visited url: "+entrypoint
    extractor = URLLister()
    extractor.feed(page.read())
    page.close()


    # --- main crawl loop: download numberOfPagesToCrawl pages ---
    i = 1
    while(i <= numberOfPagesToCrawl):
        url = getNextUrlToVisit()
        print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") currently visiting url: "+url, "blue")
        try:
            page = urllib2.urlopen(url, timeout = 6)
            pageContent = page.read()
            pageContent = pageContent.replace('<![CDATA[', '<![CDATA[') ## bugfix for SGMLParser
            page.close()
            extractor.feed(pageContent)
            pages[url] = pageContent
            i += 1   # only successful downloads count against the budget

        # exception handling: failed pages are reported and skipped
        except urllib2.HTTPError, err:
            if err.code == 404:
                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: not found: "+url, "red")
                pass
            if err.code == 400:
                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: bad request: "+url, "red")
                pass
            if err.code == 403:
                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: forbidden: "+url, "red")
                pass
        except urllib2.URLError:
            print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: urllib2.URLError: "+url, "red")
            pass
        except sgmllib.SGMLParseError:
            # Parser state is broken; reset it but keep the frontier.
            print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: sgmllib.SGMLParseError: "+url, "red")
            extractor.resetParser()
            pass
        except:
            print "Unexpected error:", sys.exc_info()[0]
            pass

    extractor.close()

    # --- robots.txt statistics ---
    print "\n \n ==== robots.txt ===="
    print "prohibit by robots.txt: "+str(prohibitedSites)


    ## print table number hyperlinks per website ##
    print "\n \n ==== numberHyperlink ===="
    print "#Hyperlinks \t Website"

    # Histogram: how many URLs were drawn from the frontier u times.
    linkCount1 = {}
    for u in numberHyperlink.values():
        if u not in linkCount1:
            linkCount1[u] = 1
        else:
            linkCount1[u] += 1

    xValues1 = []
    yValues1 = []

    for u in linkCount1:
        xValues1.append(u)
        yValues1.append(linkCount1[u])

    plt.plot(xValues1, yValues1)
    plt.xlabel('Haeufigkeiten des Auftretens')
    plt.ylabel('Anzahl der URLs')
    plt.show()


    ## print table number hyperlinks to page ##
    print "\n \n ==== Anzahl URLs pro Seite ===="
    print "#Anzahl URLs pro Seite"
    # Histogram: how many href targets were linked u times.
    linkCount2 = {}
    for u in numberHyperlinksPerPage.values():
        if u not in linkCount2:
            linkCount2[u] = 1
        else:
            linkCount2[u] += 1

    xValues2 = []
    yValues2 = []

    for u in linkCount2:
        xValues2.append(u)
        yValues2.append(linkCount2[u])

    # Second plot is disabled (kept as an inert string literal).
    '''plt.plot(xValues2, yValues2)
    plt.xlabel('Anzahl der Hyperlinks pro Seite')
    plt.ylabel('Anzahl der URLs')
    #plt.xscale('log')
    #plt.yscale('log')
    plt.show()'''

    print "\n \n ==== url queue ===="
    for u in extractor.urls:
        pass
        #print u

    threshold = 0.9 # how similar two pages must be to count as copied content

    #print "\n \n ==== copied content probability (>= " + str(threshold*100) + " %) ===="
    #print "URL1 \t URL2 \t Similarity in %"
    # wordcounts per page
    wordCountsByPage = {}
    charsByPage = {}
    ## count words in all pages ##
    for url in pages:
        tmp = re.sub("[\n\r]", "", pages[url]) # strip newlines/carriage returns
        tmp = re.sub("<\s*script.*?>.+?<\s*\/script.*?>", "", tmp) # remove all scripts
        tmp = re.sub("<\s*style.*?>.+?<\s*\/style.*?>", "", tmp) # remove all styles
        tmp = re.sub("&.+?;", "", tmp) # remove all html entities
        tmp = re.sub("<.+?>", "", tmp) # remove all html tags
        tmp = re.sub("\d", "", tmp) # remove all numbers
        words = re.findall("(\w+)", tmp) # split words
        words = [x.lower() for x in words] # all words to lower case
        words = [s for s in words if len(s) > 4 and len(s) <= 10]  # keep medium-length words only

        wordCountsByPage[url] = countWords(words)

        chars = re.findall("[A-za-z]", tmp); # find all characters
        chars = [x.lower() for x in chars] # all characters to lower case
        charsByPage[url] = chars

    ## calculate wordcount deltas and print double-content sites ##
    wordCountDeltas = {}
    for url1 in wordCountsByPage:
        for url2 in wordCountsByPage:
            if url1 == url2:
                continue

            if url1 not in wordCountDeltas:
                wordCountDeltas[url1] = {}
            if url2 in wordCountDeltas[url1]: # do it once only
                continue

            wordCounts1 = wordCountsByPage[url1]
            wordCounts2 = wordCountsByPage[url2]

            sum1 = sum(wordCounts1.values())
            if sum1 == 0:
                continue   # empty page: similarity undefined, skip

            #print "calculating deltas of url1: " + url1 + " -- url2: " + url2
            deltaWordCounts = subtractDicts(wordCounts1, wordCounts2)

            # Relative word-count difference; 1 - delta is the similarity.
            wordCountDeltas[url1][url2] = math.fabs(float(sum(deltaWordCounts.values())) / float(sum1))
            if 1 - wordCountDeltas[url1][url2] > threshold:
                #print url1 + " \t " + url2 + " \t " + str((1 - wordCountDeltas[url1][url2]) * 100)
                pass

    ## determine the sites' languages ##
    spanish = 'es'
    english = 'en'
    german = 'de'

    pageLanguages = {}
    # Reference letter frequencies (percent) per language.
    lettersByLanguage = {}
    lettersByLanguage[spanish] = {
        'e' : 13.68, 'a' : 12.53, 'o' : 8.68, 's' : 7.98, 'r' : 6.87,
        'n' : 6.71, 'i' : 6.25, 'd' : 5.86, 'l' : 4.97, 'c' : 4.68,
        't' : 4.63, 'u' : 3.93, 'm' : 3.15, 'p' : 2.51, 'b' : 1.42,
        'g' : 1.01, 'v' : 0.90, 'y' : 0.90, 'q' : 0.88, 'h' : 0.70,
        'f' : 0.69, 'z' : 0.52, 'j' : 0.44, 'x' : 0.21, 'w' : 0.02,
        'k' : 0.01
    }
    lettersByLanguage[english] = {
        'e' : 12.70, 't' : 9.06, 'a' : 8.16, 'o' : 7.50, 'i' : 6.96,
        'n' : 6.74, 's' : 6.32, 'h' : 6.09, 'r' : 5.99, 'd' : 4.25,
        'l' : 4.03, 'c' : 2.78, 'u' : 2.76, 'm' : 2.41, 'w' : 2.36,
        'f' : 2.23, 'g' : 2.02, 'y' : 1.97, 'p' : 1.93, 'b' : 1.49,
        'v' : 0.98, 'k' : 0.77, 'j' : 0.15, 'x' : 0.15, 'q' : 0.10,
        'z' : 0.07
    }
    lettersByLanguage[german] = {
        'e' : 17.4, 'n' : 9.78, 'i' : 7.55, 's' : 7.27, 'r' : 7.00,
        'a' : 6.51, 't' : 6.15, 'd' : 5.08, 'h' : 4.76, 'u' : 4.35,
        'l' : 3.44, 'c' : 3.06, 'g' : 3.01, 'o' : 2.59, 'm' : 2.53,
        'b' : 1.89, 'w' : 1.89, 'f' : 1.66, 'k' : 1.21, 'z' : 1.13,
        'v' : 0.85, 'p' : 0.67, 'j' : 0.27, 'y' : 0.04, 'x' : 0.03,
        'q' : 0.02
    }

    # normalize maps so frequencies sum to 1
    normalizeMap(lettersByLanguage[spanish])
    normalizeMap(lettersByLanguage[english])
    normalizeMap(lettersByLanguage[german])

    languageCounts = {}
    for url in charsByPage:
        tokens = charsByPage[url]
        tokenCounts = dict(Counter(tokens))

        tokenSum = sum(tokenCounts.values())

        # Calculating the squared error
        rankings = {}
        matches = {}
        for token in tokenCounts:
            for key2 in lettersByLanguage:
                if token not in lettersByLanguage[key2]:
                    continue
                p = float(lettersByLanguage[key2][token]) * 100
                if p >= 0:
                    if key2 not in rankings:
                        rankings[key2] = 0
                        matches[key2] = 0
                    # calculate the squared error from observed and reference frequencies
                    rankings[key2] += math.pow(math.fabs(tokenCounts[token] * 100 / tokenSum - p), 2)
                    matches[key2] += 1

        # Resulting language has the minimal mean squared error
        minRanking = -1
        language = None
        for key in rankings:
            rankings[key] /= matches[key]

            if minRanking == -1 or rankings[key] < minRanking:
                minRanking = rankings[key]
                language = key

        if language != None:
            pageLanguages[url] = language

            if language not in languageCounts:
                languageCounts[language] = 1
            else:
                languageCounts[language] += 1

    print "\n \n ==== language distribution ===="
    print "Language \t Number of occurences"
    for lang in languageCounts:
        print lang + " \t " + str(languageCounts[lang])
|
||||
@ -0,0 +1,168 @@
|
||||
# coding: utf-8
|
||||
# Copyright (c) 2008-2011 Volvox Development Team
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
#
|
||||
# Author: Konstantin Lepa <konstantin.lepa@gmail.com>
|
||||
|
||||
"""ANSII Color formatting for output in terminal."""
|
||||
|
||||
from __future__ import print_function
|
||||
import os
|
||||
|
||||
|
||||
# Public API of this module.
# NOTE(review): the conventional spelling is __all__ (lowercase); as written
# this has no effect on `from termcolor import *`.
__ALL__ = [ 'colored', 'cprint' ]

VERSION = (1, 1, 0)

# Text attribute name -> ANSI SGR code (1..8). The two empty entries keep the
# zip aligned with unused codes and are deleted afterwards.
ATTRIBUTES = dict(
    list(zip([
        'bold', 'dark', '', 'underline', 'blink', '', 'reverse', 'concealed'
        ],
        list(range(1, 9))
    ))
)
del ATTRIBUTES['']


# Background color name -> ANSI SGR code (40..47).
HIGHLIGHTS = dict(
    list(zip([
        'on_grey', 'on_red', 'on_green', 'on_yellow', 'on_blue', 'on_magenta', 'on_cyan', 'on_white'
        ],
        list(range(40, 48))
    ))
)


# Foreground color name -> ANSI SGR code (30..37).
COLORS = dict(
    list(zip([
        'grey', 'red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white',
        ],
        list(range(30, 38))
    ))
)


# Escape sequence that resets all text attributes.
RESET = '\033[0m'
|
||||
|
||||
|
||||
def colored(text, color=None, on_color=None, attrs=None):
    """Wrap *text* in ANSI escape sequences and return it.

    Available text colors:
        red, green, yellow, blue, magenta, cyan, white.

    Available text highlights:
        on_red, on_green, on_yellow, on_blue, on_magenta, on_cyan, on_white.

    Available attributes:
        bold, dark, underline, blink, reverse, concealed.

    Setting the ANSI_COLORS_DISABLED environment variable returns the text
    unchanged.

    Example:
        colored('Hello, World!', 'red', 'on_grey', ['blue', 'blink'])
        colored('Hello, World!', 'green')
    """
    if os.getenv('ANSI_COLORS_DISABLED') is not None:
        return text

    escape = '\033[%dm%s'
    if color is not None:
        text = escape % (COLORS[color], text)
    if on_color is not None:
        text = escape % (HIGHLIGHTS[on_color], text)
    for attr in (attrs or []):
        text = escape % (ATTRIBUTES[attr], text)

    return text + RESET
|
||||
|
||||
|
||||
def cprint(text, color=None, on_color=None, attrs=None, **kwargs):
    """Colorize *text* with colored() and print it.

    Extra keyword arguments are forwarded to the print function.
    """
    print(colored(text, color, on_color, attrs), **kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Visual self-test: print every color, highlight, attribute and a few
    # combinations so the terminal output can be inspected by eye.
    print('Current terminal type: %s' % os.getenv('TERM'))
    print('Test basic colors:')
    cprint('Grey color', 'grey')
    cprint('Red color', 'red')
    cprint('Green color', 'green')
    cprint('Yellow color', 'yellow')
    cprint('Blue color', 'blue')
    cprint('Magenta color', 'magenta')
    cprint('Cyan color', 'cyan')
    cprint('White color', 'white')
    print(('-' * 78))

    print('Test highlights:')
    cprint('On grey color', on_color='on_grey')
    cprint('On red color', on_color='on_red')
    cprint('On green color', on_color='on_green')
    cprint('On yellow color', on_color='on_yellow')
    cprint('On blue color', on_color='on_blue')
    cprint('On magenta color', on_color='on_magenta')
    cprint('On cyan color', on_color='on_cyan')
    cprint('On white color', color='grey', on_color='on_white')
    print('-' * 78)

    print('Test attributes:')
    cprint('Bold grey color', 'grey', attrs=['bold'])
    cprint('Dark red color', 'red', attrs=['dark'])
    cprint('Underline green color', 'green', attrs=['underline'])
    cprint('Blink yellow color', 'yellow', attrs=['blink'])
    cprint('Reversed blue color', 'blue', attrs=['reverse'])
    cprint('Concealed Magenta color', 'magenta', attrs=['concealed'])
    cprint('Bold underline reverse cyan color', 'cyan',
           attrs=['bold', 'underline', 'reverse'])
    cprint('Dark blink concealed white color', 'white',
           attrs=['dark', 'blink', 'concealed'])
    print(('-' * 78))

    print('Test mixing:')
    cprint('Underline red on grey color', 'red', 'on_grey',
           ['underline'])
    cprint('Reversed green on red color', 'green', 'on_red', ['reverse'])
|
||||
|
||||
BIN
ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon.xpi
Normal file
@ -0,0 +1,5 @@
|
||||
This is the keaddon add-on. It contains:
|
||||
|
||||
* A program (lib/main.js).
|
||||
* A few tests.
|
||||
* Some meager documentation.
|
||||
@ -0,0 +1,26 @@
|
||||
// Content script (page-mod worker): collects the inner HTML of every <p>
// element, strips the markup, and posts the plain text back to the add-on.
var text = "";
var cleantext = "";
var paragraphs = document.getElementsByTagName('p');
var open = '<';
var close = '>';
for(var i=0; i<paragraphs.length; i++) {
    text += paragraphs[i].innerHTML;
}

// Strip tags: copy characters one by one, skipping anything between
// '<' and '>' (inclusive).
var doAppend = true;
var tmp = "";
for(var i=0; i<text.length; i++) {
    tmp = text.charAt(i);
    if( tmp == open ) {
        doAppend = false;
    }
    if(doAppend) {
        cleantext += tmp;
    }
    if( tmp == close ) {
        doAppend = true;
    }
}
//cleantext = unescape(cleantext);

// Send the cleaned page text to the add-on (handled in main.js).
postMessage(cleantext);
|
||||
@ -0,0 +1,3 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" version="1.1" width="900" height="600"><rect width="900" height="600" fill="#ED2939"/><rect width="600" height="600" fill="#fff"/><rect width="300" height="600" fill="#002395"/></svg>
|
||||
|
After Width: | Height: | Size: 378 B |
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" width="1000" height="600" viewBox="0 0 5 3">
|
||||
<desc>Flag of Germany</desc>
|
||||
<rect id="black_stripe" width="5" height="3" y="0" x="0" fill="#000"/>
|
||||
<rect id="red_stripe" width="5" height="2" y="1" x="0" fill="#D00"/>
|
||||
<rect id="gold_stripe" width="5" height="1" y="2" x="0" fill="#FFCE00"/>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 491 B |
|
After Width: | Height: | Size: 230 KiB |
@ -0,0 +1,10 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 60 30" width="1200" height="600">
|
||||
<clipPath id="t">
|
||||
<path d="M30,15 h30 v15 z v15 h-30 z h-30 v-15 z v-15 h30 z"/>
|
||||
</clipPath>
|
||||
<path d="M0,0 v30 h60 v-30 z" fill="#00247d"/>
|
||||
<path d="M0,0 L60,30 M60,0 L0,30" stroke="#fff" stroke-width="6"/>
|
||||
<path d="M0,0 L60,30 M60,0 L0,30" clip-path="url(#t)" stroke="#cf142b" stroke-width="4"/>
|
||||
<path d="M30,0 v30 M0,15 h60" stroke="#fff" stroke-width="10"/>
|
||||
<path d="M30,0 v30 M0,15 h60" stroke="#cf142b" stroke-width="6"/>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 521 B |
|
After Width: | Height: | Size: 760 B |
|
After Width: | Height: | Size: 2.4 KiB |
|
After Width: | Height: | Size: 9.9 KiB |
|
After Width: | Height: | Size: 835 B |
|
After Width: | Height: | Size: 674 B |
@ -0,0 +1,2 @@
|
||||
The main module is a program that creates a widget. When a user clicks on
|
||||
the widget, the program loads the mozilla.org website in a new tab.
|
||||
@ -0,0 +1,4 @@
|
||||
// ISO 639-1 language codes the add-on can detect.
exports.german = 'de';
exports.french = 'fr';
exports.spanish = 'es';
exports.english = 'en';
|
||||
@ -0,0 +1,65 @@
|
||||
// Add-on SDK entry point: injects the keworker content script into every
// page, classifies the extracted text, and shows the matching flag icon in
// a toolbar widget.
var widgets = require("widget");
var pageMod = require("page-mod");
var student = require("student");
var data = require("self").data;

var workers = new Array();  // attached content-script workers (torn down on unload)
var mod = null;             // the PageMod instance (torn down on unload)

exports.main = function(options, callback) {
    // Attach the text-extraction worker to every page ("*").
    mod = pageMod.PageMod(
        {
            include: "*",
            contentScriptWhen:"ready",
            contentScriptFile: data.url("./contentScripts/keworker.js"),
            onAttach: function onAttach(worker) {
                worker.on('message', handleMessage);
                workers.push(worker);
            }
        }
    );

    // Toolbar widget whose icon reflects the detected language.
    var widget = widgets.Widget(
        {
            id: "ke",
            label: "Knowledge Engineering",
            contentURL: data.url("keicon.png")
        }
    );

    // Receives the plain page text posted by keworker.js and swaps the
    // widget icon for the flag of the detected language.
    function handleMessage(message) {
        var lang = require("language");
        if(message.length > 0) {
            //TODO: Iconswitch
            var language = student.student(message);
            console.log(language);
            switch(language) {
                case lang.german:
                    widget.contentURL = data.url("./flag/de.png");
                    break;
                case lang.spanish:
                    widget.contentURL = data.url("./flag/es.png");
                    break;
                case lang.english:
                    widget.contentURL = data.url("./flag/en.png");
                    break;
                case lang.french:
                    widget.contentURL = data.url("./flag/fr.png");
                    break;
                default:
                    // Unknown language: fall back to the neutral icon.
                    widget.contentURL = data.url("./keicon.png");

            }
            //TODO: response
        }
    }

    console.log("The add-on is running.");
}

exports.onUnload = function(reason) {
    // Clean up the page-mod and every attached worker when the add-on is
    // disabled or uninstalled.
    if(mod != null) {mod.destroy();}
    for(var i=0; i<workers.length; i++) {
        workers[i].destroy();
    }
}
|
||||
@ -0,0 +1,8 @@
|
||||
var lang = require("language");
var util = require("utility");

// Language classifier stub: should analyze *text* and return one of the
// codes from the language module. Currently hard-coded to German — TODO:
// implement the actual detection (the utility helpers above are intended
// for this).
function student(text) {
    return lang.german;
}

exports.student = student;
|
||||
@ -0,0 +1,82 @@
|
||||
|
||||
/*
|
||||
Will count all equal array entries and list them in a javascript object such
|
||||
that there is a object property for each unique array entry. The value of that
|
||||
property equals the number of occurences. The order of the properties is sorted
|
||||
according to the number of occurences. With the most occurences first.
|
||||
|
||||
You can use this construct
|
||||
|
||||
for(var key in obj) {
|
||||
console.log(obj[key]);
|
||||
}
|
||||
|
||||
to iterate over all items in sorted order.
|
||||
*/
|
||||
// Count equal array entries; returns an object with one property per unique
// entry whose value is the number of occurrences, with properties inserted
// in descending-count order (see the usage comment above).
function countElements(array) {
    var tmp = {};        // entry -> raw count
    var result = {};     // final object, filled in sorted order
    var helpArray = [];  // [{key, value}] pairs used for sorting
    for(var i=0; i<array.length; i++) {
        var item = array[i];
        if(!tmp.hasOwnProperty( item )) {
            tmp[item] = 1;
        } else {
            tmp[item] += 1;
        }
    }
    for(var key in tmp) {
        helpArray.push({key:key, value:tmp[key]});
    }
    helpArray.sort(comparePairs);  // descending by count
    for(var i=0; i<helpArray.length; i++) {
        result[helpArray[i].key] = helpArray[i].value;
    }
    return result;
};
|
||||
|
||||
/*Helperfunction for countElements*/
|
||||
/* Sort comparator for {key, value} pairs produced by countElements:
   orders by value, descending (largest count first). */
function comparePairs(a, b) {
    var difference = b.value - a.value;
    return difference;
}
|
||||
|
||||
/*
|
||||
Tokenize a text at whitespace.
|
||||
Some punctuations are removed
|
||||
*/
|
||||
/*
Split *text* into lowercase tokens at non-word characters, after removing
common punctuation (. , ! ? ' " \ / |).
*/
function tokenize(text) {
    var normalized = text.toLowerCase().replace(/(\.|,|!|\?|'|"|\\|\/|\|)/g, "");
    return normalized.split(/\W/g);
}
|
||||
|
||||
/*
|
||||
Create an array of chars from a given text.
|
||||
*/
|
||||
/*
Return the characters of *text* as an array, one element per character.
*/
function toCharArray(text) {
    var chars = [];
    var length = text.length;
    for (var idx = 0; idx < length; idx++) {
        chars.push(text.charAt(idx));
    }
    return chars;
}
|
||||
|
||||
|
||||
/*
|
||||
Create an array of charpairs from a given text.
|
||||
*/
|
||||
/*
Return all adjacent character pairs (bigrams) of *text*; empty for inputs
shorter than two characters.
*/
function toCharPairs(text) {
    var pairs = [];
    for (var idx = 1; idx < text.length; idx++) {
        pairs.push(text.charAt(idx - 1) + text.charAt(idx));
    }
    return pairs;
}
|
||||
|
||||
// Public API of the utility module.
exports.countElements = countElements;
exports.tokenize = tokenize;
exports.toCharArray = toCharArray;
exports.toCharPairs = toCharPairs;
|
||||
@ -0,0 +1,10 @@
|
||||
{
|
||||
"name": "keaddon",
|
||||
"license": "MPL 1.1/GPL 2.0/LGPL 2.1",
|
||||
"author": "Clemens Dörrhöfer",
|
||||
"version": "0.1",
|
||||
"fullName": "keaddon",
|
||||
"id": "jid0-GN3ivO79cgfs9k4P3lxdo7TPFa4",
|
||||
"description": "a basic add-on",
|
||||
"icon": "data/keicon.png"
|
||||
}
|
||||
@ -0,0 +1,83 @@
|
||||
// Add-on SDK unit tests for the keaddon modules (run with `cfx test`).
const main = require("main");
const lang = require("language");
exports.test_test_run = function(test) {
    test.pass("Unit test running!");
};

exports.test_id = function(test) {
    test.assert(require("self").id.length > 0);
};

// Network round trip: requires internet access to mozilla.org.
exports.test_url = function(test) {
    require("request").Request({
        url: "http://www.mozilla.org/",
        onComplete: function(response) {
            test.assertEqual(response.statusText, "OK");
            test.done();
        }
    }).get();
    test.waitUntilDone(20000);
};

exports.test_open_tab = function(test) {
    const tabs = require("tabs");
    tabs.open({
        url: "http://www.mozilla.org/",
        onReady: function(tab) {
            test.assertEqual(tab.url, "http://www.mozilla.org/");
            test.done();
        }
    });
    test.waitUntilDone(20000);
};

// Set by the compare helpers below so a failing assert reports the reason.
var errormessage = "";

exports.test_util_countElements = function(test) {
    const util = require("utility");
    test.assert(compareObjects(util.countElements(["du", "du", "hallo", "hallo", "du"]),{"hallo":2, "du":3}),errormessage);
};

exports.test_util_toCharArray = function(test) {
    const util = require("utility");
    test.assert(compareArrays(util.toCharArray("test"), ["t","e","s","t"]), errormessage);
};

exports.test_util_toCharPairs = function(test) {
    const util = require("utility");
    test.assert(compareArrays(util.toCharPairs("mainz"),["ma", "ai", "in", "nz"]), errormessage);
};

exports.test_util_tokenize = function(test) {
    const util = require("utility");
    test.assert(compareArrays(util.tokenize("Dem Igel geht's gut."),["dem","igel","gehts","gut"]), errormessage);
};

exports.test_student_student = function(test) {
    const student = require("student");
    var text = "blubber";
    test.assertEqual(student.student(text), lang.german, "Geht nicht weil.");
};

// NOTE(review): only iterates the keys of a, so b may contain extra keys
// and the objects still compare equal — confirm this asymmetry is intended.
function compareObjects(a,b) {
    for(var key in a) {
        if( a[key] != b[key] ) {
            return false;
        }
    }
    return true;
};

// Element-wise array comparison; records a human-readable mismatch reason
// in the module-level errormessage variable.
function compareArrays(a,b) {
    if (a.length != b.length) {
        errormessage = "Arrays of unequal size";
        return false
    }
    for(var i=0; i<a.length; i++) {
        if (a[i] != b[i]) {
            errormessage = a[i] + " != " + b[i];
            return false;
        }
    }
    return true;
};
|
||||
BIN
ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/solution.pdf
Normal file
@ -34,7 +34,7 @@ entrypoint = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
|
||||
#entrypoint = "http://www.red2000.com/spain/1index.html" # spanish website
|
||||
#entrypoint = "https://code.google.com/p/feedparser/issues/attachmentText?id=226&aid=-1296926914212963541&name=demonstrate_issue_226.xml&token=CHtgpTsdPmWnNsvScD0yfMuBriU%3A1368702558154"
|
||||
numberOfPagesToCrawl = 1000
|
||||
timeBetweenSameHost = 2 # 5 sec
|
||||
timeBetweenSameHost = 0 # 5 sec
|
||||
visitOnlyTuSites = True;
|
||||
|
||||
|
||||
@ -247,27 +247,52 @@ if __name__ == "__main__":
|
||||
## print table number hyperlinks per website ##
|
||||
print "\n \n ==== numberHyperlink ===="
|
||||
print "#Hyperlinks \t Website"
|
||||
keys = numberHyperlink.keys()
|
||||
keys.sort( lambda x,y: cmp(numberHyperlink[y], numberHyperlink[x]) ) # sort keys
|
||||
i = 0
|
||||
for u in keys:
|
||||
pass
|
||||
if i < 50:
|
||||
print str(numberHyperlink[u])+"\t \t \t"+u
|
||||
i += 1
|
||||
|
||||
|
||||
linkCount1 = {}
|
||||
for u in numberHyperlink.values():
|
||||
if u not in linkCount1:
|
||||
linkCount1[u] = 1
|
||||
else:
|
||||
linkCount1[u] += 1
|
||||
|
||||
|
||||
xValues1 = []
|
||||
yValues1 = []
|
||||
|
||||
for u in linkCount1:
|
||||
xValues1.append(u)
|
||||
yValues1.append(linkCount1[u])
|
||||
|
||||
plt.plot(xValues1, yValues1)
|
||||
plt.xlabel('Haeufigkeiten des Auftretens')
|
||||
plt.ylabel('Anzahl der URLs')
|
||||
plt.show()
|
||||
|
||||
|
||||
## print table number hyperlinks to page ##
|
||||
print "\n \n ==== numberHyperlinksPerPage ===="
|
||||
print "#HyperlinksToPage \t Website"
|
||||
keys = numberHyperlinksPerPage.keys()
|
||||
keys.sort( lambda x,y: cmp(numberHyperlinksPerPage[y], numberHyperlinksPerPage[x]) ) # sort keys
|
||||
i = 0
|
||||
for u in keys:
|
||||
pass
|
||||
if i < 50:
|
||||
print str(numberHyperlinksPerPage[u])+"\t \t \t"+u
|
||||
i += 1
|
||||
print "\n \n ==== Anzahl URLs pro Seite ===="
|
||||
print "#Anzahl URLs pro Seite"
|
||||
linkCount2 = {}
|
||||
for u in numberHyperlinksPerPage.values():
|
||||
if u not in linkCount2:
|
||||
linkCount2[u] = 1
|
||||
else:
|
||||
linkCount2[u] += 1
|
||||
|
||||
|
||||
xValues2 = []
|
||||
yValues2 = []
|
||||
|
||||
for u in linkCount2:
|
||||
xValues2.append(u)
|
||||
yValues2.append(linkCount2[u])
|
||||
|
||||
'''plt.plot(xValues2, yValues2)
|
||||
plt.xlabel('Anzahl der Hyperlinks pro Seite')
|
||||
plt.ylabel('Anzahl der URLs')
|
||||
#plt.xscale('log')
|
||||
#plt.yscale('log')
|
||||
plt.show()'''
|
||||
|
||||
print "\n \n ==== url queue ===="
|
||||
for u in extractor.urls:
|
||||
|
||||
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 23 KiB |
|
After Width: | Height: | Size: 24 KiB |
|
After Width: | Height: | Size: 47 KiB |
BIN
ss2013/1_Web Mining/Uebungen/2_Uebung/latex/grafiken/a2_tu_1.png
Normal file
|
After Width: | Height: | Size: 21 KiB |
BIN
ss2013/1_Web Mining/Uebungen/2_Uebung/latex/grafiken/a2_tu_2.png
Normal file
|
After Width: | Height: | Size: 24 KiB |
BIN
ss2013/1_Web Mining/Uebungen/2_Uebung/latex/grafiken/a2_tu_3.png
Normal file
|
After Width: | Height: | Size: 22 KiB |
BIN
ss2013/1_Web Mining/Uebungen/2_Uebung/latex/grafiken/a2_tu_4.png
Normal file
|
After Width: | Height: | Size: 40 KiB |
@ -47,26 +47,38 @@
|
||||
\@writefile{nav}{\headcommand {\beamer@subsectionpages {2}{8}}}
|
||||
\@writefile{nav}{\headcommand {\slideentry {2}{0}{9}{9/9}{}{0}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@framepages {9}{9}}}
|
||||
\@writefile{toc}{\beamer@sectionintoc {3}{2. Aufgabe}{10}{0}{3}}
|
||||
\@writefile{nav}{\headcommand {\sectionentry {3}{2. Aufgabe}{10}{2. Aufgabe}{0}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@sectionpages {9}{9}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@subsectionpages {9}{9}}}
|
||||
\@writefile{nav}{\headcommand {\slideentry {3}{0}{10}{10/10}{}{0}}}
|
||||
\@writefile{nav}{\headcommand {\slideentry {2}{0}{10}{10/10}{}{0}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@framepages {10}{10}}}
|
||||
\@writefile{toc}{\beamer@sectionintoc {4}{2. Aufgabe}{11}{0}{4}}
|
||||
\@writefile{nav}{\headcommand {\sectionentry {4}{2. Aufgabe}{11}{2. Aufgabe}{0}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@sectionpages {10}{10}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@subsectionpages {10}{10}}}
|
||||
\@writefile{nav}{\headcommand {\slideentry {4}{0}{11}{11/11}{}{0}}}
|
||||
\@writefile{nav}{\headcommand {\slideentry {2}{0}{11}{11/11}{}{0}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@framepages {11}{11}}}
|
||||
\@writefile{nav}{\headcommand {\slideentry {4}{0}{12}{12/12}{}{0}}}
|
||||
\@writefile{nav}{\headcommand {\slideentry {2}{0}{12}{12/12}{}{0}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@framepages {12}{12}}}
|
||||
\@writefile{nav}{\headcommand {\slideentry {4}{0}{13}{13/13}{}{0}}}
|
||||
\@writefile{nav}{\headcommand {\slideentry {2}{0}{13}{13/13}{}{0}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@framepages {13}{13}}}
|
||||
\@writefile{nav}{\headcommand {\slideentry {4}{0}{14}{14/14}{}{0}}}
|
||||
\@writefile{nav}{\headcommand {\slideentry {2}{0}{14}{14/14}{}{0}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@framepages {14}{14}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@partpages {1}{14}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@subsectionpages {11}{14}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@sectionpages {11}{14}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@documentpages {14}}}
|
||||
\@writefile{nav}{\headcommand {\def \inserttotalframenumber {14}}}
|
||||
\@writefile{nav}{\headcommand {\slideentry {2}{0}{15}{15/15}{}{0}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@framepages {15}{15}}}
|
||||
\@writefile{nav}{\headcommand {\slideentry {2}{0}{16}{16/16}{}{0}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@framepages {16}{16}}}
|
||||
\@writefile{nav}{\headcommand {\slideentry {2}{0}{17}{17/17}{}{0}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@framepages {17}{17}}}
|
||||
\@writefile{nav}{\headcommand {\slideentry {2}{0}{18}{18/18}{}{0}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@framepages {18}{18}}}
|
||||
\@writefile{nav}{\headcommand {\slideentry {2}{0}{19}{19/19}{}{0}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@framepages {19}{19}}}
|
||||
\@writefile{nav}{\headcommand {\slideentry {2}{0}{20}{20/20}{}{0}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@framepages {20}{20}}}
|
||||
\@writefile{nav}{\headcommand {\slideentry {2}{0}{21}{21/21}{}{0}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@framepages {21}{21}}}
|
||||
\@writefile{nav}{\headcommand {\slideentry {2}{0}{22}{22/22}{}{0}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@framepages {22}{22}}}
|
||||
\@writefile{nav}{\headcommand {\slideentry {2}{0}{23}{23/23}{}{0}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@framepages {23}{23}}}
|
||||
\@writefile{nav}{\headcommand {\slideentry {2}{0}{24}{24/24}{}{0}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@framepages {24}{24}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@partpages {1}{24}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@subsectionpages {9}{24}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@sectionpages {9}{24}}}
|
||||
\@writefile{nav}{\headcommand {\beamer@documentpages {24}}}
|
||||
\@writefile{nav}{\headcommand {\def \inserttotalframenumber {24}}}
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
This is pdfTeX, Version 3.1415926-2.3-1.40.12 (TeX Live 2011) (format=pdflatex 2011.7.3) 19 MAY 2013 18:14
|
||||
This is pdfTeX, Version 3.1415926-2.3-1.40.12 (TeX Live 2011) (format=pdflatex 2011.7.3) 19 MAY 2013 22:19
|
||||
entering extended mode
|
||||
restricted \write18 enabled.
|
||||
%&-line parsing enabled.
|
||||
@ -1455,7 +1455,7 @@ Underfull \hbox (badness 10000) has occurred while \output is active
|
||||
|
||||
[2
|
||||
|
||||
] <grafiken/a1_abb1.png, id=47, 330.23375pt x 531.9875pt>
|
||||
] <grafiken/a1_abb1.png, id=39, 330.23375pt x 531.9875pt>
|
||||
File: grafiken/a1_abb1.png Graphic file (type png)
|
||||
|
||||
<use grafiken/a1_abb1.png>
|
||||
@ -1481,7 +1481,7 @@ Underfull \hbox (badness 10000) has occurred while \output is active
|
||||
|
||||
[4
|
||||
|
||||
] <grafiken/a1_abb2.png, id=58, 614.295pt x 131.49126pt>
|
||||
] <grafiken/a1_abb2.png, id=50, 614.295pt x 131.49126pt>
|
||||
File: grafiken/a1_abb2.png Graphic file (type png)
|
||||
|
||||
<use grafiken/a1_abb2.png>
|
||||
@ -1550,20 +1550,172 @@ Underfull \hbox (badness 10000) has occurred while \output is active
|
||||
|
||||
[12
|
||||
|
||||
]
|
||||
] <grafiken/a2_spiegel_1.png, id=96, 578.16pt x 433.62pt>
|
||||
File: grafiken/a2_spiegel_1.png Graphic file (type png)
|
||||
|
||||
<use grafiken/a2_spiegel_1.png>
|
||||
Package pdftex.def Info: grafiken/a2_spiegel_1.png used on input line 171.
|
||||
(pdftex.def) Requested size: 208.65793pt x 156.49014pt.
|
||||
|
||||
Overfull \vbox (22.25151pt too high) detected at line 171
|
||||
[]
|
||||
|
||||
|
||||
Underfull \hbox (badness 10000) has occurred while \output is active
|
||||
|
||||
[]
|
||||
|
||||
[13
|
||||
|
||||
]
|
||||
<./grafiken/a2_spiegel_1.png>]
|
||||
<grafiken/a2_spiegel_2.png, id=105, 578.16pt x 433.62pt>
|
||||
File: grafiken/a2_spiegel_2.png Graphic file (type png)
|
||||
|
||||
<use grafiken/a2_spiegel_2.png>
|
||||
Package pdftex.def Info: grafiken/a2_spiegel_2.png used on input line 180.
|
||||
(pdftex.def) Requested size: 208.65793pt x 156.49014pt.
|
||||
|
||||
Overfull \vbox (22.25151pt too high) detected at line 180
|
||||
[]
|
||||
|
||||
|
||||
Underfull \hbox (badness 10000) has occurred while \output is active
|
||||
|
||||
[]
|
||||
|
||||
[14
|
||||
|
||||
<./grafiken/a2_spiegel_2.png>]
|
||||
Underfull \hbox (badness 10000) has occurred while \output is active
|
||||
|
||||
[]
|
||||
|
||||
[15
|
||||
|
||||
] <grafiken/a2_spiegel_3.png, id=117, 578.16pt x 433.62pt>
|
||||
File: grafiken/a2_spiegel_3.png Graphic file (type png)
|
||||
|
||||
<use grafiken/a2_spiegel_3.png>
|
||||
Package pdftex.def Info: grafiken/a2_spiegel_3.png used on input line 196.
|
||||
(pdftex.def) Requested size: 208.65793pt x 156.49014pt.
|
||||
|
||||
Overfull \vbox (22.25151pt too high) detected at line 196
|
||||
[]
|
||||
|
||||
|
||||
Underfull \hbox (badness 10000) has occurred while \output is active
|
||||
|
||||
[]
|
||||
|
||||
[16
|
||||
|
||||
<./grafiken/a2_spiegel_3.png>]
|
||||
<grafiken/a2_spiegel_4.png, id=124, 578.16pt x 433.62pt>
|
||||
File: grafiken/a2_spiegel_4.png Graphic file (type png)
|
||||
|
||||
<use grafiken/a2_spiegel_4.png>
|
||||
Package pdftex.def Info: grafiken/a2_spiegel_4.png used on input line 205.
|
||||
(pdftex.def) Requested size: 208.65793pt x 156.49014pt.
|
||||
|
||||
Overfull \vbox (22.25151pt too high) detected at line 205
|
||||
[]
|
||||
|
||||
|
||||
Underfull \hbox (badness 10000) has occurred while \output is active
|
||||
|
||||
[]
|
||||
|
||||
[17
|
||||
|
||||
<./grafiken/a2_spiegel_4.png>]
|
||||
Underfull \hbox (badness 10000) has occurred while \output is active
|
||||
|
||||
[]
|
||||
|
||||
[18
|
||||
|
||||
]
|
||||
Underfull \hbox (badness 10000) has occurred while \output is active
|
||||
|
||||
[]
|
||||
|
||||
[19
|
||||
|
||||
] <grafiken/a2_tu_1.png, id=142, 578.16pt x 433.62pt>
|
||||
File: grafiken/a2_tu_1.png Graphic file (type png)
|
||||
|
||||
<use grafiken/a2_tu_1.png>
|
||||
Package pdftex.def Info: grafiken/a2_tu_1.png used on input line 236.
|
||||
(pdftex.def) Requested size: 208.65793pt x 156.49014pt.
|
||||
|
||||
Overfull \vbox (22.25151pt too high) detected at line 236
|
||||
[]
|
||||
|
||||
|
||||
Underfull \hbox (badness 10000) has occurred while \output is active
|
||||
|
||||
[]
|
||||
|
||||
[20
|
||||
|
||||
<./grafiken/a2_tu_1.png>]
|
||||
<grafiken/a2_tu_2.png, id=149, 578.16pt x 433.62pt>
|
||||
File: grafiken/a2_tu_2.png Graphic file (type png)
|
||||
<use grafiken/a2_tu_2.png>
|
||||
Package pdftex.def Info: grafiken/a2_tu_2.png used on input line 245.
|
||||
(pdftex.def) Requested size: 208.65793pt x 156.49014pt.
|
||||
|
||||
Overfull \vbox (22.25151pt too high) detected at line 245
|
||||
[]
|
||||
|
||||
|
||||
Underfull \hbox (badness 10000) has occurred while \output is active
|
||||
|
||||
[]
|
||||
|
||||
[21
|
||||
|
||||
<./grafiken/a2_tu_2.png>]
|
||||
<grafiken/a2_tu_3.png, id=156, 578.16pt x 433.62pt>
|
||||
File: grafiken/a2_tu_3.png Graphic file (type png)
|
||||
<use grafiken/a2_tu_3.png>
|
||||
Package pdftex.def Info: grafiken/a2_tu_3.png used on input line 254.
|
||||
(pdftex.def) Requested size: 208.65793pt x 156.49014pt.
|
||||
|
||||
Overfull \vbox (22.25151pt too high) detected at line 254
|
||||
[]
|
||||
|
||||
|
||||
Underfull \hbox (badness 10000) has occurred while \output is active
|
||||
|
||||
[]
|
||||
|
||||
[22
|
||||
|
||||
<./grafiken/a2_tu_3.png>]
|
||||
<grafiken/a2_tu_4.png, id=163, 578.16pt x 433.62pt>
|
||||
File: grafiken/a2_tu_4.png Graphic file (type png)
|
||||
<use grafiken/a2_tu_4.png>
|
||||
Package pdftex.def Info: grafiken/a2_tu_4.png used on input line 263.
|
||||
(pdftex.def) Requested size: 208.65793pt x 156.49014pt.
|
||||
|
||||
Overfull \vbox (22.25151pt too high) detected at line 263
|
||||
[]
|
||||
|
||||
|
||||
Underfull \hbox (badness 10000) has occurred while \output is active
|
||||
|
||||
[]
|
||||
|
||||
[23
|
||||
|
||||
<./grafiken/a2_tu_4.png>]
|
||||
Underfull \hbox (badness 10000) has occurred while \output is active
|
||||
|
||||
[]
|
||||
|
||||
[24
|
||||
|
||||
]
|
||||
\tf@nav=\write7
|
||||
\openout7 = `solution.nav'.
|
||||
@ -1574,20 +1726,20 @@ Underfull \hbox (badness 10000) has occurred while \output is active
|
||||
\tf@snm=\write9
|
||||
\openout9 = `solution.snm'.
|
||||
|
||||
Package atveryend Info: Empty hook `BeforeClearDocument' on input line 172.
|
||||
Package atveryend Info: Empty hook `AfterLastShipout' on input line 172.
|
||||
Package atveryend Info: Empty hook `BeforeClearDocument' on input line 278.
|
||||
Package atveryend Info: Empty hook `AfterLastShipout' on input line 278.
|
||||
(./solution.aux)
|
||||
Package atveryend Info: Empty hook `AtVeryEndDocument' on input line 172.
|
||||
Package atveryend Info: Executing hook `AtEndAfterFileList' on input line 172.
|
||||
Package atveryend Info: Empty hook `AtVeryEndDocument' on input line 278.
|
||||
Package atveryend Info: Executing hook `AtEndAfterFileList' on input line 278.
|
||||
Package rerunfilecheck Info: File `solution.out' has not changed.
|
||||
(rerunfilecheck) Checksum: 88D911AA5795ABD0722131B6C5D24A75;180.
|
||||
Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 172.
|
||||
(rerunfilecheck) Checksum: AE5CCE897D490A137427F55C345E5A34;90.
|
||||
Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 278.
|
||||
)
|
||||
Here is how much of TeX's memory you used:
|
||||
17220 strings out of 493633
|
||||
315045 string characters out of 3143378
|
||||
391807 words of memory out of 3000000
|
||||
20016 multiletter control sequences out of 15000+200000
|
||||
17292 strings out of 493633
|
||||
316299 string characters out of 3143378
|
||||
391806 words of memory out of 3000000
|
||||
20062 multiletter control sequences out of 15000+200000
|
||||
39628 words of font info for 52 fonts, out of 3000000 for 9000
|
||||
831 hyphenation exceptions out of 8191
|
||||
55i,20n,79p,425b,533s stack positions out of 5000i,500n,10000p,200000b,50000s
|
||||
@ -1601,10 +1753,10 @@ texlive/2011/texmf-dist/fonts/type1/urw/helvetic/uhvb8a.pfb></usr/local/texlive
|
||||
/2011/texmf-dist/fonts/type1/urw/helvetic/uhvbo8a.pfb></usr/local/texlive/2011/
|
||||
texmf-dist/fonts/type1/urw/helvetic/uhvr8a.pfb></usr/local/texlive/2011/texmf-d
|
||||
ist/fonts/type1/urw/helvetic/uhvro8a.pfb>
|
||||
Output written on solution.pdf (14 pages, 146011 bytes).
|
||||
Output written on solution.pdf (24 pages, 351323 bytes).
|
||||
PDF statistics:
|
||||
156 PDF objects out of 1000 (max. 8388607)
|
||||
122 compressed objects within 2 object streams
|
||||
33 named destinations out of 1000 (max. 500000)
|
||||
80 words of extra memory for PDF output out of 10000 (max. 10000000)
|
||||
221 PDF objects out of 1000 (max. 8388607)
|
||||
161 compressed objects within 2 object streams
|
||||
51 named destinations out of 1000 (max. 500000)
|
||||
104 words of extra memory for PDF output out of 10000 (max. 10000000)
|
||||
|
||||
|
||||
@ -23,24 +23,38 @@
|
||||
\headcommand {\beamer@subsectionpages {2}{8}}
|
||||
\headcommand {\slideentry {2}{0}{9}{9/9}{}{0}}
|
||||
\headcommand {\beamer@framepages {9}{9}}
|
||||
\headcommand {\sectionentry {3}{2. Aufgabe}{10}{2. Aufgabe}{0}}
|
||||
\headcommand {\beamer@sectionpages {9}{9}}
|
||||
\headcommand {\beamer@subsectionpages {9}{9}}
|
||||
\headcommand {\slideentry {3}{0}{10}{10/10}{}{0}}
|
||||
\headcommand {\slideentry {2}{0}{10}{10/10}{}{0}}
|
||||
\headcommand {\beamer@framepages {10}{10}}
|
||||
\headcommand {\sectionentry {4}{2. Aufgabe}{11}{2. Aufgabe}{0}}
|
||||
\headcommand {\beamer@sectionpages {10}{10}}
|
||||
\headcommand {\beamer@subsectionpages {10}{10}}
|
||||
\headcommand {\slideentry {4}{0}{11}{11/11}{}{0}}
|
||||
\headcommand {\slideentry {2}{0}{11}{11/11}{}{0}}
|
||||
\headcommand {\beamer@framepages {11}{11}}
|
||||
\headcommand {\slideentry {4}{0}{12}{12/12}{}{0}}
|
||||
\headcommand {\slideentry {2}{0}{12}{12/12}{}{0}}
|
||||
\headcommand {\beamer@framepages {12}{12}}
|
||||
\headcommand {\slideentry {4}{0}{13}{13/13}{}{0}}
|
||||
\headcommand {\slideentry {2}{0}{13}{13/13}{}{0}}
|
||||
\headcommand {\beamer@framepages {13}{13}}
|
||||
\headcommand {\slideentry {4}{0}{14}{14/14}{}{0}}
|
||||
\headcommand {\slideentry {2}{0}{14}{14/14}{}{0}}
|
||||
\headcommand {\beamer@framepages {14}{14}}
|
||||
\headcommand {\beamer@partpages {1}{14}}
|
||||
\headcommand {\beamer@subsectionpages {11}{14}}
|
||||
\headcommand {\beamer@sectionpages {11}{14}}
|
||||
\headcommand {\beamer@documentpages {14}}
|
||||
\headcommand {\def \inserttotalframenumber {14}}
|
||||
\headcommand {\slideentry {2}{0}{15}{15/15}{}{0}}
|
||||
\headcommand {\beamer@framepages {15}{15}}
|
||||
\headcommand {\slideentry {2}{0}{16}{16/16}{}{0}}
|
||||
\headcommand {\beamer@framepages {16}{16}}
|
||||
\headcommand {\slideentry {2}{0}{17}{17/17}{}{0}}
|
||||
\headcommand {\beamer@framepages {17}{17}}
|
||||
\headcommand {\slideentry {2}{0}{18}{18/18}{}{0}}
|
||||
\headcommand {\beamer@framepages {18}{18}}
|
||||
\headcommand {\slideentry {2}{0}{19}{19/19}{}{0}}
|
||||
\headcommand {\beamer@framepages {19}{19}}
|
||||
\headcommand {\slideentry {2}{0}{20}{20/20}{}{0}}
|
||||
\headcommand {\beamer@framepages {20}{20}}
|
||||
\headcommand {\slideentry {2}{0}{21}{21/21}{}{0}}
|
||||
\headcommand {\beamer@framepages {21}{21}}
|
||||
\headcommand {\slideentry {2}{0}{22}{22/22}{}{0}}
|
||||
\headcommand {\beamer@framepages {22}{22}}
|
||||
\headcommand {\slideentry {2}{0}{23}{23/23}{}{0}}
|
||||
\headcommand {\beamer@framepages {23}{23}}
|
||||
\headcommand {\slideentry {2}{0}{24}{24/24}{}{0}}
|
||||
\headcommand {\beamer@framepages {24}{24}}
|
||||
\headcommand {\beamer@partpages {1}{24}}
|
||||
\headcommand {\beamer@subsectionpages {9}{24}}
|
||||
\headcommand {\beamer@sectionpages {9}{24}}
|
||||
\headcommand {\beamer@documentpages {24}}
|
||||
\headcommand {\def \inserttotalframenumber {24}}
|
||||
|
||||
@ -1,4 +1,2 @@
|
||||
\BOOKMARK [2][]{Outline0.1}{1. Aufgabe}{}% 1
|
||||
\BOOKMARK [2][]{Outline0.2}{2. Aufgabe}{}% 2
|
||||
\BOOKMARK [2][]{Outline0.3}{2. Aufgabe}{}% 3
|
||||
\BOOKMARK [2][]{Outline0.4}{2. Aufgabe}{}% 4
|
||||
|
||||
@ -80,7 +80,7 @@ $p_{expected(lp, lang) \approx \frac{1}{i * ln(1,78 * N)}}$
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}
|
||||
\frametitle{1. Aufgabe \\ Firefox-Plugin}
|
||||
\frametitle{1. Aufgabe \\ Firefox-Plugin}
|
||||
\begin{itemize}
|
||||
\item Häufigkeiten der Buchstaben bzw. Buchstabenpaare ($n_{text(l)}$) relativ zur Gesamtanzahl ($n_{text}$): \\
|
||||
\begin{center}
|
||||
@ -117,7 +117,7 @@ $MSE(lang) = \frac{\sum_{l}(\tilde{p}_{text}(l) - p_{expected(lp, lang)})^2}{n_{
|
||||
% % % % % % % % % % % % % % % % % % % % % % %% % % % % % % % % % % % % % % % % % % % % % %% % % % % % % % % % % % % % % % % % % % % % % %
|
||||
\section{2. Aufgabe}
|
||||
\begin{frame}
|
||||
\frametitle{2. Aufgabe \\ Crawler}
|
||||
\frametitle{2. und 3. Aufgabe \\ Crawler}
|
||||
\textbf{Verfahren zur Erkennung von Duplikaten:} \\
|
||||
\textbf{1.} Alle Wörter mit einer Länge von 4 und kleiner 11 werden von der Webseite extrahiert.
|
||||
\begin{itemize}
|
||||
@ -127,9 +127,9 @@ $MSE(lang) = \frac{\sum_{l}(\tilde{p}_{text}(l) - p_{expected(lp, lang)})^2}{n_{
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\section{2. Aufgabe}
|
||||
|
||||
\begin{frame}
|
||||
\frametitle{2. Aufgabe \\ Crawler}
|
||||
\frametitle{2. und 3. Aufgabe \\ Crawler}
|
||||
\textbf{2.} Eine Zuweisung von Wörtern zu deren Auftrittshäufigkeit wird angefertigt \\
|
||||
\textbf{3.} Für alle paarweise verschiedenen Seiten werden die Auftrittshäufigkeiten subtrahiert, so dass deren Ergebnis minimal null ergibt. Zudem werden alle Wörter, die auf einer, aber nicht auf der anderen Seite vorkommen, ebenfalls der anderen Seite zugewiesen
|
||||
\begin{itemize}
|
||||
@ -137,29 +137,135 @@ $MSE(lang) = \frac{\sum_{l}(\tilde{p}_{text}(l) - p_{expected(lp, lang)})^2}{n_{
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\section{2. Aufgabe}
|
||||
|
||||
\begin{frame}
|
||||
\frametitle{2. Aufgabe \\ Crawler}
|
||||
\frametitle{2. und 3. Aufgabe \\ Crawler}
|
||||
\textbf{4.} Anschließend wird die resultierende Gesamtzahl an Wortvorkommnissen durch die Anzahl der Wortvorkommnisse vor der Subtraktion geteilt. Fällt dieser Wert unter eine definierte Grenze, gilt die Seite als Duplikat. \\
|
||||
\begin{itemize}
|
||||
\item Im Code ist anstatt einer Untergrenze eine Obergrenze von 90\% angegeben, die Berechnung wurde also umgekehrt, so dass hohe Werte eine hohe Duplikatswahrscheinlichkeit implizieren.
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
|
||||
|
||||
\begin{frame}
|
||||
\frametitle{2. Aufgabe \\ Crawler}
|
||||
Histogramm über die Anzahl der URLs pro Seite (wie beim ersten Übungsblatt mit Worthäufigkeiten, auch logarithmisch)
|
||||
\frametitle{2. und 3. Aufgabe \\ Crawler}
|
||||
\textbf{Startseite:} \\
|
||||
http://www.spiegel.de/ \\
|
||||
Es wurden 1000 Seiten besucht. \\
|
||||
\vspace{1cm}
|
||||
\textbf{Erkannte Sprachen:}
|
||||
\begin{center}
|
||||
de $\to$ 623 \\
|
||||
en $\to$ 246 \\
|
||||
es $\to$ 131 \\
|
||||
\end{center}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}
|
||||
\frametitle{2. Aufgabe \\ Crawler}
|
||||
Histogramm mit den Häufigkeiten des Auftretens von Hyperlinks, d.h. wie viele Links treten 1-mal, 2-mal, ... auftreten ...
|
||||
\frametitle{2. und 3. Aufgabe \\ Crawler}
|
||||
\begin{figure}
|
||||
\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_spiegel_1.png}
|
||||
\caption{Anzahl URLs pro Seite}
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
|
||||
\begin{frame}
|
||||
\frametitle{2. und 3. Aufgabe \\ Crawler}
|
||||
\begin{figure}
|
||||
\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_spiegel_2.png}
|
||||
\caption{Anzahl URLs pro Seite (logarithmisch)}
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}
|
||||
\frametitle{2. und 3. Aufgabe \\ Crawler}
|
||||
\begin{itemize}
|
||||
\item Viele Internetseiten verweisen auf wenige andere Internetseiten
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
|
||||
\begin{frame}
|
||||
\frametitle{2. und 3. Aufgabe \\ Crawler}
|
||||
\begin{figure}
|
||||
\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_spiegel_3.png}
|
||||
\caption{Häufigkeiten des Auftretens von Hyperlinks}
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
|
||||
\begin{frame}
|
||||
\frametitle{2. und 3. Aufgabe \\ Crawler}
|
||||
\begin{figure}
|
||||
\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_spiegel_4.png}
|
||||
\caption{Häufigkeiten des Auftretens von Hyperlinks (logarithmisch)}
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}
|
||||
\frametitle{2. und 3. Aufgabe \\ Crawler}
|
||||
\begin{itemize}
|
||||
\item Es gibt nur wenige Internetseiten die oft referenziert werden.
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
|
||||
\begin{frame}
|
||||
\frametitle{2. und 3. Aufgabe \\ Crawler}
|
||||
\textbf{Startseite:} \\
|
||||
http://www.ke.tu-darmstadt.de/lehre/arbeiten \\
|
||||
Crawler hat nur Seiten innerhalb der TU Darmstadt der Form *.tu.darmstadt.de besucht.
|
||||
Es wurden 1000 Seiten besucht. \\
|
||||
\vspace{1cm}
|
||||
\textbf{Erkannte Sprachen:}
|
||||
\begin{center}
|
||||
de $\to$ 329 \\
|
||||
en $\to$ 576 \\
|
||||
es $\to$ 95 \\
|
||||
\end{center}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}
|
||||
\frametitle{2. und 3. Aufgabe \\ Crawler}
|
||||
\begin{figure}
|
||||
\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_tu_1.png}
|
||||
\caption{Anzahl URLs pro Seite}
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
|
||||
\begin{frame}
|
||||
\frametitle{2. und 3. Aufgabe \\ Crawler}
|
||||
\begin{figure}
|
||||
\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_tu_2.png}
|
||||
\caption{Anzahl URLs pro Seite (logarithmisch)}
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
|
||||
\begin{frame}
|
||||
\frametitle{2. und 3. Aufgabe \\ Crawler}
|
||||
\begin{figure}
|
||||
\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_tu_3.png}
|
||||
\caption{Häufigkeiten des Auftretens von Hyperlinks}
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
|
||||
\begin{frame}
|
||||
\frametitle{2. und 3. Aufgabe \\ Crawler}
|
||||
\begin{figure}
|
||||
\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_tu_4.png}
|
||||
\caption{Häufigkeiten des Auftretens von Hyperlinks (logarithmisch)}
|
||||
\end{figure}
|
||||
\end{frame}
|
||||
|
||||
|
||||
|
||||
\begin{frame}
|
||||
\frametitle{2. Aufgabe \\ Crawler}
|
||||
\frametitle{2. und 3. Aufgabe \\ Crawler}
|
||||
\textbf{Erfahrungen und Probleme:}
|
||||
\begin{itemize}
|
||||
\item Findet man einen Onlineshop, so wird die Queue mit sehr vielen Links dieses Shops gefüllt und der Crawler besucht mit sehr hoher Wahrscheinlichkeit nur noch URLs innerhalb des Shops.
|
||||
|
||||
@ -2,5 +2,3 @@
|
||||
\select@language {ngerman}
|
||||
\beamer@sectionintoc {1}{1. Aufgabe}{2}{0}{1}
|
||||
\beamer@sectionintoc {2}{2. Aufgabe}{9}{0}{2}
|
||||
\beamer@sectionintoc {3}{2. Aufgabe}{10}{0}{3}
|
||||
\beamer@sectionintoc {4}{2. Aufgabe}{11}{0}{4}
|
||||
|
||||
379
ss2013/1_Web Mining/Uebungen/2_Uebung/results.rtf
Normal file
@ -0,0 +1,379 @@
|
||||
{\rtf1\ansi\ansicpg1252\cocoartf1187\cocoasubrtf370
|
||||
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
|
||||
{\colortbl;\red255\green255\blue255;}
|
||||
\paperw11900\paperh16840\margl1440\margr1440\vieww16900\viewh8400\viewkind0
|
||||
\pard\tx566\tx1133\tx1700\tx2267\tx2834\tx3401\tx3968\tx4535\tx5102\tx5669\tx6236\tx6803\pardirnatural
|
||||
|
||||
\f0\b\fs24 \cf0 entrypoint: google.de:
|
||||
\b0 \
|
||||
\
|
||||
==== robots.txt ====\
|
||||
prohibit by robots.txt: 172\
|
||||
\
|
||||
\
|
||||
==== numberHyperlink ====\
|
||||
#Hyperlinks Website\
|
||||
19 http://www.blogger.com/profile/05109496878476775729\
|
||||
19 http://www.google.de/history/optout?hl=de\
|
||||
18 http://www.google.de/intl/de/options/\
|
||||
8 http://www.vovone.com/company/profile/\
|
||||
8 http://www.vovone.com/more/solutions/\
|
||||
8 http://www.vovone.com/company/partners/\
|
||||
7 http://www.google.de/intl/de/policies/privacy/\
|
||||
7 http://rocketsandsuch.blogspot.de/2009_08_01_archive.html\
|
||||
7 http://www.vovone.com/more/ssl-certificates/ssl-certificate-validation/\
|
||||
7 http://www.google.de/webhp?hl=de&tab=iw\
|
||||
7 http://www.vovone.com/company/ask-for-more/\
|
||||
6 http://www.vovone.com/domain-names/redirect-domain-name/\
|
||||
6 http://voice.google.com\
|
||||
6 http://www.vovone.com/support/f-a-q/\
|
||||
6 http://www.vovone.com/domain-names/domain-termination/\
|
||||
6 http://www.vovone.com/support/documentation/\
|
||||
6 http://www.vovone.com/company/careers/\
|
||||
5 http://www.vovone.com/discounts-offers/\
|
||||
5 http://www.google.com/press/blogs/directory.html#tab0\
|
||||
5 http://rocketsandsuch.blogspot.de/2008_03_01_archive.html\
|
||||
5 http://rocketsandsuch.blogspot.de/2009_09_01_archive.html\
|
||||
5 http://www.vovone.com/servers/\
|
||||
5 http://rocketsandsuch.blogspot.de/2009_01_01_archive.html\
|
||||
5 http://www.vovone.com\
|
||||
5 http://fusion.google.com/add?source=atgs&feedurl=http%3a//feeds.feedburner.com/googleappsupdates\
|
||||
5 http://www.vovone.com/more/solutions/service-level-agreements/\
|
||||
5 http://www.vovone.com/support/\
|
||||
5 http://www.vovone.com/servers/managed-servers/\
|
||||
5 http://rocketsandsuch.blogspot.de/2008_10_01_archive.html\
|
||||
5 http://rocketsandsuch.blogspot.de/2009_10_01_archive.html\
|
||||
5 http://feedburner.google.com/fb/a/mailverify?uri=googleappsupdates&loc=en_us\
|
||||
5 http://www.vovone.com/more/reseller-plans/affiliate-plan/\
|
||||
5 http://www.vovone.com/more/ssl-certificates/ssl-certificate-type/\
|
||||
5 http://blog.chromium.org/\
|
||||
5 http://www.vovone.com/company/conditions/notice-and-take-down/\
|
||||
5 http://www.vovone.com/more/ssl-certificates/ssl-certificates-brand/\
|
||||
4 http://www.vovone.com/more/colocation/private-rackspace/\
|
||||
4 http://www.vovone.com/more/ssl-certificates/\
|
||||
4 http://www.google.de/setprefdomain?prefdom=us&sig=0_h0pay1e5n4pq04s4m5soth6xqlk%3d\
|
||||
4 http://www.vovone.com/company/technology/security/\
|
||||
4 http://rocketsandsuch.blogspot.de/search?updated-min=2007-01-01t00:00:00-08:00&updated-max=2008-01-01t00:00:00-08:00&max-results=50\
|
||||
4 http://www.google.de/setprefdomain?prefdom=us&sig=0_bbxqe3gzyewbwv2egvfk2cujk3w%3d\
|
||||
4 http://www.vovone.com/more/\
|
||||
4 http://www.vovone.com/web-hosting/special-plans/special-plans-magento-hosting/\
|
||||
4 http://www.vovone.com/more/colocation/shared-rackspace/\
|
||||
4 http://www.vovone.com/company/conditions/\
|
||||
4 http://www.vovone.com/more/solutions/managed-services/\
|
||||
4 http://mail.google.com\
|
||||
4 http://rocketsandsuch.blogspot.de/2008/10/hubble-bubble-toil-and-trouble.html\
|
||||
4 http://www.vovone.com/servers/vps/vps-plan-8192/\
|
||||
\
|
||||
\
|
||||
==== numberHyperlinksPerPage ====\
|
||||
#HyperlinksToPage Website\
|
||||
9088 javascript:void(0)\
|
||||
1867 #\
|
||||
898 javascript:;\
|
||||
522 http://www.blogger.com/profile/05109496878476775729\
|
||||
392 http://www.vovone.com\
|
||||
348 /\
|
||||
347 http://www.blogger.com/profile/09046869427384152063\
|
||||
317 \
|
||||
301 http://www.vovone.com/support/\
|
||||
298 https://my.vovone.com\
|
||||
295 http://www.vovone.com/company/careers/\
|
||||
272 http://feedburner.google.com/fb/a/mailverify?uri=GoogleAppsUpdates&loc=en_US\
|
||||
270 http://fusion.google.com/add?source=atgs&feedurl=http%3A//feeds.feedburner.com/GoogleAppsUpdates\
|
||||
256 the-button-element.html#concept-fe-value\
|
||||
242 http://www.blogger.com/profile/06992649719432295652\
|
||||
221 http://www.vovone.com/servers/\
|
||||
220 the-input-element.html#the-input-element\
|
||||
216 http://www.vovone.com/company/\
|
||||
206 http://www.vovone.com/web-hosting/\
|
||||
206 http://www.vovone.com/more/colocation/\
|
||||
205 http://www.vovone.com/more/ssl-certificates/\
|
||||
205 http://www.vovone.com/servers/dedicated-servers/\
|
||||
204 http://www.vovone.com/more/colocation/private-rackspace/\
|
||||
203 http://www.vovone.com/more/solutions/\
|
||||
203 http://www.vovone.com/company/technology/\
|
||||
203 http://www.vovone.com/servers/managed-servers/\
|
||||
203 http://www.vovone.com/domain-names/\
|
||||
202 http://www.vovone.com/voip-services/\
|
||||
202 http://www.vovone.com/company/conditions/\
|
||||
201 http://www.vovone.com/more/reseller-plans/\
|
||||
201 http://www.vovone.com/voip-services/cloud-voip/\
|
||||
200 http://www.vovone.com/company/promise/\
|
||||
200 http://www.vovone.com/voip-services/voip-accounts/\
|
||||
200 http://www.vovone.com/domain-names/domain-termination/\
|
||||
200 http://www.vovone.com/more/ssl-certificates/ssl-certificate-type/\
|
||||
200 http://www.vovone.com/domain-names/transfer-domain-name/\
|
||||
200 http://www.vovone.com/company/profile/\
|
||||
199 http://www.vovone.com/more/solutions/service-level-agreements/\
|
||||
199 http://www.vovone.com/more/solutions/managed-services/\
|
||||
199 http://www.vovone.com/support/documentation/\
|
||||
199 http://www.vovone.com/voip-services/business-voip/\
|
||||
199 http://www.vovone.com/more/ssl-certificates/ssl-certificate-validation/\
|
||||
199 http://www.vovone.com/more/ssl-certificates/ssl-certificates-brand/\
|
||||
199 http://www.vovone.com/more/colocation/shared-rackspace/\
|
||||
199 http://www.vovone.com/more/reseller-plans/affiliate-plan/\
|
||||
199 http://www.vovone.com/support/f-a-q/\
|
||||
198 http://www.vovone.com/support/support-desk/\
|
||||
198 http://www.vovone.com/voip-services/wholesale-voip/\
|
||||
197 http://www.vovone.com/domain-names/redirect-domain-name/\
|
||||
197 http://www.vovone.com/company/press/\
|
||||
\
|
||||
\
|
||||
\
|
||||
|
||||
\b entrypoint http://www.ke.tu-darmstadt.de/lehre/arbeiten:
|
||||
\b0 \
|
||||
\
|
||||
==== robots.txt ====\
|
||||
prohibit by robots.txt: 4\
|
||||
\
|
||||
\
|
||||
==== numberHyperlink ====\
|
||||
#Hyperlinks Website\
|
||||
405 http://www.ke.tu-darmstadt.de/bibtex/authors/show/1625\
|
||||
120 http://www.ke.tu-darmstadt.de/bibtex/authors/show/1677\
|
||||
107 http://www.tu-darmstadt.de\
|
||||
77 http://www.informatik.tu-darmstadt.de\
|
||||
71 http://www.ke.tu-darmstadt.de\
|
||||
61 http://www.ke.tu-darmstadt.de/bibtex/authors/show/875\
|
||||
46 http://www.ke.tu-darmstadt.de/lehre\
|
||||
46 http://www.ke.tu-darmstadt.de/news\
|
||||
41 http://www.ke.tu-darmstadt.de/bibtex/authors/show/708\
|
||||
41 http://www.ke.tu-darmstadt.de/bibtex/search\
|
||||
40 http://www.ke.tu-darmstadt.de/de/studierende/studienbuero/ansprechpartner-studienbuero/\
|
||||
40 http://www.ke.tu-darmstadt.de/bibtex/export\
|
||||
39 http://www.informatik.tu-darmstadt.de/de/aktuelles/veranstaltungentermine/\
|
||||
39 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#proceedings\
|
||||
38 http://www.ke.tu-darmstadt.de/de/intern/index/\
|
||||
38 http://www.ke.tu-darmstadt.de/de/studierende/studiendekanat/ansprechpartner/\
|
||||
37 http://www.ke.tu-darmstadt.de/bibtex/publications\
|
||||
37 http://www.informatik.tu-darmstadt.de/de/aktuelles/neuigkeiten/neuigkeiten/artikel/vortrag-ueber-fitweltweit-programm-des-daad-1/\
|
||||
36 http://www.ke.tu-darmstadt.de/resources\
|
||||
36 http://www.ke.tu-darmstadt.de/bibtex/topics/single/77\
|
||||
36 http://www.ke.tu-darmstadt.de/bibtex/authors/show/1849\
|
||||
35 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type\
|
||||
34 http://www.informatik.tu-darmstadt.de/de/aktuelles/neuigkeiten/neuigkeiten/artikel/smarte-spione/\
|
||||
34 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/recent\
|
||||
33 http://www.ke.tu-darmstadt.de/de/fachbereich/dekanat/\
|
||||
33 http://www.ke.tu-darmstadt.de/de/fachbereich/bilder/absolventenfeier-november-2012/begruessung/\
|
||||
33 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#inproceedings\
|
||||
33 http://www.ke.tu-darmstadt.de/research\
|
||||
31 http://www.ke.tu-darmstadt.de/de/forschung/netzwerkpartner/\
|
||||
29 http://www.ke.tu-darmstadt.de/de/aktuelles/newsletter-an-und-abmeldung/\
|
||||
29 http://www.ke.tu-darmstadt.de/bibtex/authors/show/702\
|
||||
29 http://www.ke.tu-darmstadt.de/projects\
|
||||
29 http://www.ke.tu-darmstadt.de/bibtex/topics/single/33\
|
||||
29 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#incollection\
|
||||
28 http://www.informatik.tu-darmstadt.de/de/aktuelles/neuigkeiten/neuigkeiten/artikel/eine-kultur-der-privatsphaere-im-internet/\
|
||||
28 http://www.ke.tu-darmstadt.de/bibtex/topics\
|
||||
28 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#book\
|
||||
27 http://www.ke.tu-darmstadt.de/de/aktuelles/neuigkeiten/\
|
||||
26 http://www.ke.tu-darmstadt.de/bibtex/authors/show/3036\
|
||||
25 http://www.ke.tu-darmstadt.de/bibtex/authors/show/2370\
|
||||
24 http://www.ke.tu-darmstadt.de/de/aktuelles/preise-und-auszeichnungen/\
|
||||
24 http://www.ke.tu-darmstadt.de/staff\
|
||||
24 http://www.ke.tu-darmstadt.de/impressum\
|
||||
24 http://www.ke.tu-darmstadt.de/de/studierende/news-fuer-studierende/\
|
||||
24 http://www.ke.tu-darmstadt.de/publications\
|
||||
23 http://www.ke.tu-darmstadt.de/bibtex/authors/show/2365\
|
||||
23 http://www.ke.tu-darmstadt.de/termine\
|
||||
23 http://www.ke.tu-darmstadt.de/de/ehemalige/alumni-portal-der-tu-darmstadt/\
|
||||
23 http://www.ke.tu-darmstadt.de/de/ehemalige/\
|
||||
22 http://www.tu-darmstadt.de/\
|
||||
\
|
||||
\
|
||||
==== numberHyperlinksPerPage ====\
|
||||
#HyperlinksToPage Website\
|
||||
3528 http://www.ke.tu-darmstadt.de/bibtex/authors/show/1625\
|
||||
915 http://www.tu-darmstadt.de\
|
||||
904 http://www.ke.tu-darmstadt.de/bibtex/authors/show/1677\
|
||||
635 de/aktuelles/neuigkeiten/\
|
||||
577 de/fachbereich/dekanat/\
|
||||
575 de/fachbereich/bilder/absolventenfeier-november-2012/begruessung/\
|
||||
528 http://www.informatik.tu-darmstadt.de\
|
||||
499 http://www.ke.tu-darmstadt.de\
|
||||
490 de/aktuelles/newsletter-an-und-abmeldung/\
|
||||
482 de/forschung/netzwerkpartner/\
|
||||
481 http://www.ke.tu-darmstadt.de/bibtex/topics/single/33\
|
||||
474 de/studierende/studiendekanat/ansprechpartner/\
|
||||
468 de/studierende/studienbuero/ansprechpartner-studienbuero/\
|
||||
452 de/intern/index/\
|
||||
450 http://www.ke.tu-darmstadt.de/bibtex/authors/show/875\
|
||||
444 http://www.ke.tu-darmstadt.de/bibtex/topics/single/77\
|
||||
434 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/recent\
|
||||
434 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type\
|
||||
433 javascript:this.print()\
|
||||
429 javascript:fontsize('reset')\
|
||||
429 javascript:fontsize('inkrement')\
|
||||
429 javascript:fontsize('dekrement')\
|
||||
424 http://www.ke.tu-darmstadt.de/bibtex/search\
|
||||
424 http://www.ke.tu-darmstadt.de/bibtex/topics\
|
||||
424 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#Proceedings\
|
||||
424 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#Book\
|
||||
424 http://www.ke.tu-darmstadt.de/bibtex/publications\
|
||||
424 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#Inproceedings\
|
||||
424 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#Incollection\
|
||||
424 http://www.ke.tu-darmstadt.de/bibtex/export\
|
||||
412 de/aktuelles/neuigkeiten/neuigkeiten/artikel/smarte-spione/\
|
||||
408 de/aktuelles/neuigkeiten/neuigkeiten/artikel/eine-kultur-der-privatsphaere-im-internet/\
|
||||
408 de/aktuelles/neuigkeiten/neuigkeiten/artikel/vortrag-ueber-fitweltweit-programm-des-daad-1/\
|
||||
405 \
|
||||
382 http://www.ke.tu-darmstadt.de/bibtex/authors/show/708\
|
||||
369 de/fachbereich/\
|
||||
352 de/fachbereich/ehrungen-und-auszeichnungen/alwin-walther-medaille/\
|
||||
351 de/fachbereich/kontakt-und-anfahrt/\
|
||||
351 de/fachbereich/personen/\
|
||||
350 de/fachbereich/professuren-und-gruppenleitungen/\
|
||||
350 de/fachbereich/ueber-den-fachbereich/\
|
||||
350 de/fachbereich/ausschuesse-gremien-und-kommissionen/\
|
||||
349 http://www.informatik.tu-darmstadt.de/index.php?id=40\
|
||||
336 http://www.ke.tu-darmstadt.de/bibtex/authors/show/702\
|
||||
330 http://www.informatik.tu-darmstadt.de/index.php?id=1894\
|
||||
306 http://www.ke.tu-darmstadt.de/bibtex/authors/show/1849\
|
||||
302 http://www.ke.tu-darmstadt.de/news\
|
||||
298 de/tu/\
|
||||
277 http://www.tu-darmstadt.de/\
|
||||
264 #top\
|
||||
\
|
||||
\
|
||||
==== url queue ====\
|
||||
\
|
||||
\
|
||||
==== language distribution ====\
|
||||
Language Number of occurences\
|
||||
de 329\
|
||||
en 576\
|
||||
es 94\
|
||||
\
|
||||
\
|
||||
|
||||
\b entrypoint http://www.spiegel.de:\
|
||||
\
|
||||
\
|
||||
|
||||
\b0 ==== robots.txt ====\
|
||||
prohibit by robots.txt: 115\
|
||||
\
|
||||
\
|
||||
==== numberHyperlink ====\
|
||||
#Hyperlinks Website\
|
||||
43 https://www.amazon.de/b\
|
||||
38 http://www.amazon.de/spiegel\
|
||||
28 http://tv.adobe.com\
|
||||
21 http://tvprogramm.spiegel.de/\
|
||||
19 http://www.spiegel.de/\
|
||||
18 https://service.spiegel.de\
|
||||
18 http://www.spiegel.de/spiegel/spiegelgeschichte/index-2013-2.html\
|
||||
17 http://www.spiegel.de/spiegel/deinspiegel/index-2013-6.html\
|
||||
16 https://www.ebook.de/de/category/61110/unsere_vorteile.html\
|
||||
16 http://www.spiegel.de\
|
||||
15 http://www.spiegel.de/shop\
|
||||
14 http://www.shopbop.com/gp/help/customer/display.html\
|
||||
14 http://www.manager-magazin.de/\
|
||||
14 http://www.spiegel.de/spiegel/spiegelwissen/index-2013-2.html\
|
||||
13 http://www.spiegel.de/spiegel/\
|
||||
13 http://www.spiegel.de/wissenschaft/\
|
||||
12 http://wetter.spiegel.de/spiegel/\
|
||||
10 https://www.ebook.de/de/category/59475/kontakt_impressum.html\
|
||||
10 http://abo.spiegel.de/go/place!abosspsc\
|
||||
9 https://www.amazon.de/gp/cart/view.html\
|
||||
9 https://www.ebook.de/de/category/59424/hilfe.html\
|
||||
9 http://www.amazon.de/gp/feature.html\
|
||||
9 http://www.spiegel.de/sport/\
|
||||
9 https://media.libri.de/de/category/58974/sony_reader.html\
|
||||
9 http://www.spiegelgruppe-nachdrucke.de\
|
||||
9 https://www.ebook.de/de/category/61132/newsletter.html\
|
||||
9 http://www.spiegelwissen.tv/flashsite/index.html\
|
||||
8 http://www.spiegel.de/hilfe/\
|
||||
8 http://abo.spiegel.de/?et_cid=7&et_lid=1946&et_sub=heftkasten\
|
||||
8 https://www.amazon.es/b\
|
||||
8 https://www.ebook.de/de/category/59663/gutscheine_kaufen.html\
|
||||
8 https://www.ebook.de/de/category/52122/ebooks.html\
|
||||
8 http://www.spiegel.de/politik/\
|
||||
8 https://www.ebook.de/de/account/wishlist/add\
|
||||
8 https://www.amazon.de/pc-mac-downloads-herunterladen-digital-steam/b\
|
||||
8 http://www.spiegel.de/spiegel/unispiegel/\
|
||||
8 http://www.spiegel.de/unispiegel/studium/tools-hier-werden-sie-geholfen-a-640620.html\
|
||||
8 http://www.harvardbusinessmanager.de/\
|
||||
7 http://www.amazon.co.jp/\
|
||||
7 https://www.ebook.de/de/category/63461/ebooks_verschenken.html\
|
||||
7 https://www.ebook.de/de/category/browse\
|
||||
7 http://kdp.amazon.de/\
|
||||
7 http://abo.spiegel.de/?et_cid=7&et_lid=1946&et_sub=aboreiter\
|
||||
7 http://www.spiegel-qc.de/selbstbuchungstool\
|
||||
7 https://media.libri.de/de/category/52124/buecher.html\
|
||||
7 http://www.spiegel-qc.de/\
|
||||
7 https://www.ebook.de/de/magazine\
|
||||
7 https://www.ebook.de\
|
||||
7 http://www.spiegel.de/video/\
|
||||
7 http://www.libri.de/shop/action/magazine/6/ebooks_reader.html\
|
||||
\
|
||||
\
|
||||
==== numberHyperlinksPerPage ====\
|
||||
#HyperlinksToPage Website\
|
||||
6966 #\
|
||||
1507 /\
|
||||
1027 \
|
||||
961 http://www.amazon.de/spiegel\
|
||||
671 http://tv.adobe.com/product/photoshop/\
|
||||
640 \{\{url\}\}\
|
||||
640 /gp/digital/fiona/manage\
|
||||
598 javascript:void(0);\
|
||||
597 http://tv.adobe.com/product/cs-production-premium/\
|
||||
586 http://www.spiegel.de/\
|
||||
575 http://www.spiegel.de/spiegel/\
|
||||
509 http://wetter.spiegel.de/spiegel/\
|
||||
504 <#=item.url #>\
|
||||
492 http://www.spiegel.de/shop\
|
||||
468 http://www.spiegel.de/spiegel/spiegelwissen/index-2013-2.html\
|
||||
468 http://www.spiegel.de/spiegel/deinspiegel/index-2013-6.html\
|
||||
468 http://www.spiegel.de/spiegel/spiegelgeschichte/index-2013-2.html\
|
||||
462 /gp/site-directory\
|
||||
460 /gp/cart/view.html?ie=UTF8&hasWorkingJavascript=1\
|
||||
441 /gp/registry/wishlist\
|
||||
435 /clouddrive\
|
||||
411 http://www.spiegel.de/sptv/magazin/\
|
||||
385 /gp/prime\
|
||||
382 /product/photoshop/\
|
||||
352 /gp/dmusic/mp3/player\
|
||||
323 https://www.ebook.de/de/account/ebookHistory\
|
||||
316 https://www.ebook.de/de/account/create/singlestep\
|
||||
311 http://tv.adobe.com/product/cs-design-premium/\
|
||||
311 http://forum.spiegel.de/\
|
||||
310 /product/illustrator/\
|
||||
308 http://tv.adobe.com/product/creative-cloud/\
|
||||
303 http://www.spiegel.de/video/\
|
||||
303 http://www.spiegel-qc.de/\
|
||||
297 /video/\
|
||||
296 http://www.spiegel.de/schlagzeilen/\
|
||||
294 http://www.quality-abo.de/\
|
||||
293 http://www.spiegelgruppe.de/\
|
||||
293 http://www.buchreport.de/\
|
||||
288 http://www.spiegelgruppe-nachdrucke.de\
|
||||
277 /de/category/60575/libri_de_ist_jetzt_ebook_de.html\
|
||||
276 http://www.manager-magazin.de/\
|
||||
274 http://tv.adobe.com/product/premiere-pro/\
|
||||
267 http://tv.adobe.com/product/after-effects/\
|
||||
264 http://www.harvardbusinessmanager.de/\
|
||||
262 /product/premiere-pro/\
|
||||
260 /MP3-Musik-Downloads/b?ie=UTF8&node=77195031\
|
||||
260 http://tvprogramm.spiegel.de/\
|
||||
259 /pc-mac-downloads-herunterladen-digital-steam/b?ie=UTF8&node=1333619031\
|
||||
259 /spiegel/\
|
||||
256 /Navigationssystems-Car-HiFi-Autoradios/b?ie=UTF8&node=236861011\
|
||||
\
|
||||
\
|
||||
==== url queue ====\
|
||||
\
|
||||
\
|
||||
==== language distribution ====\
|
||||
Language Number of occurences\
|
||||
de 623\
|
||||
en 246\
|
||||
es 130
|
||||
\b \
|
||||
|
||||
\b0 \
|
||||
\
|
||||
\
|
||||
}
|
||||