2013-05-19 22:19:45 +02:00

510 lines
15 KiB
Python

# -*- coding: utf-8 -*-
import urllib2
import sys
import random
import robotparser
import re
import math
from sgmllib import SGMLParser
import sgmllib
from urlparse import urlparse
from urlparse import urljoin
import matplotlib.pyplot as plt
import time
from termcolor import colored
from collections import Counter
'''
Notes (VN):
- plagiarism checker done
- language detector done
TODO:
- DONE canonize urls -> canonize? slides? -> remember last host -> no magic here -> even using ugly global
- DONE with getNextUrlToVisit():
  server timeout -> save crawled host, set timeout for crawled host
- statistics -> http://www.ke.tu-darmstadt.de/lehre/ss13/web-mining/uebung2.html
'''
# crawler configuration
entrypoint = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
#entrypoint = "http://www.spiegel.de" # german website
#entrypoint = "http://www.cnn.com" # english website
#entrypoint = "http://www.red2000.com/spain/1index.html" # spanish website
#entrypoint = "https://code.google.com/p/feedparser/issues/attachmentText?id=226&aid=-1296926914212963541&name=demonstrate_issue_226.xml&token=CHtgpTsdPmWnNsvScD0yfMuBriU%3A1368702558154"
numberOfPagesToCrawl = 1000
timeBetweenSameHost = 0 # politeness delay in seconds between two hits on the same host (e.g. 5)
visitOnlyTuSites = True;
# crawler state (shared module-level globals)
prohibitedSites = 0 # number of URLs rejected by robots.txt
visitedUrls = [] # save already visited urls, so no url will be visited more than once
pages = {} # downloaded page contents, keyed by url
numberHyperlink = {} # how often each url was drawn from the frontier in getNextUrlToVisit()
numberHyperlinksPerPage = {} # occurrence count per raw href target seen while parsing
visitedHostsWithTimestamp = {} # last-visit unix timestamp per host
robotsTxtResults = {} # cached robots.txt parsers, keyed by robots.txt url (None = no robots.txt)
lasthost = '' # host of the last absolute url seen (used to resolve relative links)
def normalizeMap(m):
    """Normalize the dict's values in place so they sum to 1.0.

    Bugfix: an empty (or all-zero) map previously raised
    ZeroDivisionError; it is now left unchanged.
    """
    s = sum(m.values())
    if s == 0: # nothing to normalize
        return
    for k in m:
        m[k] = float(m[k]) / float(s)
def subtractDicts(dict1, dict2):
    """Return the per-key difference of two count dicts.

    Keys present in both dicts map to max(0, dict1[k] - dict2[k]);
    keys present in only one dict keep that dict's (int-converted) count.
    """
    result = dict()
    for key in set(dict1) | set(dict2):
        if key in dict1 and key in dict2:
            result[key] = max(0, int(dict1[key]) - int(dict2[key]))
        elif key in dict1:
            result[key] = int(dict1[key])
        else:
            result[key] = int(dict2[key])
    return result
def countWords(words):
    """Return a dict mapping each word to its number of occurrences.

    Uses collections.Counter (already imported by this module) instead
    of the hand-rolled counting loop; returns a plain dict as before.
    """
    return dict(Counter(words))
def blockedByRobotsTxt(url):
o = urlparse(url)
robotsUrl = o.scheme+"://"+o.netloc+"/robots.txt"
if url in robotsTxtResults:
rp = robotsTxtResults[robotsUrl]
else:
rp = robotparser.RobotFileParser()
rp.set_url(robotsUrl)
try:
rp.read()
robotsTxtResults[robotsUrl] = rp
except:
robotsTxtResults[robotsUrl] = None # robots.txt doesn't exist
if robotsTxtResults[robotsUrl] == None:
return False # return false if robots.txt doesn't exist
else:
if rp.can_fetch("*", url):
return False
else:
print colored("-> not allowed to visit :( "+url, "red")
global prohibitedSites
prohibitedSites += 1
return True
def canonicalUrl(url):
    """Normalize a href into a list with one crawlable absolute URL,
    or an empty list if the link should not be followed.

    Side effect: remembers the scheme+host of the last absolute URL seen
    (global lasthost) so that later relative links can be resolved.
    """
    global lasthost
    cleaned = url.lower().replace(" ", "")
    parts = urlparse(cleaned)
    if parts.netloc == '':
        # no host: a scheme-less link is relative -- resolve it against
        # the most recently seen host; anything else is dropped
        if parts.scheme == '':
            return [urljoin(lasthost, parts.path)]
        return []
    lasthost = parts.scheme + '://' + parts.netloc
    # NOTE(review): "pdf" is matched as a substring anywhere in the path
    # and ".." anywhere in the URL -- presumably meant to skip PDF files
    # and parent-relative paths; confirm the substring match is intended.
    if parts.scheme == 'http' and not "pdf" in parts.path and not ".." in parts.geturl():
        if ".html" in parts.path:
            return [cleaned]
        if "." not in parts.path:
            return [cleaned]
    return []
def getNextUrlToVisit():
    # Draw a random URL from the frontier (extractor.urls) and return it
    # once it passes all filters; rejected URLs are removed from the
    # frontier and the function recurses to try another candidate.
    # NOTE(review): recursion depth grows with the number of rejected
    # candidates, and random.choice raises IndexError if the frontier
    # ever runs empty -- confirm this cannot happen in practice.
    url = random.choice(extractor.urls)
    if visitOnlyTuSites:
        # restrict the crawl to hosts containing 'tu-darmstadt'
        if 'tu-darmstadt' not in urlparse(url).netloc:
            extractor.urls.remove(url)
            return getNextUrlToVisit()
    # count how often this URL was drawn from the frontier
    if url in numberHyperlink:
        numberHyperlink[url] += 1
    else:
        numberHyperlink[url] = 1
    host = urlparse(url).netloc
    ## check if url is blocked by robots.txt or was already visited ##
    if blockedByRobotsTxt(url) or url in visitedUrls:
        extractor.urls.remove(url)
        return getNextUrlToVisit()
    ## check if host got a timeout
    if host in visitedHostsWithTimestamp:
        timestamp = visitedHostsWithTimestamp[host]
        secondsSinceLastVisit = int(time.time()) - timestamp
        if secondsSinceLastVisit >= timeBetweenSameHost:
            # politeness delay satisfied: accept the URL and mark it visited
            visitedHostsWithTimestamp[host] = int(time.time())
            visitedUrls.append(url)
            extractor.urls.remove(url)
            return url
        else:
            # host hit too recently: leave the URL in the frontier and retry
            secondsToWait = timeBetweenSameHost - secondsSinceLastVisit
            print colored(" -> give host ("+host+") a break (wait at least: "+str(secondsToWait)+" seconds)", "magenta")
            return getNextUrlToVisit()
    else:
        # first visit to this host: accept the URL and mark it visited
        visitedHostsWithTimestamp[host] = int(time.time())
        visitedUrls.append(url)
        extractor.urls.remove(url)
        return url
class URLLister(SGMLParser):
    """SGML parser that collects canonicalized hrefs from <a> tags in self.urls."""

    def resetParser(self):
        # recover from an SGMLParseError without clearing the collected URLs
        SGMLParser.reset(self)

    def reset(self):
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        hrefs = [value for name, value in attrs if name == 'href']
        if hrefs:
            target = hrefs[0]
            self.urls.extend(canonicalUrl(target))
            # count occurrences of each raw href target
            # NOTE(review): despite the global's name, this counts how often
            # each link target occurs overall, not links per page
            numberHyperlinksPerPage[target] = numberHyperlinksPerPage.get(target, 0) + 1
if __name__ == "__main__":
page = urllib2.urlopen(entrypoint, timeout = 5)
print "currently visited url: "+entrypoint
extractor = URLLister()
extractor.feed(page.read())
page.close()
i = 1
while(i <= numberOfPagesToCrawl):
url = getNextUrlToVisit()
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") currently visiting url: "+url, "blue")
try:
page = urllib2.urlopen(url, timeout = 6)
pageContent = page.read()
pageContent = pageContent.replace('<![CDATA[', '&lt;![CDATA[') ## bugfix for SGMLParser
page.close()
extractor.feed(pageContent)
pages[url] = pageContent
i += 1
# exception handling
except urllib2.HTTPError, err:
if err.code == 404:
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: not found: "+url, "red")
pass
if err.code == 400:
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: bad request: "+url, "red")
pass
if err.code == 403:
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: forbidden: "+url, "red")
pass
except urllib2.URLError:
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: urllib2.URLError: "+url, "red")
pass
except sgmllib.SGMLParseError:
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: sgmllib.SGMLParseError: "+url, "red")
extractor.resetParser()
pass
except:
print "Unexpected error:", sys.exc_info()[0]
pass
extractor.close()
print "\n \n ==== robots.txt ===="
print "prohibit by robots.txt: "+str(prohibitedSites)
## print table number hyperlinks per website ##
print "\n \n ==== numberHyperlink ===="
print "#Hyperlinks \t Website"
linkCount1 = {}
for u in numberHyperlink.values():
if u not in linkCount1:
linkCount1[u] = 1
else:
linkCount1[u] += 1
xValues1 = []
yValues1 = []
for u in linkCount1:
xValues1.append(u)
yValues1.append(linkCount1[u])
plt.plot(xValues1, yValues1)
plt.xlabel('Haeufigkeiten des Auftretens')
plt.ylabel('Anzahl der URLs')
plt.show()
## print table number hyperlinks to page ##
print "\n \n ==== Anzahl URLs pro Seite ===="
print "#Anzahl URLs pro Seite"
linkCount2 = {}
for u in numberHyperlinksPerPage.values():
if u not in linkCount2:
linkCount2[u] = 1
else:
linkCount2[u] += 1
xValues2 = []
yValues2 = []
for u in linkCount2:
xValues2.append(u)
yValues2.append(linkCount2[u])
'''plt.plot(xValues2, yValues2)
plt.xlabel('Anzahl der Hyperlinks pro Seite')
plt.ylabel('Anzahl der URLs')
#plt.xscale('log')
#plt.yscale('log')
plt.show()'''
print "\n \n ==== url queue ===="
for u in extractor.urls:
pass
#print u
threshold = 0.9 # how much similar must 2 urls be to be logged
#print "\n \n ==== copied content probability (>= " + str(threshold*100) + " %) ===="
#print "URL1 \t URL2 \t Similarity in %"
# wordcounts per page
wordCountsByPage = {}
charsByPage = {}
## count words in all pages ##
for url in pages:
tmp = re.sub("[\n\r]", "", pages[url]) # remove all scripts
tmp = re.sub("<\s*script.*?>.+?<\s*\/script.*?>", "", tmp) # remove all scripts
tmp = re.sub("<\s*style.*?>.+?<\s*\/style.*?>", "", tmp) # remove all styles
tmp = re.sub("&.+?;", "", tmp) # remove all html entities
tmp = re.sub("<.+?>", "", tmp) # remove all html tags
tmp = re.sub("\d", "", tmp) # remove all numbers
words = re.findall("(\w+)", tmp) # split words
words = [x.lower() for x in words] # all words to lower case
words = [s for s in words if len(s) > 4 and len(s) <= 10]
wordCountsByPage[url] = countWords(words)
chars = re.findall("[A-za-z]", tmp); # find all characters
chars = [x.lower() for x in chars] # all characters to lower case
charsByPage[url] = chars
## calculate wordcount deltas and print double-content sites ##
wordCountDeltas = {}
for url1 in wordCountsByPage:
for url2 in wordCountsByPage:
if url1 == url2:
continue
if url1 not in wordCountDeltas:
wordCountDeltas[url1] = {}
if url2 in wordCountDeltas[url1]: # do it once only
continue
wordCounts1 = wordCountsByPage[url1]
wordCounts2 = wordCountsByPage[url2]
sum1 = sum(wordCounts1.values())
if sum1 == 0:
continue
#print "calculating deltas of url1: " + url1 + " -- url2: " + url2
deltaWordCounts = subtractDicts(wordCounts1, wordCounts2)
wordCountDeltas[url1][url2] = math.fabs(float(sum(deltaWordCounts.values())) / float(sum1))
if 1 - wordCountDeltas[url1][url2] > threshold:
#print url1 + " \t " + url2 + " \t " + str((1 - wordCountDeltas[url1][url2]) * 100)
pass
## determine the sites' languages ##
spanish = 'es'
english = 'en'
german = 'de'
pageLanguages = {}
lettersByLanguage = {}
lettersByLanguage[spanish] = {
'e' : 13.68,
'a' : 12.53,
'o' : 8.68,
's' : 7.98,
'r' : 6.87,
'n' : 6.71,
'i' : 6.25,
'd' : 5.86,
'l' : 4.97,
'c' : 4.68,
't' : 4.63,
'u' : 3.93,
'm' : 3.15,
'p' : 2.51,
'b' : 1.42,
'g' : 1.01,
'v' : 0.90,
'y' : 0.90,
'q' : 0.88,
'h' : 0.70,
'f' : 0.69,
'z' : 0.52,
'j' : 0.44,
'x' : 0.21,
'w' : 0.02,
'k' : 0.01
}
lettersByLanguage[english] = {
'e' : 12.70,
't' : 9.06,
'a' : 8.16,
'o' : 7.50,
'i' : 6.96,
'n' : 6.74,
's' : 6.32,
'h' : 6.09,
'r' : 5.99,
'd' : 4.25,
'l' : 4.03,
'c' : 2.78,
'u' : 2.76,
'm' : 2.41,
'w' : 2.36,
'f' : 2.23,
'g' : 2.02,
'y' : 1.97,
'p' : 1.93,
'b' : 1.49,
'v' : 0.98,
'k' : 0.77,
'j' : 0.15,
'x' : 0.15,
'q' : 0.10,
'z' : 0.07
}
lettersByLanguage[german] = {
'e' : 17.4,
'n' : 9.78,
'i' : 7.55,
's' : 7.27,
'r' : 7.00,
'a' : 6.51,
't' : 6.15,
'd' : 5.08,
'h' : 4.76,
'u' : 4.35,
'l' : 3.44,
'c' : 3.06,
'g' : 3.01,
'o' : 2.59,
'm' : 2.53,
'b' : 1.89,
'w' : 1.89,
'f' : 1.66,
'k' : 1.21,
'z' : 1.13,
'v' : 0.85,
'p' : 0.67,
'j' : 0.27,
'y' : 0.04,
'x' : 0.03,
'q' : 0.02
}
# normalize maps
normalizeMap(lettersByLanguage[spanish])
normalizeMap(lettersByLanguage[english])
normalizeMap(lettersByLanguage[german])
languageCounts = {}
for url in charsByPage:
tokens = charsByPage[url]
tokenCounts = dict(Counter(tokens))
tokenSum = sum(tokenCounts.values())
# Calculating the squared error
rankings = {}
matches = {}
for token in tokenCounts:
for key2 in lettersByLanguage:
if token not in lettersByLanguage[key2]:
continue
p = float(lettersByLanguage[key2][token]) * 100
if p >= 0:
if key2 not in rankings:
rankings[key2] = 0
matches[key2] = 0
# calculate the squared error from observed and reference frequencies
rankings[key2] += math.pow(math.fabs(tokenCounts[token] * 100 / tokenSum - p), 2)
matches[key2] += 1
# Resulting language has the minimal mean squared error
minRanking = -1
language = None
for key in rankings:
rankings[key] /= matches[key]
if minRanking == -1 or rankings[key] < minRanking:
minRanking = rankings[key]
language = key
if language != None:
pageLanguages[url] = language
if language not in languageCounts:
languageCounts[language] = 1
else:
languageCounts[language] += 1
print "\n \n ==== language distribution ===="
print "Language \t Number of occurences"
for lang in languageCounts:
print lang + " \t " + str(languageCounts[lang])