web mining u1 - u4

2013-05-05 10:14:38 +02:00 · 2013-05-05 10:14:38 +02:00 · bc50b297ea
commit bc50b297ea
parent 5f94d45927
15 changed files with 42729 additions and 0 deletions
--- a/Mining/Uebungen/1_Uebung/u1-u4/A
+++ b/Mining/Uebungen/1_Uebung/u1-u4/A
--- a/Mining/Uebungen/1_Uebung/u1-u4/A
+++ b/Mining/Uebungen/1_Uebung/u1-u4/A
--- a/Mining/Uebungen/1_Uebung/u1-u4/Einfuehrung
+++ b/Mining/Uebungen/1_Uebung/u1-u4/Einfuehrung
--- a/Mining/Uebungen/1_Uebung/u1-u4/Expository
+++ b/Mining/Uebungen/1_Uebung/u1-u4/Expository
--- a/Mining/Uebungen/1_Uebung/u1-u4/Solution.doc
+++ b/Mining/Uebungen/1_Uebung/u1-u4/Solution.doc
--- a/Mining/Uebungen/1_Uebung/u1-u4/Solution.docx
+++ b/Mining/Uebungen/1_Uebung/u1-u4/Solution.docx
--- a/Mining/Uebungen/1_Uebung/u1-u4/charfreq_over_rank_english.png
+++ b/Mining/Uebungen/1_Uebung/u1-u4/charfreq_over_rank_english.png
--- a/Mining/Uebungen/1_Uebung/u1-u4/charfreq_over_rank_german.png
+++ b/Mining/Uebungen/1_Uebung/u1-u4/charfreq_over_rank_german.png
--- a/Mining/Uebungen/1_Uebung/u1-u4/english
+++ b/Mining/Uebungen/1_Uebung/u1-u4/english
@ -0,0 +1,128 @@
+i
+me
+my
+myself
+we
+our
+ours
+ourselves
+you
+your
+yours
+yourself
+yourselves
+he
+him
+his
+himself
+she
+her
+hers
+herself
+it
+its
+itself
+they
+them
+their
+theirs
+themselves
+what
+which
+who
+whom
+this
+that
+these
+those
+am
+is
+are
+was
+were
+be
+been
+being
+have
+has
+had
+having
+do
+does
+did
+doing
+a
+an
+the
+and
+but
+if
+or
+because
+as
+until
+while
+of
+at
+by
+for
+with
+about
+against
+between
+into
+through
+during
+before
+after
+above
+below
+to
+from
+up
+down
+in
+out
+on
+off
+over
+under
+again
+further
+then
+once
+here
+there
+when
+where
+why
+how
+all
+any
+both
+each
+few
+more
+most
+other
+some
+such
+no
+nor
+not
+only
+own
+same
+so
+than
+too
+very
+s
+t
+can
+will
+just
+don
+should
+now
+
--- a/Mining/Uebungen/1_Uebung/u1-u4/german
+++ b/Mining/Uebungen/1_Uebung/u1-u4/german
@ -0,0 +1,231 @@
+aber
+alle
+allem
+allen
+aller
+alles
+als
+also
+am
+an
+ander
+andere
+anderem
+anderen
+anderer
+anderes
+anderm
+andern
+anderr
+anders
+auch
+auf
+aus
+bei
+bin
+bis
+bist
+da
+damit
+dann
+der
+den
+des
+dem
+die
+das
+daß
+derselbe
+derselben
+denselben
+desselben
+demselben
+dieselbe
+dieselben
+dasselbe
+dazu
+dein
+deine
+deinem
+deinen
+deiner
+deines
+denn
+derer
+dessen
+dich
+dir
+du
+dies
+diese
+diesem
+diesen
+dieser
+dieses
+doch
+dort
+durch
+ein
+eine
+einem
+einen
+einer
+eines
+einig
+einige
+einigem
+einigen
+einiger
+einiges
+einmal
+er
+ihn
+ihm
+es
+etwas
+euer
+eure
+eurem
+euren
+eurer
+eures
+für
+gegen
+gewesen
+hab
+habe
+haben
+hat
+hatte
+hatten
+hier
+hin
+hinter
+ich
+mich
+mir
+ihr
+ihre
+ihrem
+ihren
+ihrer
+ihres
+euch
+im
+in
+indem
+ins
+ist
+jede
+jedem
+jeden
+jeder
+jedes
+jene
+jenem
+jenen
+jener
+jenes
+jetzt
+kann
+kein
+keine
+keinem
+keinen
+keiner
+keines
+können
+könnte
+machen
+man
+manche
+manchem
+manchen
+mancher
+manches
+mein
+meine
+meinem
+meinen
+meiner
+meines
+mit
+muss
+musste
+nach
+nicht
+nichts
+noch
+nun
+nur
+ob
+oder
+ohne
+sehr
+sein
+seine
+seinem
+seinen
+seiner
+seines
+selbst
+sich
+sie
+ihnen
+sind
+so
+solche
+solchem
+solchen
+solcher
+solches
+soll
+sollte
+sondern
+sonst
+über
+um
+und
+uns
+unse
+unsem
+unsen
+unser
+unses
+unter
+viel
+vom
+von
+vor
+während
+war
+waren
+warst
+was
+weg
+weil
+weiter
+welche
+welchem
+welchen
+welcher
+welches
+wenn
+werde
+werden
+wie
+wieder
+will
+wir
+wird
+wirst
+wo
+wollen
+wollte
+würde
+würden
+zu
+zum
+zur
+zwar
+zwischen
--- a/Mining/Uebungen/1_Uebung/u1-u4/numwordsoffreq_over_wordfreq_linear.png
+++ b/Mining/Uebungen/1_Uebung/u1-u4/numwordsoffreq_over_wordfreq_linear.png
--- a/Mining/Uebungen/1_Uebung/u1-u4/numwordsoffreq_over_wordfreq_log2.png
+++ b/Mining/Uebungen/1_Uebung/u1-u4/numwordsoffreq_over_wordfreq_log2.png
--- a/Mining/Uebungen/1_Uebung/u1-u4/task2.py
+++ b/Mining/Uebungen/1_Uebung/u1-u4/task2.py
@ -0,0 +1,168 @@
+import sys
+import re
+import matplotlib.pyplot as plt
+
+def split(text, regex):
+    ret = re.findall(regex, text)
+    for i in range(len(ret)):
+        ret[i] = ret[i].lower()
+    return ret
+
+def splitAll(files, regex):
+    words = []
+    for filename in files:
+        print "Processing " + filename + "."
+        print "\tSplitting words..."
+        words += split(open(filename).read(), regex)
+    wordcount = len(words)
+    print "Absolute word count: " + str(wordcount)
+    return words
+
+def countWords(words):
+    counts = {}
+    for word in words:
+        if word not in counts:
+            counts[word] = 1
+        else:
+            counts[word] += 1
+    return counts
+
+def reverseDictionary(dictionary):
+    ret = {}
+    i = 0
+    for word in dictionary.keys():
+        if dictionary[word] not in ret:
+            ret[dictionary[word]] = [word]
+        else:
+            ret[dictionary[word]].append(word)
+    return ret
+
+def printWordCounts(countToWords, maxCount = -1):
+    totalcount = 0
+    for count in countToWords:
+        totalcount += count * len(countToWords[count])
+    
+    print "#\t| Abs.\t\t| Rel.\t\t| Words"
+    sortedKeys = sorted(countToWords.iterkeys(), None, None, True)
+    i = 0
+    breakOuter = False
+    for count in sortedKeys:
+        for word in countToWords[count]:
+            fmtStr = str(i+1) + "\t| " + str(count) + "\t\t| %3.2f %% \t| " + word
+            values = (float(count) / float(totalcount) * 100)
+            print fmtStr % values
+            i += 1
+            if i >= maxCount and maxCount > 0:
+                breakOuter = True
+                break
+        if breakOuter:
+            break
+
+basedir = "C:/Users/Victor/Dropbox/Uni/Web Mining/Ex 1/"        
+file0 = "A History of Banks for Savings in Great Britain and Ireland by William Lewins.txt"
+file1 = "Expository Writing by Mervin James Curl.txt"
+file2 = "A Little Girl in Old San Francisco by Amanda Minnie Douglas.txt"
+file3 = "Einfuehrung in die moderne Logik.txt"
+
+words = splitAll([basedir + file0, basedir + file1, basedir + file2], "\w+")
+
+# (2.1)
+print "\n=== 30 most frequent words ===\n"
+countToWords = reverseDictionary(countWords(words))
+
+printWordCounts(countToWords, 30)
+
+# (2.2)
+print "\n=== 30 most frequent words w/o stopwords ===\n"
+countToWordsWithoutStopwords = {}
+stopWords = split(open(basedir + "english").read(), "\w+")
+for count in countToWords.keys():
+    newWords = []
+    for word in countToWords[count]:
+        if word not in stopWords:
+            newWords.append(word)
+
+    if len(newWords):
+        countToWordsWithoutStopwords[count] = newWords
+
+printWordCounts(countToWordsWithoutStopwords, 30)       	
+
+# (3.1)
+xValues = []
+yValues = []
+i = 0
+sortedKeys = sorted(countToWords.iterkeys(), None, None, True)
+for count in sortedKeys:
+    for word in countToWords[count]:
+        xValues.append(i)
+        yValues.append(count)
+        i += 1
+        
+'''plt.plot(xValues, yValues)
+plt.xlabel('word rank')
+plt.ylabel('word frequency')
+plt.xscale('log')
+plt.yscale('log')
+plt.show()'''
+
+# (3.2)
+xValues = []
+yValues = []
+sortedKeys = sorted(countToWords.iterkeys(), None, None, True)
+for count in sortedKeys:
+    xValues.append(count)
+    yValues.append(len(countToWords[count]))
+
+'''plt.plot(xValues, yValues)
+plt.xlabel('word frequency')
+plt.ylabel('number of words with this frequency')
+plt.xscale('log')
+plt.yscale('log')
+plt.show()'''
+
+# (4)
+characters = splitAll([basedir + file0], "\w{1,2}")
+
+print "\n=== characters by frequency (text 1) ===\n"
+countToWords = reverseDictionary(countWords(characters))
+
+printWordCounts(countToWords, 10)
+
+'''xValues = []
+yValues = []
+i = 0
+sortedKeys = sorted(countToWords.iterkeys(), None, None, True)
+for count in sortedKeys:
+    for word in countToWords[count]:
+        xValues.append(i)
+        yValues.append(count)
+        i += 1
+
+plt.plot(xValues, yValues)
+plt.xlabel('character rank')
+plt.ylabel('character frequency')
+plt.yscale('log')
+plt.show()'''
+
+characters = splitAll([basedir + file3], "\w{1,2}")
+
+print "\n=== characters by frequency (text 2) ===\n"
+countToWords = reverseDictionary(countWords(characters))
+
+printWordCounts(countToWords, 10)
+
+'''xValues = []
+yValues = []
+i = 0
+sortedKeys = sorted(countToWords.iterkeys(), None, None, True)
+for count in sortedKeys:
+    for word in countToWords[count]:
+        xValues.append(i)
+        yValues.append(count)
+        i += 1
+
+plt.plot(xValues, yValues)
+plt.xlabel('character rank')
+plt.ylabel('character frequency')
+plt.yscale('log')
+plt.show()'''
--- a/Mining/Uebungen/1_Uebung/u1-u4/wordfreq_over_wordrank_linear.png
+++ b/Mining/Uebungen/1_Uebung/u1-u4/wordfreq_over_wordrank_linear.png
--- a/Mining/Uebungen/1_Uebung/u1-u4/wordfreq_over_wordrank_log2.png
+++ b/Mining/Uebungen/1_Uebung/u1-u4/wordfreq_over_wordrank_log2.png