From 17dfb87a67f65c3aec6f238dd261c5c49da71bbe Mon Sep 17 00:00:00 2001 From: rylon Date: Sun, 16 Jun 2013 18:54:39 +0200 Subject: [PATCH] stopword filter -> der bayes klasifiziert alles als learned -> warum? --- .../Uebungen/4_Uebung/G22_predictions.txt | 56 ++-- .../4_Uebung/code/confusion_matrix.py | 24 +- .../Uebungen/4_Uebung/code/naive_bayes.py | 54 ++- .../Uebungen/4_Uebung/data/stopwords/README | 9 + .../Uebungen/4_Uebung/data/stopwords/danish | 94 ++++++ .../Uebungen/4_Uebung/data/stopwords/dutch | 101 ++++++ .../Uebungen/4_Uebung/data/stopwords/english | 128 +++++++ .../Uebungen/4_Uebung/data/stopwords/finnish | 235 +++++++++++++ .../Uebungen/4_Uebung/data/stopwords/french | 155 +++++++++ .../Uebungen/4_Uebung/data/stopwords/german | 231 +++++++++++++ .../4_Uebung/data/stopwords/hungarian | 199 +++++++++++ .../Uebungen/4_Uebung/data/stopwords/italian | 279 ++++++++++++++++ .../4_Uebung/data/stopwords/norwegian | 176 ++++++++++ .../4_Uebung/data/stopwords/portuguese | 203 ++++++++++++ .../Uebungen/4_Uebung/data/stopwords/russian | 151 +++++++++ .../Uebungen/4_Uebung/data/stopwords/spanish | 313 ++++++++++++++++++ .../Uebungen/4_Uebung/data/stopwords/swedish | 114 +++++++ .../Uebungen/4_Uebung/data/stopwords/turkish | 53 +++ 18 files changed, 2525 insertions(+), 50 deletions(-) create mode 100644 ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/README create mode 100644 ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/danish create mode 100644 ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/dutch create mode 100644 ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/english create mode 100644 ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/finnish create mode 100644 ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/french create mode 100644 ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/german create mode 100644 ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/hungarian create mode 100644 ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/italian create mode 100644 ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/norwegian create mode 100644 ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/portuguese create mode 100644 ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/russian create mode 100644 ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/spanish create mode 100644 ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/swedish create mode 100644 ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/turkish diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/G22_predictions.txt b/ss2013/1_Web Mining/Uebungen/4_Uebung/G22_predictions.txt index 2a15528f..e761e1ac 100644 --- a/ss2013/1_Web Mining/Uebungen/4_Uebung/G22_predictions.txt +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/G22_predictions.txt @@ -1,28 +1,28 @@ 00f0316054ddf9504f87ea28e73683b6.txt learned -025338a8f0b21608e843df13d54c8c70.txt belles_lettres +025338a8f0b21608e843df13d54c8c70.txt learned 05aac70e552dd51430af3c194ad0fdec.txt learned 083ae11a870e96d1f5c9835eaf48118c.txt learned 08932387850eea34daae545225dcf8a2.txt learned -08bfc610c065764b9dfec4eed039fa69.txt government +08bfc610c065764b9dfec4eed039fa69.txt learned 096fddf36cb7d1de3f236d85bdefb938.txt learned 0bd46492f5ad09df3b80cb62a335e689.txt learned 0c16f57f865c4e0e072546097b1d4adb.txt learned 0c267d9b541a7d3e07eb0e841609b307.txt learned 0f0c1baf6f76f3e16cac7855405029a6.txt learned 100ec74fe0d1dd74956246df46d9b845.txt learned -10609aabf1b727d9728d96ffe9064f11.txt government -1361da4407b0db04d4fd9b0dc51331b8.txt belles_lettres +10609aabf1b727d9728d96ffe9064f11.txt learned +1361da4407b0db04d4fd9b0dc51331b8.txt learned 13a8cf112cb61bb237b91bffac75f506.txt learned 14c10df92c36df39ec7d541654884a6f.txt learned 15d5e6a3a826d0471c4dec0215169c94.txt learned -17543675c21f3d1961df70e4bc05b677.txt government +17543675c21f3d1961df70e4bc05b677.txt learned 1af257d94c2c7c31650edd49fe5c3888.txt learned 1da789efbc92bb26ab551c749a498714.txt learned 21ee90d49d229072cdfe3373f386fbc6.txt learned 2269c21867d5c492b2e223bc5589897e.txt learned 233809d44fe0b4625aba2b21e2a090c3.txt learned -2448920f7507a90ae5de9895518a4256.txt belles_lettres -25544eed4ec559512c188e8d73c61576.txt government +2448920f7507a90ae5de9895518a4256.txt learned +25544eed4ec559512c188e8d73c61576.txt learned 280024d9ca375fe894e0c3852153d91c.txt learned 2ae543a13eb502dfb34efcd691af4c19.txt learned 2bfe3d3546118761639b703dc042174b.txt learned @@ -33,7 +33,7 @@ 3455cd8f4a88bbe179c3d16ff2d08aed.txt learned 3463870779e91a0b3ef42dcb5614c417.txt learned 347c4ff3005261de62a1350cf3552db1.txt learned -371e9a205208a5fe2d058b1373246b06.txt romance +371e9a205208a5fe2d058b1373246b06.txt learned 3a0e840d849fb693fb0350a9bca049a7.txt learned 3cfe918d71f0216d698a656bb261754a.txt learned 42f560bacaae1ac7960efdafc40c9957.txt learned @@ -46,8 +46,8 @@ 4ad03bf39d4b20405d92877d8a2d620c.txt learned 4b1042b36a2e8d19883107213a55d4fd.txt learned 4cd9f5cf912b67d8d541cf805e35ec9d.txt learned -4d1f93581f8df325a0a8fd9df3a60f49.txt government -4e7a71284825f9b8302c914b3bf65c41.txt government +4d1f93581f8df325a0a8fd9df3a60f49.txt learned +4e7a71284825f9b8302c914b3bf65c41.txt learned 4f8b6422ab5ad965d2925bb93f1a5ad1.txt learned 4feeaa056745eaa93855a6d05cc21d20.txt learned 5122f89d4fff6ec6e26062ded7c5387e.txt learned @@ -59,7 +59,7 @@ 5a3733909b787420f2ae4a84095d90b6.txt learned 5a6fe4735711b757130334f30a5c0d8e.txt learned 5cfb1bed9bb97b6a0aabd93ea65d677b.txt learned -5db5250d2936c795389841699a64b1dc.txt belles_lettres +5db5250d2936c795389841699a64b1dc.txt learned 5e9a239de5aeb08b0713d0245fc914c7.txt learned 5f606972d66ed49044f3eadaf4eb2a54.txt learned 60e338de63774c5ef4e7beba18bc6577.txt learned @@ -68,30 +68,30 @@ 63167efcd7a7bdbd4b742f6e482312f4.txt learned 635e2c48ef4a37462fd8a4cd17375c5c.txt learned 64812690c6155fba3f1aba0514496dd9.txt learned -65f1d037cb5f92da6605cea6d0d703d0.txt belles_lettres -66abce82b770b4368691f2926f87089e.txt belles_lettres +65f1d037cb5f92da6605cea6d0d703d0.txt learned +66abce82b770b4368691f2926f87089e.txt learned 691c1e5e341a19e59b27dfb4f71fc0e0.txt learned 712b9c9622c73dbb0e6dc5ba2c231cf0.txt learned 71aa03bfef20157578b6b613174d3fe6.txt learned 71f153ecdef94026a97b635a40b375c8.txt learned 7341b4fda4d972adfbf854a0d6be3400.txt learned 74486d71097c34544195b52bdd844839.txt learned -745df40e8d2ba4bf6abfcb197c65359c.txt government -74a1421e246c3ffc08398609f75e292c.txt belles_lettres +745df40e8d2ba4bf6abfcb197c65359c.txt learned +74a1421e246c3ffc08398609f75e292c.txt learned 784346fad149c3736d309036e925526c.txt learned 787d5f0883aa5fa768a624c226fc7294.txt learned 791f3304bbd155e0211904d1d002b081.txt learned -7a297cedd35c3ffb12ab6011d34f1244.txt belles_lettres +7a297cedd35c3ffb12ab6011d34f1244.txt learned 7c809ae6732c39ea9a020a307ff35b3a.txt learned 7f8b847188c77b75a2b00e906e0ae693.txt learned 805ea08c406a72dbff755a3627aeb677.txt learned 8459fa5551ec11ae82c5fc404f2b3988.txt learned 853f9d4b400a22d2abbf0f2e17d6ae33.txt learned -864ff44244fb6229ba79ce3df93df701.txt belles_lettres +864ff44244fb6229ba79ce3df93df701.txt learned 8758b603d3ce23de68cbd13665a128d4.txt learned 87d7774f30d9221f856bab02a3f5ffc4.txt learned 8b2d2ff3e27f2d56f5c51f85c2754cf9.txt learned -8babd57d7cbd695d8c04d698626593e8.txt belles_lettres +8babd57d7cbd695d8c04d698626593e8.txt learned 8ce16ec688419c614801d5c29cec6153.txt learned 8d2066cd72a448eb69348dbb68f754d8.txt learned 8fb3df3b7d96dc4383c84447a4fdd1a3.txt learned @@ -99,13 +99,13 @@ 93c4b35148e7dcb767ea607fe7edf2c3.txt learned 990e5a79b032e5cb9ab3e56cab71a6ef.txt learned 9b9ed2005178bb6098ae874260128fc6.txt learned -9c97ea8f2d4dea9c31ebe73765f2396b.txt belles_lettres +9c97ea8f2d4dea9c31ebe73765f2396b.txt learned 9f08d188f8174081f5b02a7f07668846.txt learned 9f9b19682a8401fd40bce446f33d508b.txt learned -9fe0cd0d62c294ed1bc7b29e7e65c18a.txt government +9fe0cd0d62c294ed1bc7b29e7e65c18a.txt learned 9febf62c0e6509f3e1ad065a5a6aef8d.txt learned a03db0b1e3bb05fc0f961d2a655e8dad.txt learned -a716803991f9713e7986d252e26e7382.txt belles_lettres +a716803991f9713e7986d252e26e7382.txt learned a98e64947521853ff24f52e12b77c789.txt learned aa5156a64316e6836b14c61879d80712.txt learned ac848bdeda712352e09e5fa392be4574.txt learned @@ -119,10 +119,10 @@ b3346fa7bed6f5b9ad06bc831c59ad6c.txt learned b3681b289f0dd87a5c1f9573cd825866.txt learned b4d65c8e57797e496834f5f6d9d3e49e.txt learned b65707c01e68cc6d4d59e18d9f98f423.txt learned -b8a039ba1694ce7ce87737ce5c7480d8.txt government -b998ac20277e09a1c3fecbdfb028b33a.txt government -ba6843edc446617d1e6e5ec53246d849.txt belles_lettres -bb6d375a8b847c7c10f9bdbf7324eb03.txt belles_lettres +b8a039ba1694ce7ce87737ce5c7480d8.txt learned +b998ac20277e09a1c3fecbdfb028b33a.txt learned +ba6843edc446617d1e6e5ec53246d849.txt learned +bb6d375a8b847c7c10f9bdbf7324eb03.txt learned bbbda4cef7aeb20352c9f1d9b453a9e5.txt learned be6f1bd428b9933bedbc6bd401868415.txt learned bf8ce15b10cb746bb1181645a42012db.txt learned @@ -133,8 +133,8 @@ c39fda6fbf81d87bb6508b1bbe7faf93.txt learned c5a19f446f960c849d67b25238a08397.txt learned c65f6ecdb1ba01da0e6525dd525621e1.txt learned c942ba590a82fd0827b79e3d6bfb25d3.txt learned -c9497d141930518b8005ba352b4d1637.txt government -cb24d378b3966cf4f3f663f8b13430f2.txt belles_lettres +c9497d141930518b8005ba352b4d1637.txt learned +cb24d378b3966cf4f3f663f8b13430f2.txt learned ce39b27592fc593d0ee117651b072cc1.txt learned ceacd82d3757974d93538f67b74bc25e.txt learned cfdd298764ed82fa2304e427dcb53db9.txt learned @@ -147,7 +147,7 @@ dc713f9e699e9e610b458b5c991ce514.txt learned dc89c7bfd3f0eefd385f0a81c1a59981.txt learned dc9a7b20833ff389ae573597095f253d.txt learned dcacb995ec95ede56ba389128922603c.txt learned -dd1a33aada4ffb0564f709c10b95cedc.txt government +dd1a33aada4ffb0564f709c10b95cedc.txt learned e058a15d26f17f7193a032eed51bbbfc.txt learned e2daacfa9c33ea659beaa1a7763bfe57.txt learned e43c7ff67adf6fdd0710c0ec91776481.txt learned diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/code/confusion_matrix.py b/ss2013/1_Web Mining/Uebungen/4_Uebung/code/confusion_matrix.py index 1213b4d3..a5f85355 100644 --- a/ss2013/1_Web Mining/Uebungen/4_Uebung/code/confusion_matrix.py +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/code/confusion_matrix.py @@ -61,8 +61,11 @@ def accuracy(): i += 1 #print ok_recognized - #print document_count - accuracy = float(ok_recognized) / float(wrong_recognized) + #print document_count + if wrong_recognized + ok_recognized <> 0: + accuracy = float(ok_recognized) / float(ok_recognized+wrong_recognized) + else: + accuracy = 0 def prec(): #per class -> positive matches / alle matches auf class @@ -78,7 +81,7 @@ def prec(): else: not_ok_values += c j += 1 - if not_ok_values +ok_values > 0: + if not_ok_values + ok_values > 0: precision[i] = float(ok_values) / float(ok_values + not_ok_values) else: precision[i] = 0 #division by zero @@ -137,7 +140,10 @@ def recall(): else: not_okvalues += conf[i] j += 1 - recalls[i] = float(ok_values) / float(not_okvalues) + if not_okvalues + ok_values <> 0: + recalls[i] = float(ok_values) / float(ok_values+not_okvalues) + #else: + # recalls[i] = 0 i += 1 def pp_recall(): @@ -160,7 +166,10 @@ def prec_micro(): i += 1 global precision_micro - precision_micro = float(result[0]) / float(result[0]+result[1]) + if result[0]+result[1] <> 0: + precision_micro = float(result[0]) / float(result[0]+result[1]) + else: + precision_micro = 0 def conf_micro_class(class_): i = 0 @@ -200,7 +209,10 @@ def recall_micro(): i += 1 global recall_micro - recall_micro = float(result[0]) / float(result[0]+result[2]) + if result[0]+result[2] <> 0: + recall_micro = float(result[0]) / float(result[0]+result[2]) + else: + recall_micro = 0 def pp_microrecall(): print "Recall Microavg: "+str(round(recall_micro*100,4))+"%" diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py b/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py index 35b972a3..e598eb71 100644 --- a/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py @@ -5,7 +5,7 @@ import shutil import random import sys import math - +import re @@ -14,6 +14,7 @@ actualDir = os.path.dirname(os.path.realpath(__file__)) dataDir = os.path.join(actualDir, '../data') trainDir = os.path.join(dataDir, 'u4_train') testDir = os.path.join(dataDir, 'u4_test') +stopwords = os.path.join(dataDir, 'stopwords/english') @@ -116,9 +117,7 @@ class multiclassClassifier: for line in lines: thisline = line.split(" "); for word in thisline: - word = word.lower() - word = word.replace(".", " ") - word = word.replace(",", " ") + word = self.clean_word(word) if dictonary.has_key(word): dictonary[str(word)] += 1 else: @@ -128,21 +127,41 @@ class multiclassClassifier: def bayes(self, text, termfrequenciesOfClasses, termCount, percentage): + result = 1.0 for line in text: thisline = line.split(" "); for word in thisline: - word = word.lower() - word = word.replace(',', ' ') - word = word.replace(".", " ") - result = 1.0 - for word in text: - if termfrequenciesOfClasses.has_key(str(word)): - result += math.log((termfrequenciesOfClasses[word]+1.)/(termCount+1)) - else: - result += math.log(1./(termCount+1)) - result += math.log(percentage) + word = self.clean_word(word) + if word <> "": + #result = 1.0 + for word in text: + if termfrequenciesOfClasses.has_key(str(word)): + result += math.log((termfrequenciesOfClasses[word]+1.)/(termCount+1)) + else: + result += math.log(1./(termCount+1)) + result += math.log(percentage) return result - + + def clean_word(self, word): + #print word + word = word.lower() #lowercase + #return word + word = "".join(re.findall("[a-z]+", word)) #only characters + #return word + if len(word) <= 4: #only words longer 4 + return "" + #return word + if self.isStopWord(word): #stopwordfilter + return "" + #print word + return word + + def isStopWord(self,word): + for line in open(stopwords,'r').readlines(): + if line.strip() == word: + return True + return False + @@ -173,6 +192,7 @@ if __name__ == '__main__': mc.percentage[classes] = mc.countClasses[classes]/sumOfClasses + #class_matches = [] listing = os.listdir(trainDir) for classes in listing: # classes path = trainDir+'/'+classes+'/testdata' @@ -183,8 +203,10 @@ if __name__ == '__main__': maxRes = sys.maxint * -1 # check all possible classes for cl in mc.percentage.keys(): - f = open(currentPath, 'r') + f = open(currentPath, 'r') temp = mc.bayes(f.readlines(), mc.termfrequenciesOfClasses[cl], sumOfClasses, mc.percentage[cl]) + #class_matches.append([infile,cl,temp]) + #print class_matches if (temp >= maxRes): maxRes = temp mc.filesToPrediction[infile] = cl diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/README b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/README new file mode 100644 index 00000000..fb651801 --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/README @@ -0,0 +1,9 @@ +Stopwords Corpus + +This corpus contains lists of stop words for several languages. These +are high-frequency grammatical words which are usually ignored in text +retrieval applications. + +They were obtained from: +http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/ + diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/danish b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/danish new file mode 100644 index 00000000..d3edc675 --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/danish @@ -0,0 +1,94 @@ +og +i +jeg +det +at +en +den +til +er +som +på +de +med +han +af +for +ikke +der +var +mig +sig +men +et +har +om +vi +min +havde +ham +hun +nu +over +da +fra +du +ud +sin +dem +os +op +man +hans +hvor +eller +hvad +skal +selv +her +alle +vil +blev +kunne +ind +når +være +dog +noget +ville +jo +deres +efter +ned +skulle +denne +end +dette +mit +også +under +have +dig +anden +hende +mine +alt +meget +sit +sine +vor +mod +disse +hvis +din +nogle +hos +blive +mange +ad +bliver +hendes +været +thi +jer +sådan diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/dutch b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/dutch new file mode 100644 index 00000000..cafa0324 --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/dutch @@ -0,0 +1,101 @@ +de +en +van +ik +te +dat +die +in +een +hij +het +niet +zijn +is +was +op +aan +met +als +voor +had +er +maar +om +hem +dan +zou +of +wat +mijn +men +dit +zo +door +over +ze +zich +bij +ook +tot +je +mij +uit +der +daar +haar +naar +heb +hoe +heeft +hebben +deze +u +want +nog +zal +me +zij +nu +ge +geen +omdat +iets +worden +toch +al +waren +veel +meer +doen +toen +moet +ben +zonder +kan +hun +dus +alles +onder +ja +eens +hier +wie +werd +altijd +doch +wordt +wezen +kunnen +ons +zelf +tegen +na +reeds +wil +kon +niets +uw +iemand +geweest +andere diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/english b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/english new file mode 100644 index 00000000..a9130116 --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/english @@ -0,0 +1,128 @@ +i +me +my +myself +we +our +ours +ourselves +you +your +yours +yourself +yourselves +he +him +his +himself +she +her +hers +herself +it +its +itself +they +them +their +theirs +themselves +what +which +who +whom +this +that +these +those +am +is +are +was +were +be +been +being +have +has +had +having +do +does +did +doing +a +an +the +and +but +if +or +because +as +until +while +of +at +by +for +with +about +against +between +into +through +during +before +after +above +below +to +from +up +down +in +out +on +off +over +under +again +further +then +once +here +there +when +where +why +how +all +any +both +each +few +more +most +other +some +such +no +nor +not +only +own +same +so +than +too +very +s +t +can +will +just +don +should +now + diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/finnish b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/finnish new file mode 100644 index 00000000..47ee200f --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/finnish @@ -0,0 +1,235 @@ +olla +olen +olet +on +olemme +olette +ovat +ole +oli +olisi +olisit +olisin +olisimme +olisitte +olisivat +olit +olin +olimme +olitte +olivat +ollut +olleet +en +et +ei +emme +ette +eivät +minä +minun +minut +minua +minussa +minusta +minuun +minulla +minulta +minulle +sinä +sinun +sinut +sinua +sinussa +sinusta +sinuun +sinulla +sinulta +sinulle +hän +hänen +hänet +häntä +hänessä +hänestä +häneen +hänellä +häneltä +hänelle +me +meidän +meidät +meitä +meissä +meistä +meihin +meillä +meiltä +meille +te +teidän +teidät +teitä +teissä +teistä +teihin +teillä +teiltä +teille +he +heidän +heidät +heitä +heissä +heistä +heihin +heillä +heiltä +heille +tämä +tämän +tätä +tässä +tästä +tähän +tallä +tältä +tälle +tänä +täksi +tuo +tuon +tuotä +tuossa +tuosta +tuohon +tuolla +tuolta +tuolle +tuona +tuoksi +se +sen +sitä +siinä +siitä +siihen +sillä +siltä +sille +sinä +siksi +nämä +näiden +näitä +näissä +näistä +näihin +näillä +näiltä +näille +näinä +näiksi +nuo +noiden +noita +noissa +noista +noihin +noilla +noilta +noille +noina +noiksi +ne +niiden +niitä +niissä +niistä +niihin +niillä +niiltä +niille +niinä +niiksi +kuka +kenen +kenet +ketä +kenessä +kenestä +keneen +kenellä +keneltä +kenelle +kenenä +keneksi +ketkä +keiden +ketkä +keitä +keissä +keistä +keihin +keillä +keiltä +keille +keinä +keiksi +mikä +minkä +minkä +mitä +missä +mistä +mihin +millä +miltä +mille +minä +miksi +mitkä +joka +jonka +jota +jossa +josta +johon +jolla +jolta +jolle +jona +joksi +jotka +joiden +joita +joissa +joista +joihin +joilla +joilta +joille +joina +joiksi +että +ja +jos +koska +kuin +mutta +niin +sekä +sillä +tai +vaan +vai +vaikka +kanssa +mukaan +noin +poikki +yli +kun +niin +nyt +itse diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/french b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/french new file mode 100644 index 00000000..e7cbf4c9 --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/french @@ -0,0 +1,155 @@ +au +aux +avec +ce +ces +dans +de +des +du +elle +en +et +eux +il +je +la +le +leur +lui +ma +mais +me +même +mes +moi +mon +ne +nos +notre +nous +on +ou +par +pas +pour +qu +que +qui +sa +se +ses +son +sur +ta +te +tes +toi +ton +tu +un +une +vos +votre +vous +c +d +j +l +à +m +n +s +t +y +été +étée +étées +étés +étant +étante +étants +étantes +suis +es +est +sommes +êtes +sont +serai +seras +sera +serons +serez +seront +serais +serait +serions +seriez +seraient +étais +était +étions +étiez +étaient +fus +fut +fûmes +fûtes +furent +sois +soit +soyons +soyez +soient +fusse +fusses +fût +fussions +fussiez +fussent +ayant +ayante +ayantes +ayants +eu +eue +eues +eus +ai +as +avons +avez +ont +aurai +auras +aura +aurons +aurez +auront +aurais +aurait +aurions +auriez +auraient +avais +avait +avions +aviez +avaient +eut +eûmes +eûtes +eurent +aie +aies +ait +ayons +ayez +aient +eusse +eusses +eût +eussions +eussiez +eussent diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/german b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/german new file mode 100644 index 00000000..edef220b --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/german @@ -0,0 +1,231 @@ +aber +alle +allem +allen +aller +alles +als +also +am +an +ander +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders +auch +auf +aus +bei +bin +bis +bist +da +damit +dann +der +den +des +dem +die +das +daß +derselbe +derselben +denselben +desselben +demselben +dieselbe +dieselben +dasselbe +dazu +dein +deine +deinem +deinen +deiner +deines +denn +derer +dessen +dich +dir +du +dies +diese +diesem +diesen +dieser +dieses +doch +dort +durch +ein +eine +einem +einen +einer +eines +einig +einige +einigem +einigen +einiger +einiges +einmal +er +ihn +ihm +es +etwas +euer +eure +eurem +euren +eurer +eures +für +gegen +gewesen +hab +habe +haben +hat +hatte +hatten +hier +hin +hinter +ich +mich +mir +ihr +ihre +ihrem +ihren +ihrer +ihres +euch +im +in +indem +ins +ist +jede +jedem +jeden +jeder +jedes +jene +jenem +jenen +jener +jenes +jetzt +kann +kein +keine +keinem +keinen +keiner +keines +können +könnte +machen +man +manche +manchem +manchen +mancher +manches +mein +meine +meinem +meinen +meiner +meines +mit +muss +musste +nach +nicht +nichts +noch +nun +nur +ob +oder +ohne +sehr +sein +seine +seinem +seinen +seiner +seines +selbst +sich +sie +ihnen +sind +so +solche +solchem +solchen +solcher +solches +soll +sollte +sondern +sonst +über +um +und +uns +unse +unsem +unsen +unser +unses +unter +viel +vom +von +vor +während +war +waren +warst +was +weg +weil +weiter +welche +welchem +welchen +welcher +welches +wenn +werde +werden +wie +wieder +will +wir +wird +wirst +wo +wollen +wollte +würde +würden +zu +zum +zur +zwar +zwischen diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/hungarian b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/hungarian new file mode 100644 index 00000000..94e9f9a0 --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/hungarian @@ -0,0 +1,199 @@ +a +ahogy +ahol +aki +akik +akkor +alatt +által +általában +amely +amelyek +amelyekben +amelyeket +amelyet +amelynek +ami +amit +amolyan +amíg +amikor +át +abban +ahhoz +annak +arra +arról +az +azok +azon +azt +azzal +azért +aztán +azután +azonban +bár +be +belül +benne +cikk +cikkek +cikkeket +csak +de +e +eddig +egész +egy +egyes +egyetlen +egyéb +egyik +egyre +ekkor +el +elég +ellen +elõ +elõször +elõtt +elsõ +én +éppen +ebben +ehhez +emilyen +ennek +erre +ez +ezt +ezek +ezen +ezzel +ezért +és +fel +felé +hanem +hiszen +hogy +hogyan +igen +így +illetve +ill. +ill +ilyen +ilyenkor +ison +ismét +itt +jó +jól +jobban +kell +kellett +keresztül +keressünk +ki +kívül +között +közül +legalább +lehet +lehetett +legyen +lenne +lenni +lesz +lett +maga +magát +majd +majd +már +más +másik +meg +még +mellett +mert +mely +melyek +mi +mit +míg +miért +milyen +mikor +minden +mindent +mindenki +mindig +mint +mintha +mivel +most +nagy +nagyobb +nagyon +ne +néha +nekem +neki +nem +néhány +nélkül +nincs +olyan +ott +össze +õ +õk +õket +pedig +persze +rá +s +saját +sem +semmi +sok +sokat +sokkal +számára +szemben +szerint +szinte +talán +tehát +teljes +tovább +továbbá +több +úgy +ugyanis +új +újabb +újra +után +utána +utolsó +vagy +vagyis +valaki +valami +valamint +való +vagyok +van +vannak +volt +voltam +voltak +voltunk +vissza +vele +viszont +volna diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/italian b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/italian new file mode 100644 index 00000000..6ee02b51 --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/italian @@ -0,0 +1,279 @@ +ad +al +allo +ai +agli +all +agl +alla +alle +con +col +coi +da +dal +dallo +dai +dagli +dall +dagl +dalla +dalle +di +del +dello +dei +degli +dell +degl +della +delle +in +nel +nello +nei +negli +nell +negl +nella +nelle +su +sul +sullo +sui +sugli +sull +sugl +sulla +sulle +per +tra +contro +io +tu +lui +lei +noi +voi +loro +mio +mia +miei +mie +tuo +tua +tuoi +tue +suo +sua +suoi +sue +nostro +nostra +nostri +nostre +vostro +vostra +vostri +vostre +mi +ti +ci +vi +lo +la +li +le +gli +ne +il +un +uno +una +ma +ed +se +perché +anche +come +dov +dove +che +chi +cui +non +più +quale +quanto +quanti +quanta +quante +quello +quelli +quella +quelle +questo +questi +questa +queste +si +tutto +tutti +a +c +e +i +l +o +ho +hai +ha +abbiamo +avete +hanno +abbia +abbiate +abbiano +avrò +avrai +avrà +avremo +avrete +avranno +avrei +avresti +avrebbe +avremmo +avreste +avrebbero +avevo +avevi +aveva +avevamo +avevate +avevano +ebbi +avesti +ebbe +avemmo +aveste +ebbero +avessi +avesse +avessimo +avessero +avendo +avuto +avuta +avuti +avute +sono +sei +è +siamo +siete +sia +siate +siano +sarò +sarai +sarà +saremo +sarete +saranno +sarei +saresti +sarebbe +saremmo +sareste +sarebbero +ero +eri +era +eravamo +eravate +erano +fui +fosti +fu +fummo +foste +furono +fossi +fosse +fossimo +fossero +essendo +faccio +fai +facciamo +fanno +faccia +facciate +facciano +farò +farai +farà +faremo +farete +faranno +farei +faresti +farebbe +faremmo +fareste +farebbero +facevo +facevi +faceva +facevamo +facevate +facevano +feci +facesti +fece +facemmo +faceste +fecero +facessi +facesse +facessimo +facessero +facendo +sto +stai +sta +stiamo +stanno +stia +stiate +stiano +starò +starai +starà +staremo +starete +staranno +starei +staresti +starebbe +staremmo +stareste +starebbero +stavo +stavi +stava +stavamo +stavate +stavano +stetti +stesti +stette +stemmo +steste +stettero +stessi +stesse +stessimo +stessero +stando diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/norwegian b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/norwegian new file mode 100644 index 00000000..9ac1abbb --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/norwegian @@ -0,0 +1,176 @@ +og +i +jeg +det +at +en +et +den +til +er +som +på +de +med +han +av +ikke +ikkje +der +så +var +meg +seg +men +ett +har +om +vi +min +mitt +ha +hadde +hun +nå +over +da +ved +fra +du +ut +sin +dem +oss +opp +man +kan +hans +hvor +eller +hva +skal +selv +sjøl +her +alle +vil +bli +ble +blei +blitt +kunne +inn +når +være +kom +noen +noe +ville +dere +som +deres +kun +ja +etter +ned +skulle +denne +for +deg +si +sine +sitt +mot +å +meget +hvorfor +dette +disse +uten +hvordan +ingen +din +ditt +blir +samme +hvilken +hvilke +sånn +inni +mellom +vår +hver +hvem +vors +hvis +både +bare +enn +fordi +før +mange +også +slik +vært +være +båe +begge +siden +dykk +dykkar +dei +deira +deires +deim +di +då +eg +ein +eit +eitt +elles +honom +hjå +ho +hoe +henne +hennar +hennes +hoss +hossen +ikkje +ingi +inkje +korleis +korso +kva +kvar +kvarhelst +kven +kvi +kvifor +me +medan +mi +mine +mykje +no +nokon +noka +nokor +noko +nokre +si +sia +sidan +so +somt +somme +um +upp +vere +vore +verte +vort +varte +vart diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/portuguese b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/portuguese new file mode 100644 index 00000000..6b247786 --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/portuguese @@ -0,0 +1,203 @@ +de +a +o +que +e +do +da +em +um +para +com +não +uma +os +no +se +na +por +mais +as +dos +como +mas +ao +ele +das +à +seu +sua +ou +quando +muito +nos +já +eu +também +só +pelo +pela +até +isso +ela +entre +depois +sem +mesmo +aos +seus +quem +nas +me +esse +eles +você +essa +num +nem +suas +meu +às +minha +numa +pelos +elas +qual +nós +lhe +deles +essas +esses +pelas +este +dele +tu +te +vocês +vos +lhes +meus +minhas +teu +tua +teus +tuas +nosso +nossa +nossos +nossas +dela +delas +esta +estes +estas +aquele +aquela +aqueles +aquelas +isto +aquilo +estou +está +estamos +estão +estive +esteve +estivemos +estiveram +estava +estávamos +estavam +estivera +estivéramos +esteja +estejamos +estejam +estivesse +estivéssemos +estivessem +estiver +estivermos +estiverem +hei +há +havemos +hão +houve +houvemos +houveram +houvera +houvéramos +haja +hajamos +hajam +houvesse +houvéssemos +houvessem +houver +houvermos +houverem +houverei +houverá +houveremos +houverão +houveria +houveríamos +houveriam +sou +somos +são +era +éramos +eram +fui +foi +fomos +foram +fora +fôramos +seja +sejamos +sejam +fosse +fôssemos +fossem +for +formos +forem +serei +será +seremos +serão +seria +seríamos +seriam +tenho +tem +temos +tém +tinha +tínhamos +tinham +tive +teve +tivemos +tiveram +tivera +tivéramos +tenha +tenhamos +tenham +tivesse +tivéssemos +tivessem +tiver +tivermos +tiverem +terei +terá +teremos +terão +teria +teríamos +teriam diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/russian b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/russian new file mode 100644 index 00000000..ecb83d4a --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/russian @@ -0,0 +1,151 @@ +и +в +во +не +что +он +на +я +с +со +как +а +то +все +она +так +его +но +да +ты +к +у +же +вы +за +бы +по +только +ее +мне +было +вот +от +меня +еще +нет +о +из +ему +теперь +когда +даже +ну +вдруг +ли +если +уже +или +ни +быть +был +него +до +вас +нибудь +опять +уж +вам +ведь +там +потом +себя +ничего +ей +может +они +тут +где +есть +надо +ней +для +мы +тебя +их +чем +была +сам +чтоб +без +будто +чего +раз +тоже +себе +под +будет +ж +тогда +кто +этот +того +потому +этого +какой +совсем +ним +здесь +этом +один +почти +мой +тем +чтобы +нее +сейчас +были +куда +зачем +всех +никогда +можно +при +наконец +два +об +другой +хоть +после +над +больше +тот +через +эти +нас +про +всего +них +какая +много +разве +три +эту +моя +впрочем +хорошо +свою +этой +перед +иногда +лучше +чуть +том +нельзя +такой +им +более +всегда +конечно +всю +между diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/spanish b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/spanish new file mode 100644 index 00000000..59bc786c --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/spanish @@ -0,0 +1,313 @@ +de +la +que +el +en +y +a +los +del +se +las +por +un +para +con +no +una +su +al +lo +como +más +pero +sus +le +ya +o +este +sí +porque +esta +entre +cuando +muy +sin +sobre +también +me +hasta +hay +donde +quien +desde +todo +nos +durante +todos +uno +les +ni +contra +otros +ese +eso +ante +ellos +e +esto +mí +antes +algunos +qué +unos +yo +otro +otras +otra +él +tanto +esa +estos +mucho +quienes +nada +muchos +cual +poco +ella +estar +estas +algunas +algo +nosotros +mi +mis +tú +te +ti +tu +tus +ellas +nosotras +vosostros +vosostras +os +mío +mía +míos +mías +tuyo +tuya +tuyos +tuyas +suyo +suya +suyos +suyas +nuestro +nuestra +nuestros +nuestras +vuestro +vuestra +vuestros +vuestras +esos +esas +estoy +estás +está +estamos +estáis +están +esté +estés +estemos +estéis +estén +estaré +estarás +estará +estaremos +estaréis +estarán +estaría +estarías +estaríamos +estaríais +estarían +estaba +estabas +estábamos +estabais +estaban +estuve +estuviste +estuvo +estuvimos +estuvisteis +estuvieron +estuviera +estuvieras +estuviéramos +estuvierais +estuvieran +estuviese +estuvieses +estuviésemos +estuvieseis +estuviesen +estando +estado +estada +estados +estadas +estad +he +has +ha +hemos +habéis +han +haya +hayas +hayamos +hayáis +hayan +habré +habrás +habrá +habremos +habréis +habrán +habría +habrías +habríamos +habríais +habrían +había +habías +habíamos +habíais +habían +hube +hubiste +hubo +hubimos +hubisteis +hubieron +hubiera +hubieras +hubiéramos +hubierais +hubieran +hubiese +hubieses +hubiésemos +hubieseis +hubiesen +habiendo +habido +habida +habidos +habidas +soy +eres +es +somos +sois +son +sea +seas +seamos +seáis +sean +seré +serás +será +seremos +seréis +serán +sería +serías +seríamos +seríais +serían +era +eras +éramos +erais +eran +fui +fuiste +fue +fuimos +fuisteis +fueron +fuera +fueras +fuéramos +fuerais +fueran +fuese +fueses +fuésemos +fueseis +fuesen +sintiendo +sentido +sentida +sentidos +sentidas +siente +sentid +tengo +tienes +tiene +tenemos +tenéis +tienen +tenga +tengas +tengamos +tengáis +tengan +tendré +tendrás +tendrá +tendremos +tendréis +tendrán +tendría +tendrías +tendríamos +tendríais +tendrían +tenía +tenías +teníamos +teníais +tenían +tuve +tuviste +tuvo +tuvimos +tuvisteis +tuvieron +tuviera +tuvieras +tuviéramos +tuvierais +tuvieran +tuviese +tuvieses +tuviésemos +tuvieseis +tuviesen +teniendo +tenido +tenida +tenidos +tenidas +tened diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/swedish b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/swedish new file mode 100644 index 00000000..742bb626 --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/swedish @@ -0,0 +1,114 @@ +och +det +att +i +en +jag +hon +som +han +på +den +med +var +sig +för +så +till +är +men +ett +om +hade +de +av +icke +mig +du +henne +då +sin +nu +har +inte +hans +honom +skulle +hennes +där +min +man +ej +vid +kunde +något +från +ut +när +efter +upp +vi +dem +vara +vad +över +än +dig +kan +sina +här +ha +mot +alla +under +någon +eller +allt +mycket +sedan +ju +denna +själv +detta +åt +utan +varit +hur +ingen +mitt +ni +bli +blev +oss +din +dessa +några +deras +blir +mina +samma +vilken +er +sådan +vår +blivit +dess +inom +mellan +sådant +varför +varje +vilka +ditt +vem +vilket +sitta +sådana +vart +dina +vars +vårt +våra +ert +era +vilkas diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/turkish b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/turkish new file mode 100644 index 00000000..5a48ccce --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/data/stopwords/turkish @@ -0,0 +1,53 @@ +acaba +ama +aslında +az +bazı +belki +biri +birkaç +birşey +biz +bu +çok +çünkü +da +daha +de +defa +diye +eğer +en +gibi +hem +hep +hepsi +her +hiç +için +ile +ise +kez +ki +kim +mı +mu +mü +nasıl +ne +neden +nerde +nerede +nereye +niçin +niye +o +sanki +şey +siz +şu +tüm +ve +veya +ya +yani