From 9acfc59daeacdcb520bf61732bab17ef1a33a4e8 Mon Sep 17 00:00:00 2001 From: rylon Date: Sun, 16 Jun 2013 19:22:13 +0200 Subject: [PATCH] 29% acc, stopwords, fixed bayes? --- .../Uebungen/4_Uebung/G22_predictions.txt | 210 +++++++++--------- .../Uebungen/4_Uebung/code/naive_bayes.py | 23 +- 2 files changed, 117 insertions(+), 116 deletions(-) diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/G22_predictions.txt b/ss2013/1_Web Mining/Uebungen/4_Uebung/G22_predictions.txt index e761e1ac..1ca8157f 100644 --- a/ss2013/1_Web Mining/Uebungen/4_Uebung/G22_predictions.txt +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/G22_predictions.txt @@ -1,165 +1,165 @@ -00f0316054ddf9504f87ea28e73683b6.txt learned -025338a8f0b21608e843df13d54c8c70.txt learned +00f0316054ddf9504f87ea28e73683b6.txt belles_lettres +025338a8f0b21608e843df13d54c8c70.txt belles_lettres 05aac70e552dd51430af3c194ad0fdec.txt learned 083ae11a870e96d1f5c9835eaf48118c.txt learned -08932387850eea34daae545225dcf8a2.txt learned -08bfc610c065764b9dfec4eed039fa69.txt learned -096fddf36cb7d1de3f236d85bdefb938.txt learned -0bd46492f5ad09df3b80cb62a335e689.txt learned -0c16f57f865c4e0e072546097b1d4adb.txt learned +08932387850eea34daae545225dcf8a2.txt belles_lettres +08bfc610c065764b9dfec4eed039fa69.txt belles_lettres +096fddf36cb7d1de3f236d85bdefb938.txt belles_lettres +0bd46492f5ad09df3b80cb62a335e689.txt belles_lettres +0c16f57f865c4e0e072546097b1d4adb.txt belles_lettres 0c267d9b541a7d3e07eb0e841609b307.txt learned 0f0c1baf6f76f3e16cac7855405029a6.txt learned -100ec74fe0d1dd74956246df46d9b845.txt learned -10609aabf1b727d9728d96ffe9064f11.txt learned -1361da4407b0db04d4fd9b0dc51331b8.txt learned +100ec74fe0d1dd74956246df46d9b845.txt belles_lettres +10609aabf1b727d9728d96ffe9064f11.txt belles_lettres +1361da4407b0db04d4fd9b0dc51331b8.txt belles_lettres 13a8cf112cb61bb237b91bffac75f506.txt learned -14c10df92c36df39ec7d541654884a6f.txt learned -15d5e6a3a826d0471c4dec0215169c94.txt learned +14c10df92c36df39ec7d541654884a6f.txt belles_lettres +15d5e6a3a826d0471c4dec0215169c94.txt belles_lettres 17543675c21f3d1961df70e4bc05b677.txt learned 1af257d94c2c7c31650edd49fe5c3888.txt learned -1da789efbc92bb26ab551c749a498714.txt learned -21ee90d49d229072cdfe3373f386fbc6.txt learned -2269c21867d5c492b2e223bc5589897e.txt learned -233809d44fe0b4625aba2b21e2a090c3.txt learned +1da789efbc92bb26ab551c749a498714.txt belles_lettres +21ee90d49d229072cdfe3373f386fbc6.txt belles_lettres +2269c21867d5c492b2e223bc5589897e.txt belles_lettres +233809d44fe0b4625aba2b21e2a090c3.txt belles_lettres 2448920f7507a90ae5de9895518a4256.txt learned 25544eed4ec559512c188e8d73c61576.txt learned -280024d9ca375fe894e0c3852153d91c.txt learned +280024d9ca375fe894e0c3852153d91c.txt belles_lettres 2ae543a13eb502dfb34efcd691af4c19.txt learned -2bfe3d3546118761639b703dc042174b.txt learned -3068c168367e3ed5cac6af3bde2e566e.txt learned -31307aa6842b932e7f3073b253b687d4.txt learned -32e8d2431fed46743b954c35de544335.txt learned +2bfe3d3546118761639b703dc042174b.txt belles_lettres +3068c168367e3ed5cac6af3bde2e566e.txt belles_lettres +31307aa6842b932e7f3073b253b687d4.txt belles_lettres +32e8d2431fed46743b954c35de544335.txt belles_lettres 332614e6d84d25bcb0724247debae9a6.txt learned -3455cd8f4a88bbe179c3d16ff2d08aed.txt learned +3455cd8f4a88bbe179c3d16ff2d08aed.txt belles_lettres 3463870779e91a0b3ef42dcb5614c417.txt learned -347c4ff3005261de62a1350cf3552db1.txt learned +347c4ff3005261de62a1350cf3552db1.txt belles_lettres 371e9a205208a5fe2d058b1373246b06.txt learned 3a0e840d849fb693fb0350a9bca049a7.txt learned -3cfe918d71f0216d698a656bb261754a.txt learned -42f560bacaae1ac7960efdafc40c9957.txt learned -444c4cd32cbc3f38551a7cddc23c65bb.txt learned -461965dbfcd3a75d610b913fd51b93e5.txt learned -46795cf89bb03979cf64942c96be6fa1.txt learned +3cfe918d71f0216d698a656bb261754a.txt belles_lettres +42f560bacaae1ac7960efdafc40c9957.txt belles_lettres +444c4cd32cbc3f38551a7cddc23c65bb.txt belles_lettres +461965dbfcd3a75d610b913fd51b93e5.txt belles_lettres +46795cf89bb03979cf64942c96be6fa1.txt belles_lettres 46ace5b2774edd552502d72d113a2537.txt learned 48ffdc34faa528fe84ba1575ad6cf022.txt learned -4a8178c328135fabac148a10a7dbd795.txt learned +4a8178c328135fabac148a10a7dbd795.txt belles_lettres 4ad03bf39d4b20405d92877d8a2d620c.txt learned -4b1042b36a2e8d19883107213a55d4fd.txt learned +4b1042b36a2e8d19883107213a55d4fd.txt belles_lettres 4cd9f5cf912b67d8d541cf805e35ec9d.txt learned -4d1f93581f8df325a0a8fd9df3a60f49.txt learned -4e7a71284825f9b8302c914b3bf65c41.txt learned -4f8b6422ab5ad965d2925bb93f1a5ad1.txt learned -4feeaa056745eaa93855a6d05cc21d20.txt learned -5122f89d4fff6ec6e26062ded7c5387e.txt learned +4d1f93581f8df325a0a8fd9df3a60f49.txt belles_lettres +4e7a71284825f9b8302c914b3bf65c41.txt belles_lettres +4f8b6422ab5ad965d2925bb93f1a5ad1.txt belles_lettres +4feeaa056745eaa93855a6d05cc21d20.txt belles_lettres +5122f89d4fff6ec6e26062ded7c5387e.txt belles_lettres 5185857492e797eb189d39ded8a8b64f.txt learned 53115e407b6ae7d1d6b90edd4ac7f2b7.txt learned -541e21b0a2ab6b31a44b787ffef004d7.txt learned +541e21b0a2ab6b31a44b787ffef004d7.txt belles_lettres 5759e663a1214223b2068cf85e891953.txt learned -5a17378f15a3eaac38b1245f842cd0d6.txt learned +5a17378f15a3eaac38b1245f842cd0d6.txt belles_lettres 5a3733909b787420f2ae4a84095d90b6.txt learned 5a6fe4735711b757130334f30a5c0d8e.txt learned -5cfb1bed9bb97b6a0aabd93ea65d677b.txt learned -5db5250d2936c795389841699a64b1dc.txt learned -5e9a239de5aeb08b0713d0245fc914c7.txt learned -5f606972d66ed49044f3eadaf4eb2a54.txt learned -60e338de63774c5ef4e7beba18bc6577.txt learned +5cfb1bed9bb97b6a0aabd93ea65d677b.txt belles_lettres +5db5250d2936c795389841699a64b1dc.txt belles_lettres +5e9a239de5aeb08b0713d0245fc914c7.txt belles_lettres +5f606972d66ed49044f3eadaf4eb2a54.txt belles_lettres +60e338de63774c5ef4e7beba18bc6577.txt belles_lettres 61f7508fa32ee25eb9ee4cf982eb6d27.txt learned 625237d5189df7054c13e62318cd9819.txt learned -63167efcd7a7bdbd4b742f6e482312f4.txt learned -635e2c48ef4a37462fd8a4cd17375c5c.txt learned +63167efcd7a7bdbd4b742f6e482312f4.txt belles_lettres +635e2c48ef4a37462fd8a4cd17375c5c.txt belles_lettres 64812690c6155fba3f1aba0514496dd9.txt learned 65f1d037cb5f92da6605cea6d0d703d0.txt learned -66abce82b770b4368691f2926f87089e.txt learned -691c1e5e341a19e59b27dfb4f71fc0e0.txt learned +66abce82b770b4368691f2926f87089e.txt belles_lettres +691c1e5e341a19e59b27dfb4f71fc0e0.txt belles_lettres 712b9c9622c73dbb0e6dc5ba2c231cf0.txt learned -71aa03bfef20157578b6b613174d3fe6.txt learned +71aa03bfef20157578b6b613174d3fe6.txt belles_lettres 71f153ecdef94026a97b635a40b375c8.txt learned 7341b4fda4d972adfbf854a0d6be3400.txt learned 74486d71097c34544195b52bdd844839.txt learned 745df40e8d2ba4bf6abfcb197c65359c.txt learned -74a1421e246c3ffc08398609f75e292c.txt learned -784346fad149c3736d309036e925526c.txt learned +74a1421e246c3ffc08398609f75e292c.txt belles_lettres +784346fad149c3736d309036e925526c.txt belles_lettres 787d5f0883aa5fa768a624c226fc7294.txt learned -791f3304bbd155e0211904d1d002b081.txt learned -7a297cedd35c3ffb12ab6011d34f1244.txt learned -7c809ae6732c39ea9a020a307ff35b3a.txt learned -7f8b847188c77b75a2b00e906e0ae693.txt learned -805ea08c406a72dbff755a3627aeb677.txt learned -8459fa5551ec11ae82c5fc404f2b3988.txt learned -853f9d4b400a22d2abbf0f2e17d6ae33.txt learned +791f3304bbd155e0211904d1d002b081.txt belles_lettres +7a297cedd35c3ffb12ab6011d34f1244.txt belles_lettres +7c809ae6732c39ea9a020a307ff35b3a.txt belles_lettres +7f8b847188c77b75a2b00e906e0ae693.txt belles_lettres +805ea08c406a72dbff755a3627aeb677.txt belles_lettres +8459fa5551ec11ae82c5fc404f2b3988.txt belles_lettres +853f9d4b400a22d2abbf0f2e17d6ae33.txt belles_lettres 864ff44244fb6229ba79ce3df93df701.txt learned -8758b603d3ce23de68cbd13665a128d4.txt learned -87d7774f30d9221f856bab02a3f5ffc4.txt learned -8b2d2ff3e27f2d56f5c51f85c2754cf9.txt learned -8babd57d7cbd695d8c04d698626593e8.txt learned +8758b603d3ce23de68cbd13665a128d4.txt belles_lettres +87d7774f30d9221f856bab02a3f5ffc4.txt belles_lettres +8b2d2ff3e27f2d56f5c51f85c2754cf9.txt belles_lettres +8babd57d7cbd695d8c04d698626593e8.txt belles_lettres 8ce16ec688419c614801d5c29cec6153.txt learned 8d2066cd72a448eb69348dbb68f754d8.txt learned -8fb3df3b7d96dc4383c84447a4fdd1a3.txt learned -9101cbf87bfd4ef26e71f5b8c1e61d18.txt learned -93c4b35148e7dcb767ea607fe7edf2c3.txt learned +8fb3df3b7d96dc4383c84447a4fdd1a3.txt belles_lettres +9101cbf87bfd4ef26e71f5b8c1e61d18.txt belles_lettres +93c4b35148e7dcb767ea607fe7edf2c3.txt belles_lettres 990e5a79b032e5cb9ab3e56cab71a6ef.txt learned -9b9ed2005178bb6098ae874260128fc6.txt learned -9c97ea8f2d4dea9c31ebe73765f2396b.txt learned +9b9ed2005178bb6098ae874260128fc6.txt belles_lettres +9c97ea8f2d4dea9c31ebe73765f2396b.txt belles_lettres 9f08d188f8174081f5b02a7f07668846.txt learned 9f9b19682a8401fd40bce446f33d508b.txt learned 9fe0cd0d62c294ed1bc7b29e7e65c18a.txt learned 9febf62c0e6509f3e1ad065a5a6aef8d.txt learned a03db0b1e3bb05fc0f961d2a655e8dad.txt learned -a716803991f9713e7986d252e26e7382.txt learned -a98e64947521853ff24f52e12b77c789.txt learned -aa5156a64316e6836b14c61879d80712.txt learned -ac848bdeda712352e09e5fa392be4574.txt learned -ad12792f75798b70a59b37178798e145.txt learned -ad3b98d2d08faf751ccfd7f8d0b4f045.txt learned -af3d510667a872139daf2df8c2a17c1e.txt learned -b07fc0f7edd49dcd538372888095d3d6.txt learned -b303c034152030a3594d72626d1f784d.txt learned -b31afca8898a09c9087b272701d61c89.txt learned -b3346fa7bed6f5b9ad06bc831c59ad6c.txt learned -b3681b289f0dd87a5c1f9573cd825866.txt learned +a716803991f9713e7986d252e26e7382.txt belles_lettres +a98e64947521853ff24f52e12b77c789.txt belles_lettres +aa5156a64316e6836b14c61879d80712.txt belles_lettres +ac848bdeda712352e09e5fa392be4574.txt belles_lettres +ad12792f75798b70a59b37178798e145.txt belles_lettres +ad3b98d2d08faf751ccfd7f8d0b4f045.txt belles_lettres +af3d510667a872139daf2df8c2a17c1e.txt belles_lettres +b07fc0f7edd49dcd538372888095d3d6.txt belles_lettres +b303c034152030a3594d72626d1f784d.txt belles_lettres +b31afca8898a09c9087b272701d61c89.txt belles_lettres +b3346fa7bed6f5b9ad06bc831c59ad6c.txt belles_lettres +b3681b289f0dd87a5c1f9573cd825866.txt belles_lettres b4d65c8e57797e496834f5f6d9d3e49e.txt learned -b65707c01e68cc6d4d59e18d9f98f423.txt learned -b8a039ba1694ce7ce87737ce5c7480d8.txt learned +b65707c01e68cc6d4d59e18d9f98f423.txt belles_lettres +b8a039ba1694ce7ce87737ce5c7480d8.txt belles_lettres b998ac20277e09a1c3fecbdfb028b33a.txt learned -ba6843edc446617d1e6e5ec53246d849.txt learned -bb6d375a8b847c7c10f9bdbf7324eb03.txt learned -bbbda4cef7aeb20352c9f1d9b453a9e5.txt learned +ba6843edc446617d1e6e5ec53246d849.txt belles_lettres +bb6d375a8b847c7c10f9bdbf7324eb03.txt belles_lettres +bbbda4cef7aeb20352c9f1d9b453a9e5.txt belles_lettres be6f1bd428b9933bedbc6bd401868415.txt learned -bf8ce15b10cb746bb1181645a42012db.txt learned +bf8ce15b10cb746bb1181645a42012db.txt belles_lettres bfd0a578b0ec650d83963ddcf443f7a1.txt learned c1bdfb06016223b3b2c5e03e02af81f3.txt learned -c22274385e9d77bbb900ef9db6ef66ff.txt learned -c39fda6fbf81d87bb6508b1bbe7faf93.txt learned +c22274385e9d77bbb900ef9db6ef66ff.txt belles_lettres +c39fda6fbf81d87bb6508b1bbe7faf93.txt belles_lettres c5a19f446f960c849d67b25238a08397.txt learned -c65f6ecdb1ba01da0e6525dd525621e1.txt learned +c65f6ecdb1ba01da0e6525dd525621e1.txt belles_lettres c942ba590a82fd0827b79e3d6bfb25d3.txt learned c9497d141930518b8005ba352b4d1637.txt learned -cb24d378b3966cf4f3f663f8b13430f2.txt learned +cb24d378b3966cf4f3f663f8b13430f2.txt belles_lettres ce39b27592fc593d0ee117651b072cc1.txt learned -ceacd82d3757974d93538f67b74bc25e.txt learned -cfdd298764ed82fa2304e427dcb53db9.txt learned -d027a28847a6228383dd9594f0984bdf.txt learned -d1f9469856a51f6007f0f785aadf8c1f.txt learned -d59cd5ad1285a9094a1f82a67fe4ba7b.txt learned -d5aa7d7a519c1600db10ad01a00a7e3a.txt learned -d86c9cee65263cdfddbfaaffab1aeeb7.txt learned -dc713f9e699e9e610b458b5c991ce514.txt learned -dc89c7bfd3f0eefd385f0a81c1a59981.txt learned +ceacd82d3757974d93538f67b74bc25e.txt belles_lettres +cfdd298764ed82fa2304e427dcb53db9.txt belles_lettres +d027a28847a6228383dd9594f0984bdf.txt belles_lettres +d1f9469856a51f6007f0f785aadf8c1f.txt belles_lettres +d59cd5ad1285a9094a1f82a67fe4ba7b.txt belles_lettres +d5aa7d7a519c1600db10ad01a00a7e3a.txt belles_lettres +d86c9cee65263cdfddbfaaffab1aeeb7.txt belles_lettres +dc713f9e699e9e610b458b5c991ce514.txt belles_lettres +dc89c7bfd3f0eefd385f0a81c1a59981.txt belles_lettres dc9a7b20833ff389ae573597095f253d.txt learned -dcacb995ec95ede56ba389128922603c.txt learned -dd1a33aada4ffb0564f709c10b95cedc.txt learned +dcacb995ec95ede56ba389128922603c.txt belles_lettres +dd1a33aada4ffb0564f709c10b95cedc.txt belles_lettres e058a15d26f17f7193a032eed51bbbfc.txt learned e2daacfa9c33ea659beaa1a7763bfe57.txt learned e43c7ff67adf6fdd0710c0ec91776481.txt learned e852750e57424cf3e5968b6a3f642553.txt learned e88e97dfcade103cef59919bf49f46d3.txt learned eb6bf7af7572cc1fa1a9aa36c0d0feb3.txt learned -ecf327ee7344767f939a3e7695607be5.txt learned +ecf327ee7344767f939a3e7695607be5.txt belles_lettres ef98917ffbb5b1f6e3ce0428d47f2f23.txt learned f083fda6715b3b3860162e8367ea1209.txt learned -f2b173d5ffa6eda874a71aea5ba076d2.txt learned -f3b16a0072a6afc3a64e592f6c8ab78b.txt learned -f433e3a3fdf6455b68183790d72f7fd8.txt learned -f7099ffdcda8a3e231652cdfbdfe1d26.txt learned -fc97d173fc6d18448bd334ccdbf36e4c.txt learned -fdcc797bb8b504885a2ce07017555f33.txt learned +f2b173d5ffa6eda874a71aea5ba076d2.txt belles_lettres +f3b16a0072a6afc3a64e592f6c8ab78b.txt belles_lettres +f433e3a3fdf6455b68183790d72f7fd8.txt belles_lettres +f7099ffdcda8a3e231652cdfbdfe1d26.txt belles_lettres +fc97d173fc6d18448bd334ccdbf36e4c.txt belles_lettres +fdcc797bb8b504885a2ce07017555f33.txt belles_lettres diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py b/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py index e598eb71..5fb34d94 100644 --- a/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py @@ -115,13 +115,14 @@ class multiclassClassifier: f = open(pathToFile, 'r') lines = f.readlines(); for line in lines: - thisline = line.split(" "); - for word in thisline: + thisline = line.split(" "); + for word in thisline: word = self.clean_word(word) - if dictonary.has_key(word): - dictonary[str(word)] += 1 - else: - dictonary[str(word)] = 1 + if word <> "": + if dictonary.has_key(word): + dictonary[str(word)] += 1 + else: + dictonary[str(word)] = 1 f.close() return dictonary @@ -134,11 +135,11 @@ class multiclassClassifier: word = self.clean_word(word) if word <> "": #result = 1.0 - for word in text: - if termfrequenciesOfClasses.has_key(str(word)): - result += math.log((termfrequenciesOfClasses[word]+1.)/(termCount+1)) - else: - result += math.log(1./(termCount+1)) + #for word in text: + if termfrequenciesOfClasses.has_key(str(word)): + result += math.log((termfrequenciesOfClasses[word]+1.)/(termCount+1)) + else: + result += math.log(1./(termCount+1)) result += math.log(percentage) return result