From 3126edc59919c8fd64df12e784a72be416fc8ed7 Mon Sep 17 00:00:00 2001 From: rylon Date: Sun, 16 Jun 2013 21:11:42 +0200 Subject: [PATCH] =?UTF-8?q?fixes=20f=C3=BCr=20clasifizierer=20->=2021%=20e?= =?UTF-8?q?rkennchance,?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Uebungen/4_Uebung/G22_predictions.txt | 302 +++++++-------- .../Uebungen/4_Uebung/code/PorterStemmer.py | 344 ++++++++++++++++++ .../Uebungen/4_Uebung/code/naive_bayes.py | 61 +++- 3 files changed, 548 insertions(+), 159 deletions(-) create mode 100644 ss2013/1_Web Mining/Uebungen/4_Uebung/code/PorterStemmer.py diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/G22_predictions.txt b/ss2013/1_Web Mining/Uebungen/4_Uebung/G22_predictions.txt index 1ca8157f..4b2e1884 100644 --- a/ss2013/1_Web Mining/Uebungen/4_Uebung/G22_predictions.txt +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/G22_predictions.txt @@ -1,165 +1,165 @@ 00f0316054ddf9504f87ea28e73683b6.txt belles_lettres -025338a8f0b21608e843df13d54c8c70.txt belles_lettres -05aac70e552dd51430af3c194ad0fdec.txt learned -083ae11a870e96d1f5c9835eaf48118c.txt learned -08932387850eea34daae545225dcf8a2.txt belles_lettres -08bfc610c065764b9dfec4eed039fa69.txt belles_lettres +025338a8f0b21608e843df13d54c8c70.txt news +05aac70e552dd51430af3c194ad0fdec.txt belles_lettres +083ae11a870e96d1f5c9835eaf48118c.txt news +08932387850eea34daae545225dcf8a2.txt lore +08bfc610c065764b9dfec4eed039fa69.txt lore 096fddf36cb7d1de3f236d85bdefb938.txt belles_lettres 0bd46492f5ad09df3b80cb62a335e689.txt belles_lettres 0c16f57f865c4e0e072546097b1d4adb.txt belles_lettres -0c267d9b541a7d3e07eb0e841609b307.txt learned -0f0c1baf6f76f3e16cac7855405029a6.txt learned -100ec74fe0d1dd74956246df46d9b845.txt belles_lettres +0c267d9b541a7d3e07eb0e841609b307.txt hobbies +0f0c1baf6f76f3e16cac7855405029a6.txt news +100ec74fe0d1dd74956246df46d9b845.txt fiction 10609aabf1b727d9728d96ffe9064f11.txt belles_lettres -1361da4407b0db04d4fd9b0dc51331b8.txt belles_lettres -13a8cf112cb61bb237b91bffac75f506.txt learned -14c10df92c36df39ec7d541654884a6f.txt belles_lettres -15d5e6a3a826d0471c4dec0215169c94.txt belles_lettres -17543675c21f3d1961df70e4bc05b677.txt learned -1af257d94c2c7c31650edd49fe5c3888.txt learned -1da789efbc92bb26ab551c749a498714.txt belles_lettres -21ee90d49d229072cdfe3373f386fbc6.txt belles_lettres -2269c21867d5c492b2e223bc5589897e.txt belles_lettres -233809d44fe0b4625aba2b21e2a090c3.txt belles_lettres -2448920f7507a90ae5de9895518a4256.txt learned -25544eed4ec559512c188e8d73c61576.txt learned -280024d9ca375fe894e0c3852153d91c.txt belles_lettres -2ae543a13eb502dfb34efcd691af4c19.txt learned -2bfe3d3546118761639b703dc042174b.txt belles_lettres -3068c168367e3ed5cac6af3bde2e566e.txt belles_lettres -31307aa6842b932e7f3073b253b687d4.txt belles_lettres -32e8d2431fed46743b954c35de544335.txt belles_lettres -332614e6d84d25bcb0724247debae9a6.txt learned -3455cd8f4a88bbe179c3d16ff2d08aed.txt belles_lettres -3463870779e91a0b3ef42dcb5614c417.txt learned -347c4ff3005261de62a1350cf3552db1.txt belles_lettres -371e9a205208a5fe2d058b1373246b06.txt learned -3a0e840d849fb693fb0350a9bca049a7.txt learned -3cfe918d71f0216d698a656bb261754a.txt belles_lettres -42f560bacaae1ac7960efdafc40c9957.txt belles_lettres +1361da4407b0db04d4fd9b0dc51331b8.txt lore +13a8cf112cb61bb237b91bffac75f506.txt news +14c10df92c36df39ec7d541654884a6f.txt news +15d5e6a3a826d0471c4dec0215169c94.txt fiction +17543675c21f3d1961df70e4bc05b677.txt lore +1af257d94c2c7c31650edd49fe5c3888.txt editorial +1da789efbc92bb26ab551c749a498714.txt lore +21ee90d49d229072cdfe3373f386fbc6.txt lore +2269c21867d5c492b2e223bc5589897e.txt adventure +233809d44fe0b4625aba2b21e2a090c3.txt news +2448920f7507a90ae5de9895518a4256.txt news +25544eed4ec559512c188e8d73c61576.txt news +280024d9ca375fe894e0c3852153d91c.txt news +2ae543a13eb502dfb34efcd691af4c19.txt hobbies +2bfe3d3546118761639b703dc042174b.txt editorial +3068c168367e3ed5cac6af3bde2e566e.txt lore +31307aa6842b932e7f3073b253b687d4.txt fiction +32e8d2431fed46743b954c35de544335.txt romance +332614e6d84d25bcb0724247debae9a6.txt editorial +3455cd8f4a88bbe179c3d16ff2d08aed.txt news +3463870779e91a0b3ef42dcb5614c417.txt news +347c4ff3005261de62a1350cf3552db1.txt fiction +371e9a205208a5fe2d058b1373246b06.txt news +3a0e840d849fb693fb0350a9bca049a7.txt lore +3cfe918d71f0216d698a656bb261754a.txt news +42f560bacaae1ac7960efdafc40c9957.txt editorial 444c4cd32cbc3f38551a7cddc23c65bb.txt belles_lettres -461965dbfcd3a75d610b913fd51b93e5.txt belles_lettres -46795cf89bb03979cf64942c96be6fa1.txt belles_lettres -46ace5b2774edd552502d72d113a2537.txt learned -48ffdc34faa528fe84ba1575ad6cf022.txt learned -4a8178c328135fabac148a10a7dbd795.txt belles_lettres -4ad03bf39d4b20405d92877d8a2d620c.txt learned +461965dbfcd3a75d610b913fd51b93e5.txt news +46795cf89bb03979cf64942c96be6fa1.txt lore +46ace5b2774edd552502d72d113a2537.txt hobbies +48ffdc34faa528fe84ba1575ad6cf022.txt news +4a8178c328135fabac148a10a7dbd795.txt news +4ad03bf39d4b20405d92877d8a2d620c.txt lore 4b1042b36a2e8d19883107213a55d4fd.txt belles_lettres -4cd9f5cf912b67d8d541cf805e35ec9d.txt learned -4d1f93581f8df325a0a8fd9df3a60f49.txt belles_lettres -4e7a71284825f9b8302c914b3bf65c41.txt belles_lettres -4f8b6422ab5ad965d2925bb93f1a5ad1.txt belles_lettres -4feeaa056745eaa93855a6d05cc21d20.txt belles_lettres -5122f89d4fff6ec6e26062ded7c5387e.txt belles_lettres -5185857492e797eb189d39ded8a8b64f.txt learned -53115e407b6ae7d1d6b90edd4ac7f2b7.txt learned -541e21b0a2ab6b31a44b787ffef004d7.txt belles_lettres -5759e663a1214223b2068cf85e891953.txt learned -5a17378f15a3eaac38b1245f842cd0d6.txt belles_lettres -5a3733909b787420f2ae4a84095d90b6.txt learned -5a6fe4735711b757130334f30a5c0d8e.txt learned -5cfb1bed9bb97b6a0aabd93ea65d677b.txt belles_lettres -5db5250d2936c795389841699a64b1dc.txt belles_lettres -5e9a239de5aeb08b0713d0245fc914c7.txt belles_lettres +4cd9f5cf912b67d8d541cf805e35ec9d.txt lore +4d1f93581f8df325a0a8fd9df3a60f49.txt lore +4e7a71284825f9b8302c914b3bf65c41.txt fiction +4f8b6422ab5ad965d2925bb93f1a5ad1.txt adventure +4feeaa056745eaa93855a6d05cc21d20.txt editorial +5122f89d4fff6ec6e26062ded7c5387e.txt news +5185857492e797eb189d39ded8a8b64f.txt lore +53115e407b6ae7d1d6b90edd4ac7f2b7.txt hobbies +541e21b0a2ab6b31a44b787ffef004d7.txt lore +5759e663a1214223b2068cf85e891953.txt belles_lettres +5a17378f15a3eaac38b1245f842cd0d6.txt lore +5a3733909b787420f2ae4a84095d90b6.txt lore +5a6fe4735711b757130334f30a5c0d8e.txt lore +5cfb1bed9bb97b6a0aabd93ea65d677b.txt news +5db5250d2936c795389841699a64b1dc.txt lore +5e9a239de5aeb08b0713d0245fc914c7.txt news 5f606972d66ed49044f3eadaf4eb2a54.txt belles_lettres -60e338de63774c5ef4e7beba18bc6577.txt belles_lettres -61f7508fa32ee25eb9ee4cf982eb6d27.txt learned -625237d5189df7054c13e62318cd9819.txt learned -63167efcd7a7bdbd4b742f6e482312f4.txt belles_lettres -635e2c48ef4a37462fd8a4cd17375c5c.txt belles_lettres -64812690c6155fba3f1aba0514496dd9.txt learned -65f1d037cb5f92da6605cea6d0d703d0.txt learned -66abce82b770b4368691f2926f87089e.txt belles_lettres -691c1e5e341a19e59b27dfb4f71fc0e0.txt belles_lettres -712b9c9622c73dbb0e6dc5ba2c231cf0.txt learned -71aa03bfef20157578b6b613174d3fe6.txt belles_lettres -71f153ecdef94026a97b635a40b375c8.txt learned -7341b4fda4d972adfbf854a0d6be3400.txt learned -74486d71097c34544195b52bdd844839.txt learned -745df40e8d2ba4bf6abfcb197c65359c.txt learned -74a1421e246c3ffc08398609f75e292c.txt belles_lettres +60e338de63774c5ef4e7beba18bc6577.txt news +61f7508fa32ee25eb9ee4cf982eb6d27.txt news +625237d5189df7054c13e62318cd9819.txt lore +63167efcd7a7bdbd4b742f6e482312f4.txt editorial +635e2c48ef4a37462fd8a4cd17375c5c.txt lore +64812690c6155fba3f1aba0514496dd9.txt belles_lettres +65f1d037cb5f92da6605cea6d0d703d0.txt news +66abce82b770b4368691f2926f87089e.txt editorial +691c1e5e341a19e59b27dfb4f71fc0e0.txt lore +712b9c9622c73dbb0e6dc5ba2c231cf0.txt lore +71aa03bfef20157578b6b613174d3fe6.txt lore +71f153ecdef94026a97b635a40b375c8.txt news +7341b4fda4d972adfbf854a0d6be3400.txt editorial +74486d71097c34544195b52bdd844839.txt news +745df40e8d2ba4bf6abfcb197c65359c.txt news +74a1421e246c3ffc08398609f75e292c.txt fiction 784346fad149c3736d309036e925526c.txt belles_lettres -787d5f0883aa5fa768a624c226fc7294.txt learned +787d5f0883aa5fa768a624c226fc7294.txt lore 791f3304bbd155e0211904d1d002b081.txt belles_lettres -7a297cedd35c3ffb12ab6011d34f1244.txt belles_lettres +7a297cedd35c3ffb12ab6011d34f1244.txt news 7c809ae6732c39ea9a020a307ff35b3a.txt belles_lettres -7f8b847188c77b75a2b00e906e0ae693.txt belles_lettres -805ea08c406a72dbff755a3627aeb677.txt belles_lettres -8459fa5551ec11ae82c5fc404f2b3988.txt belles_lettres +7f8b847188c77b75a2b00e906e0ae693.txt adventure +805ea08c406a72dbff755a3627aeb677.txt editorial +8459fa5551ec11ae82c5fc404f2b3988.txt lore 853f9d4b400a22d2abbf0f2e17d6ae33.txt belles_lettres -864ff44244fb6229ba79ce3df93df701.txt learned -8758b603d3ce23de68cbd13665a128d4.txt belles_lettres -87d7774f30d9221f856bab02a3f5ffc4.txt belles_lettres -8b2d2ff3e27f2d56f5c51f85c2754cf9.txt belles_lettres +864ff44244fb6229ba79ce3df93df701.txt hobbies +8758b603d3ce23de68cbd13665a128d4.txt news +87d7774f30d9221f856bab02a3f5ffc4.txt lore +8b2d2ff3e27f2d56f5c51f85c2754cf9.txt news 8babd57d7cbd695d8c04d698626593e8.txt belles_lettres -8ce16ec688419c614801d5c29cec6153.txt learned -8d2066cd72a448eb69348dbb68f754d8.txt learned -8fb3df3b7d96dc4383c84447a4fdd1a3.txt belles_lettres -9101cbf87bfd4ef26e71f5b8c1e61d18.txt belles_lettres -93c4b35148e7dcb767ea607fe7edf2c3.txt belles_lettres -990e5a79b032e5cb9ab3e56cab71a6ef.txt learned -9b9ed2005178bb6098ae874260128fc6.txt belles_lettres -9c97ea8f2d4dea9c31ebe73765f2396b.txt belles_lettres -9f08d188f8174081f5b02a7f07668846.txt learned -9f9b19682a8401fd40bce446f33d508b.txt learned -9fe0cd0d62c294ed1bc7b29e7e65c18a.txt learned -9febf62c0e6509f3e1ad065a5a6aef8d.txt learned -a03db0b1e3bb05fc0f961d2a655e8dad.txt learned -a716803991f9713e7986d252e26e7382.txt belles_lettres -a98e64947521853ff24f52e12b77c789.txt belles_lettres -aa5156a64316e6836b14c61879d80712.txt belles_lettres -ac848bdeda712352e09e5fa392be4574.txt belles_lettres +8ce16ec688419c614801d5c29cec6153.txt hobbies +8d2066cd72a448eb69348dbb68f754d8.txt lore +8fb3df3b7d96dc4383c84447a4fdd1a3.txt news +9101cbf87bfd4ef26e71f5b8c1e61d18.txt lore +93c4b35148e7dcb767ea607fe7edf2c3.txt news +990e5a79b032e5cb9ab3e56cab71a6ef.txt lore +9b9ed2005178bb6098ae874260128fc6.txt news +9c97ea8f2d4dea9c31ebe73765f2396b.txt fiction +9f08d188f8174081f5b02a7f07668846.txt lore +9f9b19682a8401fd40bce446f33d508b.txt news +9fe0cd0d62c294ed1bc7b29e7e65c18a.txt news +9febf62c0e6509f3e1ad065a5a6aef8d.txt news +a03db0b1e3bb05fc0f961d2a655e8dad.txt lore +a716803991f9713e7986d252e26e7382.txt news +a98e64947521853ff24f52e12b77c789.txt news +aa5156a64316e6836b14c61879d80712.txt news +ac848bdeda712352e09e5fa392be4574.txt fiction ad12792f75798b70a59b37178798e145.txt belles_lettres -ad3b98d2d08faf751ccfd7f8d0b4f045.txt belles_lettres -af3d510667a872139daf2df8c2a17c1e.txt belles_lettres -b07fc0f7edd49dcd538372888095d3d6.txt belles_lettres -b303c034152030a3594d72626d1f784d.txt belles_lettres -b31afca8898a09c9087b272701d61c89.txt belles_lettres -b3346fa7bed6f5b9ad06bc831c59ad6c.txt belles_lettres -b3681b289f0dd87a5c1f9573cd825866.txt belles_lettres -b4d65c8e57797e496834f5f6d9d3e49e.txt learned -b65707c01e68cc6d4d59e18d9f98f423.txt belles_lettres -b8a039ba1694ce7ce87737ce5c7480d8.txt belles_lettres -b998ac20277e09a1c3fecbdfb028b33a.txt learned -ba6843edc446617d1e6e5ec53246d849.txt belles_lettres -bb6d375a8b847c7c10f9bdbf7324eb03.txt belles_lettres -bbbda4cef7aeb20352c9f1d9b453a9e5.txt belles_lettres -be6f1bd428b9933bedbc6bd401868415.txt learned -bf8ce15b10cb746bb1181645a42012db.txt belles_lettres -bfd0a578b0ec650d83963ddcf443f7a1.txt learned -c1bdfb06016223b3b2c5e03e02af81f3.txt learned -c22274385e9d77bbb900ef9db6ef66ff.txt belles_lettres -c39fda6fbf81d87bb6508b1bbe7faf93.txt belles_lettres -c5a19f446f960c849d67b25238a08397.txt learned -c65f6ecdb1ba01da0e6525dd525621e1.txt belles_lettres -c942ba590a82fd0827b79e3d6bfb25d3.txt learned -c9497d141930518b8005ba352b4d1637.txt learned -cb24d378b3966cf4f3f663f8b13430f2.txt belles_lettres -ce39b27592fc593d0ee117651b072cc1.txt learned -ceacd82d3757974d93538f67b74bc25e.txt belles_lettres -cfdd298764ed82fa2304e427dcb53db9.txt belles_lettres -d027a28847a6228383dd9594f0984bdf.txt belles_lettres -d1f9469856a51f6007f0f785aadf8c1f.txt belles_lettres -d59cd5ad1285a9094a1f82a67fe4ba7b.txt belles_lettres -d5aa7d7a519c1600db10ad01a00a7e3a.txt belles_lettres -d86c9cee65263cdfddbfaaffab1aeeb7.txt belles_lettres -dc713f9e699e9e610b458b5c991ce514.txt belles_lettres -dc89c7bfd3f0eefd385f0a81c1a59981.txt belles_lettres -dc9a7b20833ff389ae573597095f253d.txt learned -dcacb995ec95ede56ba389128922603c.txt belles_lettres -dd1a33aada4ffb0564f709c10b95cedc.txt belles_lettres -e058a15d26f17f7193a032eed51bbbfc.txt learned -e2daacfa9c33ea659beaa1a7763bfe57.txt learned -e43c7ff67adf6fdd0710c0ec91776481.txt learned -e852750e57424cf3e5968b6a3f642553.txt learned -e88e97dfcade103cef59919bf49f46d3.txt learned -eb6bf7af7572cc1fa1a9aa36c0d0feb3.txt learned -ecf327ee7344767f939a3e7695607be5.txt belles_lettres -ef98917ffbb5b1f6e3ce0428d47f2f23.txt learned -f083fda6715b3b3860162e8367ea1209.txt learned -f2b173d5ffa6eda874a71aea5ba076d2.txt belles_lettres -f3b16a0072a6afc3a64e592f6c8ab78b.txt belles_lettres -f433e3a3fdf6455b68183790d72f7fd8.txt belles_lettres -f7099ffdcda8a3e231652cdfbdfe1d26.txt belles_lettres -fc97d173fc6d18448bd334ccdbf36e4c.txt belles_lettres -fdcc797bb8b504885a2ce07017555f33.txt belles_lettres +ad3b98d2d08faf751ccfd7f8d0b4f045.txt editorial +af3d510667a872139daf2df8c2a17c1e.txt fiction +b07fc0f7edd49dcd538372888095d3d6.txt lore +b303c034152030a3594d72626d1f784d.txt news +b31afca8898a09c9087b272701d61c89.txt adventure +b3346fa7bed6f5b9ad06bc831c59ad6c.txt lore +b3681b289f0dd87a5c1f9573cd825866.txt lore +b4d65c8e57797e496834f5f6d9d3e49e.txt belles_lettres +b65707c01e68cc6d4d59e18d9f98f423.txt lore +b8a039ba1694ce7ce87737ce5c7480d8.txt news +b998ac20277e09a1c3fecbdfb028b33a.txt lore +ba6843edc446617d1e6e5ec53246d849.txt lore +bb6d375a8b847c7c10f9bdbf7324eb03.txt lore +bbbda4cef7aeb20352c9f1d9b453a9e5.txt lore +be6f1bd428b9933bedbc6bd401868415.txt lore +bf8ce15b10cb746bb1181645a42012db.txt lore +bfd0a578b0ec650d83963ddcf443f7a1.txt lore +c1bdfb06016223b3b2c5e03e02af81f3.txt lore +c22274385e9d77bbb900ef9db6ef66ff.txt fiction +c39fda6fbf81d87bb6508b1bbe7faf93.txt fiction +c5a19f446f960c849d67b25238a08397.txt lore +c65f6ecdb1ba01da0e6525dd525621e1.txt editorial +c942ba590a82fd0827b79e3d6bfb25d3.txt lore +c9497d141930518b8005ba352b4d1637.txt hobbies +cb24d378b3966cf4f3f663f8b13430f2.txt adventure +ce39b27592fc593d0ee117651b072cc1.txt news +ceacd82d3757974d93538f67b74bc25e.txt news +cfdd298764ed82fa2304e427dcb53db9.txt editorial +d027a28847a6228383dd9594f0984bdf.txt lore +d1f9469856a51f6007f0f785aadf8c1f.txt news +d59cd5ad1285a9094a1f82a67fe4ba7b.txt lore +d5aa7d7a519c1600db10ad01a00a7e3a.txt lore +d86c9cee65263cdfddbfaaffab1aeeb7.txt news +dc713f9e699e9e610b458b5c991ce514.txt lore +dc89c7bfd3f0eefd385f0a81c1a59981.txt lore +dc9a7b20833ff389ae573597095f253d.txt lore +dcacb995ec95ede56ba389128922603c.txt lore +dd1a33aada4ffb0564f709c10b95cedc.txt lore +e058a15d26f17f7193a032eed51bbbfc.txt editorial +e2daacfa9c33ea659beaa1a7763bfe57.txt news +e43c7ff67adf6fdd0710c0ec91776481.txt lore +e852750e57424cf3e5968b6a3f642553.txt lore +e88e97dfcade103cef59919bf49f46d3.txt lore +eb6bf7af7572cc1fa1a9aa36c0d0feb3.txt hobbies +ecf327ee7344767f939a3e7695607be5.txt news +ef98917ffbb5b1f6e3ce0428d47f2f23.txt lore +f083fda6715b3b3860162e8367ea1209.txt hobbies +f2b173d5ffa6eda874a71aea5ba076d2.txt news +f3b16a0072a6afc3a64e592f6c8ab78b.txt editorial +f433e3a3fdf6455b68183790d72f7fd8.txt news +f7099ffdcda8a3e231652cdfbdfe1d26.txt editorial +fc97d173fc6d18448bd334ccdbf36e4c.txt hobbies +fdcc797bb8b504885a2ce07017555f33.txt news diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/code/PorterStemmer.py b/ss2013/1_Web Mining/Uebungen/4_Uebung/code/PorterStemmer.py new file mode 100644 index 00000000..405a5f98 --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/code/PorterStemmer.py @@ -0,0 +1,344 @@ +#!/usr/bin/env python + +"""Porter Stemming Algorithm +This is the Porter stemming algorithm, ported to Python from the +version coded up in ANSI C by the author. It may be be regarded +as canonical, in that it follows the algorithm presented in + +Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, +no. 3, pp 130-137, + +only differing from it at the points maked --DEPARTURE-- below. + +See also http://www.tartarus.org/~martin/PorterStemmer + +The algorithm as described in the paper could be exactly replicated +by adjusting the points of DEPARTURE, but this is barely necessary, +because (a) the points of DEPARTURE are definitely improvements, and +(b) no encoding of the Porter stemmer I have seen is anything like +as exact as this version, even with the points of DEPARTURE! + +Vivake Gupta (v@nano.com) + +Release 1: January 2001 + +Further adjustments by Santiago Bruno (bananabruno@gmail.com) +to allow word input not restricted to one word per line, leading +to: + +release 2: July 2008 +""" + +import sys + +class PorterStemmer: + + def __init__(self): + """The main part of the stemming algorithm starts here. + b is a buffer holding a word to be stemmed. The letters are in b[k0], + b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is + readjusted downwards as the stemming progresses. Zero termination is + not in fact used in the algorithm. + + Note that only lower case sequences are stemmed. Forcing to lower case + should be done before stem(...) is called. + """ + + self.b = "" # buffer for word to be stemmed + self.k = 0 + self.k0 = 0 + self.j = 0 # j is a general offset into the string + + def cons(self, i): + """cons(i) is TRUE <=> b[i] is a consonant.""" + if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' or self.b[i] == 'o' or self.b[i] == 'u': + return 0 + if self.b[i] == 'y': + if i == self.k0: + return 1 + else: + return (not self.cons(i - 1)) + return 1 + + def m(self): + """m() measures the number of consonant sequences between k0 and j. + if c is a consonant sequence and v a vowel sequence, and <..> + indicates arbitrary presence, + + gives 0 + vc gives 1 + vcvc gives 2 + vcvcvc gives 3 + .... + """ + n = 0 + i = self.k0 + while 1: + if i > self.j: + return n + if not self.cons(i): + break + i = i + 1 + i = i + 1 + while 1: + while 1: + if i > self.j: + return n + if self.cons(i): + break + i = i + 1 + i = i + 1 + n = n + 1 + while 1: + if i > self.j: + return n + if not self.cons(i): + break + i = i + 1 + i = i + 1 + + def vowelinstem(self): + """vowelinstem() is TRUE <=> k0,...j contains a vowel""" + for i in range(self.k0, self.j + 1): + if not self.cons(i): + return 1 + return 0 + + def doublec(self, j): + """doublec(j) is TRUE <=> j,(j-1) contain a double consonant.""" + if j < (self.k0 + 1): + return 0 + if (self.b[j] != self.b[j-1]): + return 0 + return self.cons(j) + + def cvc(self, i): + """cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant + and also if the second c is not w,x or y. this is used when trying to + restore an e at the end of a short e.g. + + cav(e), lov(e), hop(e), crim(e), but + snow, box, tray. + """ + if i < (self.k0 + 2) or not self.cons(i) or self.cons(i-1) or not self.cons(i-2): + return 0 + ch = self.b[i] + if ch == 'w' or ch == 'x' or ch == 'y': + return 0 + return 1 + + def ends(self, s): + """ends(s) is TRUE <=> k0,...k ends with the string s.""" + length = len(s) + if s[length - 1] != self.b[self.k]: # tiny speed-up + return 0 + if length > (self.k - self.k0 + 1): + return 0 + if self.b[self.k-length+1:self.k+1] != s: + return 0 + self.j = self.k - length + return 1 + + def setto(self, s): + """setto(s) sets (j+1),...k to the characters in the string s, readjusting k.""" + length = len(s) + self.b = self.b[:self.j+1] + s + self.b[self.j+length+1:] + self.k = self.j + length + + def r(self, s): + """r(s) is used further down.""" + if self.m() > 0: + self.setto(s) + + def step1ab(self): + """step1ab() gets rid of plurals and -ed or -ing. e.g. + + caresses -> caress + ponies -> poni + ties -> ti + caress -> caress + cats -> cat + + feed -> feed + agreed -> agree + disabled -> disable + + matting -> mat + mating -> mate + meeting -> meet + milling -> mill + messing -> mess + + meetings -> meet + """ + if self.b[self.k] == 's': + if self.ends("sses"): + self.k = self.k - 2 + elif self.ends("ies"): + self.setto("i") + elif self.b[self.k - 1] != 's': + self.k = self.k - 1 + if self.ends("eed"): + if self.m() > 0: + self.k = self.k - 1 + elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem(): + self.k = self.j + if self.ends("at"): self.setto("ate") + elif self.ends("bl"): self.setto("ble") + elif self.ends("iz"): self.setto("ize") + elif self.doublec(self.k): + self.k = self.k - 1 + ch = self.b[self.k] + if ch == 'l' or ch == 's' or ch == 'z': + self.k = self.k + 1 + elif (self.m() == 1 and self.cvc(self.k)): + self.setto("e") + + def step1c(self): + """step1c() turns terminal y to i when there is another vowel in the stem.""" + if (self.ends("y") and self.vowelinstem()): + self.b = self.b[:self.k] + 'i' + self.b[self.k+1:] + + def step2(self): + """step2() maps double suffices to single ones. + so -ization ( = -ize plus -ation) maps to -ize etc. note that the + string before the suffix must give m() > 0. + """ + if self.b[self.k - 1] == 'a': + if self.ends("ational"): self.r("ate") + elif self.ends("tional"): self.r("tion") + elif self.b[self.k - 1] == 'c': + if self.ends("enci"): self.r("ence") + elif self.ends("anci"): self.r("ance") + elif self.b[self.k - 1] == 'e': + if self.ends("izer"): self.r("ize") + elif self.b[self.k - 1] == 'l': + if self.ends("bli"): self.r("ble") # --DEPARTURE-- + # To match the published algorithm, replace this phrase with + # if self.ends("abli"): self.r("able") + elif self.ends("alli"): self.r("al") + elif self.ends("entli"): self.r("ent") + elif self.ends("eli"): self.r("e") + elif self.ends("ousli"): self.r("ous") + elif self.b[self.k - 1] == 'o': + if self.ends("ization"): self.r("ize") + elif self.ends("ation"): self.r("ate") + elif self.ends("ator"): self.r("ate") + elif self.b[self.k - 1] == 's': + if self.ends("alism"): self.r("al") + elif self.ends("iveness"): self.r("ive") + elif self.ends("fulness"): self.r("ful") + elif self.ends("ousness"): self.r("ous") + elif self.b[self.k - 1] == 't': + if self.ends("aliti"): self.r("al") + elif self.ends("iviti"): self.r("ive") + elif self.ends("biliti"): self.r("ble") + elif self.b[self.k - 1] == 'g': # --DEPARTURE-- + if self.ends("logi"): self.r("log") + # To match the published algorithm, delete this phrase + + def step3(self): + """step3() dels with -ic-, -full, -ness etc. similar strategy to step2.""" + if self.b[self.k] == 'e': + if self.ends("icate"): self.r("ic") + elif self.ends("ative"): self.r("") + elif self.ends("alize"): self.r("al") + elif self.b[self.k] == 'i': + if self.ends("iciti"): self.r("ic") + elif self.b[self.k] == 'l': + if self.ends("ical"): self.r("ic") + elif self.ends("ful"): self.r("") + elif self.b[self.k] == 's': + if self.ends("ness"): self.r("") + + def step4(self): + """step4() takes off -ant, -ence etc., in context vcvc.""" + if self.b[self.k - 1] == 'a': + if self.ends("al"): pass + else: return + elif self.b[self.k - 1] == 'c': + if self.ends("ance"): pass + elif self.ends("ence"): pass + else: return + elif self.b[self.k - 1] == 'e': + if self.ends("er"): pass + else: return + elif self.b[self.k - 1] == 'i': + if self.ends("ic"): pass + else: return + elif self.b[self.k - 1] == 'l': + if self.ends("able"): pass + elif self.ends("ible"): pass + else: return + elif self.b[self.k - 1] == 'n': + if self.ends("ant"): pass + elif self.ends("ement"): pass + elif self.ends("ment"): pass + elif self.ends("ent"): pass + else: return + elif self.b[self.k - 1] == 'o': + if self.ends("ion") and (self.b[self.j] == 's' or self.b[self.j] == 't'): pass + elif self.ends("ou"): pass + # takes care of -ous + else: return + elif self.b[self.k - 1] == 's': + if self.ends("ism"): pass + else: return + elif self.b[self.k - 1] == 't': + if self.ends("ate"): pass + elif self.ends("iti"): pass + else: return + elif self.b[self.k - 1] == 'u': + if self.ends("ous"): pass + else: return + elif self.b[self.k - 1] == 'v': + if self.ends("ive"): pass + else: return + elif self.b[self.k - 1] == 'z': + if self.ends("ize"): pass + else: return + else: + return + if self.m() > 1: + self.k = self.j + + def step5(self): + """step5() removes a final -e if m() > 1, and changes -ll to -l if + m() > 1. + """ + self.j = self.k + if self.b[self.k] == 'e': + a = self.m() + if a > 1 or (a == 1 and not self.cvc(self.k-1)): + self.k = self.k - 1 + if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1: + self.k = self.k -1 + + def stem(self, p, i, j): + """In stem(p,i,j), p is a char pointer, and the string to be stemmed + is from p[i] to p[j] inclusive. Typically i is zero and j is the + offset to the last character of a string, (p[j+1] == '\0'). The + stemmer adjusts the characters p[i] ... p[j] and returns the new + end-point of the string, k. Stemming never increases word length, so + i <= k <= j. To turn the stemmer into a module, declare 'stem' as + extern, and delete the remainder of this file. + """ + # copy the parameters into statics + self.b = p + self.k = j + self.k0 = i + if self.k <= self.k0 + 1: + return self.b # --DEPARTURE-- + + # With this line, strings of length 1 or 2 don't go through the + # stemming process, although no mention is made of this in the + # published algorithm. Remove the line to match the published + # algorithm. + + self.step1ab() + self.step1c() + self.step2() + self.step3() + self.step4() + self.step5() + return self.b[self.k0:self.k+1] \ No newline at end of file diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py b/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py index 5fb34d94..b182f8e8 100644 --- a/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # imports import os @@ -6,7 +7,7 @@ import random import sys import math import re - +from PorterStemmer import PorterStemmer # config variables @@ -129,32 +130,75 @@ class multiclassClassifier: def bayes(self, text, termfrequenciesOfClasses, termCount, percentage): result = 1.0 + wordcount = 0.0 for line in text: thisline = line.split(" "); - for word in thisline: + for word in thisline: word = self.clean_word(word) if word <> "": + ''' + Accuracy: 21.2121% + Precision per class: adventure:40.0% belles_lettres:22.2222% editorial:17.6471% fiction:36.3636% government:0.0% hobbies:11.1111% learned:0.0% lore:17.5439% mystery:0.0% news:23.4043% romance:0.0% + Precision Macroavg: 15.2993% + Precision Microavg: 21.2121% + Recall per class: adventure:20.0% belles_lettres:14.8148% editorial:30.0% fiction:36.3636% government:0.0% hobbies:7.6923% learned:0.0% lore:55.5556% mystery:0.0% news:68.75% romance:0.0% + Recall Microavg: 2.6217% + + if termfrequenciesOfClasses.has_key(str(word)): + result += math.log(1./((termfrequenciesOfClasses[word]+1.)/(termCount+1))) #gewichte häufig auftretende worter am wenigsten, wenigauftretende am stärksten + termcount -> was ist das? + ... + return result + + Accuracy: 21.8182% + Precision per class: adventure:40.0% belles_lettres:22.2222% editorial:20.0% fiction:36.3636% government:0.0% hobbies:20.0% learned:0.0% lore:17.5439% mystery:0.0% news:22.9167% romance:0.0% + Precision Macroavg: 16.2769% + Precision Microavg: 21.8182% + Recall per class: adventure:20.0% belles_lettres:14.8148% editorial:30.0% fiction:36.3636% government:0.0% hobbies:15.3846% learned:0.0% lore:55.5556% mystery:0.0% news:68.75% romance:0.0% + Recall Microavg: 2.7149% + + if termfrequenciesOfClasses.has_key(str(word)): + wordcount += 1 + result += math.log(1./((termfrequenciesOfClasses[word]+1.)/(termCount+1))) #gewichte häufig auftretende worter am wenigsten, wenigauftretende am stärksten + termcount -> was ist das? + ... + result += math.log(percentage) + result += math.log(wordcount) + return result + ''' #result = 1.0 #for word in text: if termfrequenciesOfClasses.has_key(str(word)): - result += math.log((termfrequenciesOfClasses[word]+1.)/(termCount+1)) - else: - result += math.log(1./(termCount+1)) - result += math.log(percentage) - return result + wordcount += 1 + #result += math.log(1./(termfrequenciesOfClasses[word]+1.)) + #result += math.log((termfrequenciesOfClasses[word]+1.)/(termCount+1)) #gewichte häufig auftretende terme am stärksten + #result += math.log(1./((termfrequenciesOfClasses[word]+1.)/(termCount+1))) #gewichte häufig auftretende worter am wenigsten, wenigauftretende am stärksten + termcount -> was ist das? + result += termfrequenciesOfClasses[word] + #print "known word: "+word + #else: + #result += math.log(1./(termCount+1)) + #result += math.log(1.) + #print "new word: "+word + #result += math.log(percentage) + result *= percentage + #result += math.log(wordcount) + result *= wordcount + #return result + return math.log(result) def clean_word(self, word): #print word word = word.lower() #lowercase + word = word.strip() # remove lineendings etc #return word word = "".join(re.findall("[a-z]+", word)) #only characters #return word - if len(word) <= 4: #only words longer 4 + if len(word) <= 4: #only words longer 4 return "" #return word if self.isStopWord(word): #stopwordfilter return "" #print word + p = PorterStemmer() #stemming + word = p.stem(word, 0,len(word)-1) return word def isStopWord(self,word): @@ -212,6 +256,7 @@ if __name__ == '__main__': maxRes = temp mc.filesToPrediction[infile] = cl f.close() + print currentPath + " " + mc.filesToPrediction[infile] mc.writePredictionFile()