Fixes für Klassifizierer -> 21% Erkennungschance

This commit is contained in:
Ulf Gebhardt 2013-06-16 21:11:42 +02:00
parent 9acfc59dae
commit 3126edc599
3 changed files with 548 additions and 159 deletions

View File

@ -1,165 +1,165 @@
00f0316054ddf9504f87ea28e73683b6.txt belles_lettres
025338a8f0b21608e843df13d54c8c70.txt belles_lettres
05aac70e552dd51430af3c194ad0fdec.txt learned
083ae11a870e96d1f5c9835eaf48118c.txt learned
08932387850eea34daae545225dcf8a2.txt belles_lettres
08bfc610c065764b9dfec4eed039fa69.txt belles_lettres
025338a8f0b21608e843df13d54c8c70.txt news
05aac70e552dd51430af3c194ad0fdec.txt belles_lettres
083ae11a870e96d1f5c9835eaf48118c.txt news
08932387850eea34daae545225dcf8a2.txt lore
08bfc610c065764b9dfec4eed039fa69.txt lore
096fddf36cb7d1de3f236d85bdefb938.txt belles_lettres
0bd46492f5ad09df3b80cb62a335e689.txt belles_lettres
0c16f57f865c4e0e072546097b1d4adb.txt belles_lettres
0c267d9b541a7d3e07eb0e841609b307.txt learned
0f0c1baf6f76f3e16cac7855405029a6.txt learned
100ec74fe0d1dd74956246df46d9b845.txt belles_lettres
0c267d9b541a7d3e07eb0e841609b307.txt hobbies
0f0c1baf6f76f3e16cac7855405029a6.txt news
100ec74fe0d1dd74956246df46d9b845.txt fiction
10609aabf1b727d9728d96ffe9064f11.txt belles_lettres
1361da4407b0db04d4fd9b0dc51331b8.txt belles_lettres
13a8cf112cb61bb237b91bffac75f506.txt learned
14c10df92c36df39ec7d541654884a6f.txt belles_lettres
15d5e6a3a826d0471c4dec0215169c94.txt belles_lettres
17543675c21f3d1961df70e4bc05b677.txt learned
1af257d94c2c7c31650edd49fe5c3888.txt learned
1da789efbc92bb26ab551c749a498714.txt belles_lettres
21ee90d49d229072cdfe3373f386fbc6.txt belles_lettres
2269c21867d5c492b2e223bc5589897e.txt belles_lettres
233809d44fe0b4625aba2b21e2a090c3.txt belles_lettres
2448920f7507a90ae5de9895518a4256.txt learned
25544eed4ec559512c188e8d73c61576.txt learned
280024d9ca375fe894e0c3852153d91c.txt belles_lettres
2ae543a13eb502dfb34efcd691af4c19.txt learned
2bfe3d3546118761639b703dc042174b.txt belles_lettres
3068c168367e3ed5cac6af3bde2e566e.txt belles_lettres
31307aa6842b932e7f3073b253b687d4.txt belles_lettres
32e8d2431fed46743b954c35de544335.txt belles_lettres
332614e6d84d25bcb0724247debae9a6.txt learned
3455cd8f4a88bbe179c3d16ff2d08aed.txt belles_lettres
3463870779e91a0b3ef42dcb5614c417.txt learned
347c4ff3005261de62a1350cf3552db1.txt belles_lettres
371e9a205208a5fe2d058b1373246b06.txt learned
3a0e840d849fb693fb0350a9bca049a7.txt learned
3cfe918d71f0216d698a656bb261754a.txt belles_lettres
42f560bacaae1ac7960efdafc40c9957.txt belles_lettres
1361da4407b0db04d4fd9b0dc51331b8.txt lore
13a8cf112cb61bb237b91bffac75f506.txt news
14c10df92c36df39ec7d541654884a6f.txt news
15d5e6a3a826d0471c4dec0215169c94.txt fiction
17543675c21f3d1961df70e4bc05b677.txt lore
1af257d94c2c7c31650edd49fe5c3888.txt editorial
1da789efbc92bb26ab551c749a498714.txt lore
21ee90d49d229072cdfe3373f386fbc6.txt lore
2269c21867d5c492b2e223bc5589897e.txt adventure
233809d44fe0b4625aba2b21e2a090c3.txt news
2448920f7507a90ae5de9895518a4256.txt news
25544eed4ec559512c188e8d73c61576.txt news
280024d9ca375fe894e0c3852153d91c.txt news
2ae543a13eb502dfb34efcd691af4c19.txt hobbies
2bfe3d3546118761639b703dc042174b.txt editorial
3068c168367e3ed5cac6af3bde2e566e.txt lore
31307aa6842b932e7f3073b253b687d4.txt fiction
32e8d2431fed46743b954c35de544335.txt romance
332614e6d84d25bcb0724247debae9a6.txt editorial
3455cd8f4a88bbe179c3d16ff2d08aed.txt news
3463870779e91a0b3ef42dcb5614c417.txt news
347c4ff3005261de62a1350cf3552db1.txt fiction
371e9a205208a5fe2d058b1373246b06.txt news
3a0e840d849fb693fb0350a9bca049a7.txt lore
3cfe918d71f0216d698a656bb261754a.txt news
42f560bacaae1ac7960efdafc40c9957.txt editorial
444c4cd32cbc3f38551a7cddc23c65bb.txt belles_lettres
461965dbfcd3a75d610b913fd51b93e5.txt belles_lettres
46795cf89bb03979cf64942c96be6fa1.txt belles_lettres
46ace5b2774edd552502d72d113a2537.txt learned
48ffdc34faa528fe84ba1575ad6cf022.txt learned
4a8178c328135fabac148a10a7dbd795.txt belles_lettres
4ad03bf39d4b20405d92877d8a2d620c.txt learned
461965dbfcd3a75d610b913fd51b93e5.txt news
46795cf89bb03979cf64942c96be6fa1.txt lore
46ace5b2774edd552502d72d113a2537.txt hobbies
48ffdc34faa528fe84ba1575ad6cf022.txt news
4a8178c328135fabac148a10a7dbd795.txt news
4ad03bf39d4b20405d92877d8a2d620c.txt lore
4b1042b36a2e8d19883107213a55d4fd.txt belles_lettres
4cd9f5cf912b67d8d541cf805e35ec9d.txt learned
4d1f93581f8df325a0a8fd9df3a60f49.txt belles_lettres
4e7a71284825f9b8302c914b3bf65c41.txt belles_lettres
4f8b6422ab5ad965d2925bb93f1a5ad1.txt belles_lettres
4feeaa056745eaa93855a6d05cc21d20.txt belles_lettres
5122f89d4fff6ec6e26062ded7c5387e.txt belles_lettres
5185857492e797eb189d39ded8a8b64f.txt learned
53115e407b6ae7d1d6b90edd4ac7f2b7.txt learned
541e21b0a2ab6b31a44b787ffef004d7.txt belles_lettres
5759e663a1214223b2068cf85e891953.txt learned
5a17378f15a3eaac38b1245f842cd0d6.txt belles_lettres
5a3733909b787420f2ae4a84095d90b6.txt learned
5a6fe4735711b757130334f30a5c0d8e.txt learned
5cfb1bed9bb97b6a0aabd93ea65d677b.txt belles_lettres
5db5250d2936c795389841699a64b1dc.txt belles_lettres
5e9a239de5aeb08b0713d0245fc914c7.txt belles_lettres
4cd9f5cf912b67d8d541cf805e35ec9d.txt lore
4d1f93581f8df325a0a8fd9df3a60f49.txt lore
4e7a71284825f9b8302c914b3bf65c41.txt fiction
4f8b6422ab5ad965d2925bb93f1a5ad1.txt adventure
4feeaa056745eaa93855a6d05cc21d20.txt editorial
5122f89d4fff6ec6e26062ded7c5387e.txt news
5185857492e797eb189d39ded8a8b64f.txt lore
53115e407b6ae7d1d6b90edd4ac7f2b7.txt hobbies
541e21b0a2ab6b31a44b787ffef004d7.txt lore
5759e663a1214223b2068cf85e891953.txt belles_lettres
5a17378f15a3eaac38b1245f842cd0d6.txt lore
5a3733909b787420f2ae4a84095d90b6.txt lore
5a6fe4735711b757130334f30a5c0d8e.txt lore
5cfb1bed9bb97b6a0aabd93ea65d677b.txt news
5db5250d2936c795389841699a64b1dc.txt lore
5e9a239de5aeb08b0713d0245fc914c7.txt news
5f606972d66ed49044f3eadaf4eb2a54.txt belles_lettres
60e338de63774c5ef4e7beba18bc6577.txt belles_lettres
61f7508fa32ee25eb9ee4cf982eb6d27.txt learned
625237d5189df7054c13e62318cd9819.txt learned
63167efcd7a7bdbd4b742f6e482312f4.txt belles_lettres
635e2c48ef4a37462fd8a4cd17375c5c.txt belles_lettres
64812690c6155fba3f1aba0514496dd9.txt learned
65f1d037cb5f92da6605cea6d0d703d0.txt learned
66abce82b770b4368691f2926f87089e.txt belles_lettres
691c1e5e341a19e59b27dfb4f71fc0e0.txt belles_lettres
712b9c9622c73dbb0e6dc5ba2c231cf0.txt learned
71aa03bfef20157578b6b613174d3fe6.txt belles_lettres
71f153ecdef94026a97b635a40b375c8.txt learned
7341b4fda4d972adfbf854a0d6be3400.txt learned
74486d71097c34544195b52bdd844839.txt learned
745df40e8d2ba4bf6abfcb197c65359c.txt learned
74a1421e246c3ffc08398609f75e292c.txt belles_lettres
60e338de63774c5ef4e7beba18bc6577.txt news
61f7508fa32ee25eb9ee4cf982eb6d27.txt news
625237d5189df7054c13e62318cd9819.txt lore
63167efcd7a7bdbd4b742f6e482312f4.txt editorial
635e2c48ef4a37462fd8a4cd17375c5c.txt lore
64812690c6155fba3f1aba0514496dd9.txt belles_lettres
65f1d037cb5f92da6605cea6d0d703d0.txt news
66abce82b770b4368691f2926f87089e.txt editorial
691c1e5e341a19e59b27dfb4f71fc0e0.txt lore
712b9c9622c73dbb0e6dc5ba2c231cf0.txt lore
71aa03bfef20157578b6b613174d3fe6.txt lore
71f153ecdef94026a97b635a40b375c8.txt news
7341b4fda4d972adfbf854a0d6be3400.txt editorial
74486d71097c34544195b52bdd844839.txt news
745df40e8d2ba4bf6abfcb197c65359c.txt news
74a1421e246c3ffc08398609f75e292c.txt fiction
784346fad149c3736d309036e925526c.txt belles_lettres
787d5f0883aa5fa768a624c226fc7294.txt learned
787d5f0883aa5fa768a624c226fc7294.txt lore
791f3304bbd155e0211904d1d002b081.txt belles_lettres
7a297cedd35c3ffb12ab6011d34f1244.txt belles_lettres
7a297cedd35c3ffb12ab6011d34f1244.txt news
7c809ae6732c39ea9a020a307ff35b3a.txt belles_lettres
7f8b847188c77b75a2b00e906e0ae693.txt belles_lettres
805ea08c406a72dbff755a3627aeb677.txt belles_lettres
8459fa5551ec11ae82c5fc404f2b3988.txt belles_lettres
7f8b847188c77b75a2b00e906e0ae693.txt adventure
805ea08c406a72dbff755a3627aeb677.txt editorial
8459fa5551ec11ae82c5fc404f2b3988.txt lore
853f9d4b400a22d2abbf0f2e17d6ae33.txt belles_lettres
864ff44244fb6229ba79ce3df93df701.txt learned
8758b603d3ce23de68cbd13665a128d4.txt belles_lettres
87d7774f30d9221f856bab02a3f5ffc4.txt belles_lettres
8b2d2ff3e27f2d56f5c51f85c2754cf9.txt belles_lettres
864ff44244fb6229ba79ce3df93df701.txt hobbies
8758b603d3ce23de68cbd13665a128d4.txt news
87d7774f30d9221f856bab02a3f5ffc4.txt lore
8b2d2ff3e27f2d56f5c51f85c2754cf9.txt news
8babd57d7cbd695d8c04d698626593e8.txt belles_lettres
8ce16ec688419c614801d5c29cec6153.txt learned
8d2066cd72a448eb69348dbb68f754d8.txt learned
8fb3df3b7d96dc4383c84447a4fdd1a3.txt belles_lettres
9101cbf87bfd4ef26e71f5b8c1e61d18.txt belles_lettres
93c4b35148e7dcb767ea607fe7edf2c3.txt belles_lettres
990e5a79b032e5cb9ab3e56cab71a6ef.txt learned
9b9ed2005178bb6098ae874260128fc6.txt belles_lettres
9c97ea8f2d4dea9c31ebe73765f2396b.txt belles_lettres
9f08d188f8174081f5b02a7f07668846.txt learned
9f9b19682a8401fd40bce446f33d508b.txt learned
9fe0cd0d62c294ed1bc7b29e7e65c18a.txt learned
9febf62c0e6509f3e1ad065a5a6aef8d.txt learned
a03db0b1e3bb05fc0f961d2a655e8dad.txt learned
a716803991f9713e7986d252e26e7382.txt belles_lettres
a98e64947521853ff24f52e12b77c789.txt belles_lettres
aa5156a64316e6836b14c61879d80712.txt belles_lettres
ac848bdeda712352e09e5fa392be4574.txt belles_lettres
8ce16ec688419c614801d5c29cec6153.txt hobbies
8d2066cd72a448eb69348dbb68f754d8.txt lore
8fb3df3b7d96dc4383c84447a4fdd1a3.txt news
9101cbf87bfd4ef26e71f5b8c1e61d18.txt lore
93c4b35148e7dcb767ea607fe7edf2c3.txt news
990e5a79b032e5cb9ab3e56cab71a6ef.txt lore
9b9ed2005178bb6098ae874260128fc6.txt news
9c97ea8f2d4dea9c31ebe73765f2396b.txt fiction
9f08d188f8174081f5b02a7f07668846.txt lore
9f9b19682a8401fd40bce446f33d508b.txt news
9fe0cd0d62c294ed1bc7b29e7e65c18a.txt news
9febf62c0e6509f3e1ad065a5a6aef8d.txt news
a03db0b1e3bb05fc0f961d2a655e8dad.txt lore
a716803991f9713e7986d252e26e7382.txt news
a98e64947521853ff24f52e12b77c789.txt news
aa5156a64316e6836b14c61879d80712.txt news
ac848bdeda712352e09e5fa392be4574.txt fiction
ad12792f75798b70a59b37178798e145.txt belles_lettres
ad3b98d2d08faf751ccfd7f8d0b4f045.txt belles_lettres
af3d510667a872139daf2df8c2a17c1e.txt belles_lettres
b07fc0f7edd49dcd538372888095d3d6.txt belles_lettres
b303c034152030a3594d72626d1f784d.txt belles_lettres
b31afca8898a09c9087b272701d61c89.txt belles_lettres
b3346fa7bed6f5b9ad06bc831c59ad6c.txt belles_lettres
b3681b289f0dd87a5c1f9573cd825866.txt belles_lettres
b4d65c8e57797e496834f5f6d9d3e49e.txt learned
b65707c01e68cc6d4d59e18d9f98f423.txt belles_lettres
b8a039ba1694ce7ce87737ce5c7480d8.txt belles_lettres
b998ac20277e09a1c3fecbdfb028b33a.txt learned
ba6843edc446617d1e6e5ec53246d849.txt belles_lettres
bb6d375a8b847c7c10f9bdbf7324eb03.txt belles_lettres
bbbda4cef7aeb20352c9f1d9b453a9e5.txt belles_lettres
be6f1bd428b9933bedbc6bd401868415.txt learned
bf8ce15b10cb746bb1181645a42012db.txt belles_lettres
bfd0a578b0ec650d83963ddcf443f7a1.txt learned
c1bdfb06016223b3b2c5e03e02af81f3.txt learned
c22274385e9d77bbb900ef9db6ef66ff.txt belles_lettres
c39fda6fbf81d87bb6508b1bbe7faf93.txt belles_lettres
c5a19f446f960c849d67b25238a08397.txt learned
c65f6ecdb1ba01da0e6525dd525621e1.txt belles_lettres
c942ba590a82fd0827b79e3d6bfb25d3.txt learned
c9497d141930518b8005ba352b4d1637.txt learned
cb24d378b3966cf4f3f663f8b13430f2.txt belles_lettres
ce39b27592fc593d0ee117651b072cc1.txt learned
ceacd82d3757974d93538f67b74bc25e.txt belles_lettres
cfdd298764ed82fa2304e427dcb53db9.txt belles_lettres
d027a28847a6228383dd9594f0984bdf.txt belles_lettres
d1f9469856a51f6007f0f785aadf8c1f.txt belles_lettres
d59cd5ad1285a9094a1f82a67fe4ba7b.txt belles_lettres
d5aa7d7a519c1600db10ad01a00a7e3a.txt belles_lettres
d86c9cee65263cdfddbfaaffab1aeeb7.txt belles_lettres
dc713f9e699e9e610b458b5c991ce514.txt belles_lettres
dc89c7bfd3f0eefd385f0a81c1a59981.txt belles_lettres
dc9a7b20833ff389ae573597095f253d.txt learned
dcacb995ec95ede56ba389128922603c.txt belles_lettres
dd1a33aada4ffb0564f709c10b95cedc.txt belles_lettres
e058a15d26f17f7193a032eed51bbbfc.txt learned
e2daacfa9c33ea659beaa1a7763bfe57.txt learned
e43c7ff67adf6fdd0710c0ec91776481.txt learned
e852750e57424cf3e5968b6a3f642553.txt learned
e88e97dfcade103cef59919bf49f46d3.txt learned
eb6bf7af7572cc1fa1a9aa36c0d0feb3.txt learned
ecf327ee7344767f939a3e7695607be5.txt belles_lettres
ef98917ffbb5b1f6e3ce0428d47f2f23.txt learned
f083fda6715b3b3860162e8367ea1209.txt learned
f2b173d5ffa6eda874a71aea5ba076d2.txt belles_lettres
f3b16a0072a6afc3a64e592f6c8ab78b.txt belles_lettres
f433e3a3fdf6455b68183790d72f7fd8.txt belles_lettres
f7099ffdcda8a3e231652cdfbdfe1d26.txt belles_lettres
fc97d173fc6d18448bd334ccdbf36e4c.txt belles_lettres
fdcc797bb8b504885a2ce07017555f33.txt belles_lettres
ad3b98d2d08faf751ccfd7f8d0b4f045.txt editorial
af3d510667a872139daf2df8c2a17c1e.txt fiction
b07fc0f7edd49dcd538372888095d3d6.txt lore
b303c034152030a3594d72626d1f784d.txt news
b31afca8898a09c9087b272701d61c89.txt adventure
b3346fa7bed6f5b9ad06bc831c59ad6c.txt lore
b3681b289f0dd87a5c1f9573cd825866.txt lore
b4d65c8e57797e496834f5f6d9d3e49e.txt belles_lettres
b65707c01e68cc6d4d59e18d9f98f423.txt lore
b8a039ba1694ce7ce87737ce5c7480d8.txt news
b998ac20277e09a1c3fecbdfb028b33a.txt lore
ba6843edc446617d1e6e5ec53246d849.txt lore
bb6d375a8b847c7c10f9bdbf7324eb03.txt lore
bbbda4cef7aeb20352c9f1d9b453a9e5.txt lore
be6f1bd428b9933bedbc6bd401868415.txt lore
bf8ce15b10cb746bb1181645a42012db.txt lore
bfd0a578b0ec650d83963ddcf443f7a1.txt lore
c1bdfb06016223b3b2c5e03e02af81f3.txt lore
c22274385e9d77bbb900ef9db6ef66ff.txt fiction
c39fda6fbf81d87bb6508b1bbe7faf93.txt fiction
c5a19f446f960c849d67b25238a08397.txt lore
c65f6ecdb1ba01da0e6525dd525621e1.txt editorial
c942ba590a82fd0827b79e3d6bfb25d3.txt lore
c9497d141930518b8005ba352b4d1637.txt hobbies
cb24d378b3966cf4f3f663f8b13430f2.txt adventure
ce39b27592fc593d0ee117651b072cc1.txt news
ceacd82d3757974d93538f67b74bc25e.txt news
cfdd298764ed82fa2304e427dcb53db9.txt editorial
d027a28847a6228383dd9594f0984bdf.txt lore
d1f9469856a51f6007f0f785aadf8c1f.txt news
d59cd5ad1285a9094a1f82a67fe4ba7b.txt lore
d5aa7d7a519c1600db10ad01a00a7e3a.txt lore
d86c9cee65263cdfddbfaaffab1aeeb7.txt news
dc713f9e699e9e610b458b5c991ce514.txt lore
dc89c7bfd3f0eefd385f0a81c1a59981.txt lore
dc9a7b20833ff389ae573597095f253d.txt lore
dcacb995ec95ede56ba389128922603c.txt lore
dd1a33aada4ffb0564f709c10b95cedc.txt lore
e058a15d26f17f7193a032eed51bbbfc.txt editorial
e2daacfa9c33ea659beaa1a7763bfe57.txt news
e43c7ff67adf6fdd0710c0ec91776481.txt lore
e852750e57424cf3e5968b6a3f642553.txt lore
e88e97dfcade103cef59919bf49f46d3.txt lore
eb6bf7af7572cc1fa1a9aa36c0d0feb3.txt hobbies
ecf327ee7344767f939a3e7695607be5.txt news
ef98917ffbb5b1f6e3ce0428d47f2f23.txt lore
f083fda6715b3b3860162e8367ea1209.txt hobbies
f2b173d5ffa6eda874a71aea5ba076d2.txt news
f3b16a0072a6afc3a64e592f6c8ab78b.txt editorial
f433e3a3fdf6455b68183790d72f7fd8.txt news
f7099ffdcda8a3e231652cdfbdfe1d26.txt editorial
fc97d173fc6d18448bd334ccdbf36e4c.txt hobbies
fdcc797bb8b504885a2ce07017555f33.txt news

View File

@ -0,0 +1,344 @@
#!/usr/bin/env python
"""Porter Stemming Algorithm
This is the Porter stemming algorithm, ported to Python from the
version coded up in ANSI C by the author. It may be be regarded
as canonical, in that it follows the algorithm presented in
Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
no. 3, pp 130-137,
only differing from it at the points maked --DEPARTURE-- below.
See also http://www.tartarus.org/~martin/PorterStemmer
The algorithm as described in the paper could be exactly replicated
by adjusting the points of DEPARTURE, but this is barely necessary,
because (a) the points of DEPARTURE are definitely improvements, and
(b) no encoding of the Porter stemmer I have seen is anything like
as exact as this version, even with the points of DEPARTURE!
Vivake Gupta (v@nano.com)
Release 1: January 2001
Further adjustments by Santiago Bruno (bananabruno@gmail.com)
to allow word input not restricted to one word per line, leading
to:
release 2: July 2008
"""
import sys
class PorterStemmer:
    """Porter stemmer (Porter, 1980), ported from the author's ANSI C version.

    The stemmer operates destructively on an internal buffer ``b`` between
    the indices ``k0`` and ``k`` (inclusive); ``j`` is a scratch offset set
    by ends() and read by the rewrite helpers.  Only lower-case words are
    handled — the caller must lower-case before calling stem().

    NOTE(review): indentation was reconstructed from the canonical reference
    implementation; the pasted source had all leading whitespace stripped.
    """

    def __init__(self):
        """The main part of the stemming algorithm starts here.

        b is a buffer holding a word to be stemmed.  The letters are in
        b[k0], b[k0+1] ... ending at b[k].  In fact k0 = 0 in this demo
        program.  k is readjusted downwards as the stemming progresses.
        Zero termination is not in fact used in the algorithm.

        Note that only lower case sequences are stemmed.  Forcing to lower
        case should be done before stem(...) is called.
        """
        self.b = ""   # buffer for word to be stemmed
        self.k = 0    # index of the last letter of the current stem
        self.k0 = 0   # index of the first letter (always 0 in this program)
        self.j = 0    # j is a general offset into the string

    def cons(self, i):
        """cons(i) is TRUE <=> b[i] is a consonant.

        Returns int 0/1 (Python-2-era style), not bool.
        """
        if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' or self.b[i] == 'o' or self.b[i] == 'u':
            return 0
        if self.b[i] == 'y':
            # 'y' is a consonant at the start of the word; elsewhere it is a
            # consonant exactly when the preceding letter is not one.
            if i == self.k0:
                return 1
            else:
                return (not self.cons(i - 1))
        return 1

    def m(self):
        """m() measures the number of consonant sequences between k0 and j.

        If c is a consonant sequence and v a vowel sequence, and <..>
        indicates arbitrary presence,

           <c><v>       gives 0
           <c>vc<v>     gives 1
           <c>vcvc<v>   gives 2
           <c>vcvcvc<v> gives 3
           ....
        """
        n = 0
        i = self.k0
        # Skip an optional leading consonant sequence.
        while 1:
            if i > self.j:
                return n
            if not self.cons(i):
                break
            i = i + 1
        i = i + 1
        # Each full pass below consumes one v...c... pair and bumps n.
        while 1:
            while 1:
                if i > self.j:
                    return n
                if self.cons(i):
                    break
                i = i + 1
            i = i + 1
            n = n + 1
            while 1:
                if i > self.j:
                    return n
                if not self.cons(i):
                    break
                i = i + 1
            i = i + 1

    def vowelinstem(self):
        """vowelinstem() is TRUE <=> k0,...j contains a vowel"""
        for i in range(self.k0, self.j + 1):
            if not self.cons(i):
                return 1
        return 0

    def doublec(self, j):
        """doublec(j) is TRUE <=> j,(j-1) contain a double consonant."""
        if j < (self.k0 + 1):
            return 0
        if (self.b[j] != self.b[j-1]):
            return 0
        return self.cons(j)

    def cvc(self, i):
        """cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
        and also if the second c is not w, x or y.  This is used when trying
        to restore an e at the end of a short word, e.g.

           cav(e), lov(e), hop(e), crim(e), but
           snow, box, tray.
        """
        if i < (self.k0 + 2) or not self.cons(i) or self.cons(i-1) or not self.cons(i-2):
            return 0
        ch = self.b[i]
        if ch == 'w' or ch == 'x' or ch == 'y':
            return 0
        return 1

    def ends(self, s):
        """ends(s) is TRUE <=> k0,...k ends with the string s.

        Side effect: on success, j is set to the index just before the
        suffix, which the rewrite helpers (setto/r) rely on.
        """
        length = len(s)
        if s[length - 1] != self.b[self.k]: # tiny speed-up
            return 0
        if length > (self.k - self.k0 + 1):
            return 0
        if self.b[self.k-length+1:self.k+1] != s:
            return 0
        self.j = self.k - length
        return 1

    def setto(self, s):
        """setto(s) sets (j+1),...k to the characters in the string s, readjusting k."""
        length = len(s)
        self.b = self.b[:self.j+1] + s + self.b[self.j+length+1:]
        self.k = self.j + length

    def r(self, s):
        """r(s) calls setto(s) only when the stem before the suffix is non-empty (m() > 0)."""
        if self.m() > 0:
            self.setto(s)

    def step1ab(self):
        """step1ab() gets rid of plurals and -ed or -ing. e.g.

           caresses  ->  caress
           ponies    ->  poni
           ties      ->  ti
           caress    ->  caress
           cats      ->  cat

           feed      ->  feed
           agreed    ->  agree
           disabled  ->  disable

           matting   ->  mat
           mating    ->  mate
           meeting   ->  meet
           milling   ->  mill
           messing   ->  mess

           meetings  ->  meet
        """
        if self.b[self.k] == 's':
            if self.ends("sses"):
                self.k = self.k - 2
            elif self.ends("ies"):
                self.setto("i")
            elif self.b[self.k - 1] != 's':
                self.k = self.k - 1
        if self.ends("eed"):
            if self.m() > 0:
                self.k = self.k - 1
        elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
            self.k = self.j
            if self.ends("at"):   self.setto("ate")
            elif self.ends("bl"): self.setto("ble")
            elif self.ends("iz"): self.setto("ize")
            elif self.doublec(self.k):
                # Undouble a final double consonant unless it is l, s or z.
                self.k = self.k - 1
                ch = self.b[self.k]
                if ch == 'l' or ch == 's' or ch == 'z':
                    self.k = self.k + 1
            elif (self.m() == 1 and self.cvc(self.k)):
                self.setto("e")

    def step1c(self):
        """step1c() turns terminal y to i when there is another vowel in the stem."""
        if (self.ends("y") and self.vowelinstem()):
            self.b = self.b[:self.k] + 'i' + self.b[self.k+1:]

    def step2(self):
        """step2() maps double suffices to single ones.

        So -ization ( = -ize plus -ation) maps to -ize etc.  Note that the
        string before the suffix must give m() > 0.
        """
        # Dispatch on the second-to-last letter to avoid testing every suffix.
        if self.b[self.k - 1] == 'a':
            if self.ends("ational"):   self.r("ate")
            elif self.ends("tional"):  self.r("tion")
        elif self.b[self.k - 1] == 'c':
            if self.ends("enci"):      self.r("ence")
            elif self.ends("anci"):    self.r("ance")
        elif self.b[self.k - 1] == 'e':
            if self.ends("izer"):      self.r("ize")
        elif self.b[self.k - 1] == 'l':
            if self.ends("bli"):       self.r("ble") # --DEPARTURE--
            # To match the published algorithm, replace this phrase with
            #   if self.ends("abli"):      self.r("able")
            elif self.ends("alli"):    self.r("al")
            elif self.ends("entli"):   self.r("ent")
            elif self.ends("eli"):     self.r("e")
            elif self.ends("ousli"):   self.r("ous")
        elif self.b[self.k - 1] == 'o':
            if self.ends("ization"):   self.r("ize")
            elif self.ends("ation"):   self.r("ate")
            elif self.ends("ator"):    self.r("ate")
        elif self.b[self.k - 1] == 's':
            if self.ends("alism"):     self.r("al")
            elif self.ends("iveness"): self.r("ive")
            elif self.ends("fulness"): self.r("ful")
            elif self.ends("ousness"): self.r("ous")
        elif self.b[self.k - 1] == 't':
            if self.ends("aliti"):     self.r("al")
            elif self.ends("iviti"):   self.r("ive")
            elif self.ends("biliti"):  self.r("ble")
        elif self.b[self.k - 1] == 'g': # --DEPARTURE--
            if self.ends("logi"):      self.r("log")
        # To match the published algorithm, delete this phrase

    def step3(self):
        """step3() deals with -ic-, -full, -ness etc.  Similar strategy to step2."""
        if self.b[self.k] == 'e':
            if self.ends("icate"):     self.r("ic")
            elif self.ends("ative"):   self.r("")
            elif self.ends("alize"):   self.r("al")
        elif self.b[self.k] == 'i':
            if self.ends("iciti"):     self.r("ic")
        elif self.b[self.k] == 'l':
            if self.ends("ical"):      self.r("ic")
            elif self.ends("ful"):     self.r("")
        elif self.b[self.k] == 's':
            if self.ends("ness"):      self.r("")

    def step4(self):
        """step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
        # Each branch only recognises the suffix (setting j); the actual
        # removal happens once at the bottom, guarded by m() > 1.
        if self.b[self.k - 1] == 'a':
            if self.ends("al"): pass
            else: return
        elif self.b[self.k - 1] == 'c':
            if self.ends("ance"): pass
            elif self.ends("ence"): pass
            else: return
        elif self.b[self.k - 1] == 'e':
            if self.ends("er"): pass
            else: return
        elif self.b[self.k - 1] == 'i':
            if self.ends("ic"): pass
            else: return
        elif self.b[self.k - 1] == 'l':
            if self.ends("able"): pass
            elif self.ends("ible"): pass
            else: return
        elif self.b[self.k - 1] == 'n':
            if self.ends("ant"): pass
            elif self.ends("ement"): pass
            elif self.ends("ment"): pass
            elif self.ends("ent"): pass
            else: return
        elif self.b[self.k - 1] == 'o':
            if self.ends("ion") and (self.b[self.j] == 's' or self.b[self.j] == 't'): pass
            elif self.ends("ou"): pass
            # takes care of -ous
            else: return
        elif self.b[self.k - 1] == 's':
            if self.ends("ism"): pass
            else: return
        elif self.b[self.k - 1] == 't':
            if self.ends("ate"): pass
            elif self.ends("iti"): pass
            else: return
        elif self.b[self.k - 1] == 'u':
            if self.ends("ous"): pass
            else: return
        elif self.b[self.k - 1] == 'v':
            if self.ends("ive"): pass
            else: return
        elif self.b[self.k - 1] == 'z':
            if self.ends("ize"): pass
            else: return
        else:
            return
        if self.m() > 1:
            self.k = self.j

    def step5(self):
        """step5() removes a final -e if m() > 1, and changes -ll to -l if
        m() > 1.
        """
        self.j = self.k
        if self.b[self.k] == 'e':
            a = self.m()
            if a > 1 or (a == 1 and not self.cvc(self.k-1)):
                self.k = self.k - 1
        if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
            self.k = self.k -1

    def stem(self, p, i, j):
        """In stem(p,i,j), p is a char pointer, and the string to be stemmed
        is from p[i] to p[j] inclusive.  Typically i is zero and j is the
        offset to the last character of the string.  The stemmer adjusts the
        characters p[i] ... p[j] and returns the stemmed word.  Stemming
        never increases word length.
        """
        # copy the parameters into statics
        self.b = p
        self.k = j
        self.k0 = i
        if self.k <= self.k0 + 1:
            return self.b # --DEPARTURE--

        # With this line, strings of length 1 or 2 don't go through the
        # stemming process, although no mention is made of this in the
        # published algorithm. Remove the line to match the published
        # algorithm.

        self.step1ab()
        self.step1c()
        self.step2()
        self.step3()
        self.step4()
        self.step5()
        return self.b[self.k0:self.k+1]

View File

@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
# imports
import os
@ -6,7 +7,7 @@ import random
import sys
import math
import re
from PorterStemmer import PorterStemmer
# config variables
@ -129,32 +130,75 @@ class multiclassClassifier:
def bayes(self, text, termfrequenciesOfClasses, termCount, percentage):
result = 1.0
wordcount = 0.0
for line in text:
thisline = line.split(" ");
for word in thisline:
for word in thisline:
word = self.clean_word(word)
if word <> "":
'''
Accuracy: 21.2121%
Precision per class: adventure:40.0% belles_lettres:22.2222% editorial:17.6471% fiction:36.3636% government:0.0% hobbies:11.1111% learned:0.0% lore:17.5439% mystery:0.0% news:23.4043% romance:0.0%
Precision Macroavg: 15.2993%
Precision Microavg: 21.2121%
Recall per class: adventure:20.0% belles_lettres:14.8148% editorial:30.0% fiction:36.3636% government:0.0% hobbies:7.6923% learned:0.0% lore:55.5556% mystery:0.0% news:68.75% romance:0.0%
Recall Microavg: 2.6217%
if termfrequenciesOfClasses.has_key(str(word)):
result += math.log(1./((termfrequenciesOfClasses[word]+1.)/(termCount+1))) #gewichte häufig auftretende worter am wenigsten, wenigauftretende am stärksten + termcount -> was ist das?
...
return result
Accuracy: 21.8182%
Precision per class: adventure:40.0% belles_lettres:22.2222% editorial:20.0% fiction:36.3636% government:0.0% hobbies:20.0% learned:0.0% lore:17.5439% mystery:0.0% news:22.9167% romance:0.0%
Precision Macroavg: 16.2769%
Precision Microavg: 21.8182%
Recall per class: adventure:20.0% belles_lettres:14.8148% editorial:30.0% fiction:36.3636% government:0.0% hobbies:15.3846% learned:0.0% lore:55.5556% mystery:0.0% news:68.75% romance:0.0%
Recall Microavg: 2.7149%
if termfrequenciesOfClasses.has_key(str(word)):
wordcount += 1
result += math.log(1./((termfrequenciesOfClasses[word]+1.)/(termCount+1))) #gewichte häufig auftretende worter am wenigsten, wenigauftretende am stärksten + termcount -> was ist das?
...
result += math.log(percentage)
result += math.log(wordcount)
return result
'''
#result = 1.0
#for word in text:
if termfrequenciesOfClasses.has_key(str(word)):
result += math.log((termfrequenciesOfClasses[word]+1.)/(termCount+1))
else:
result += math.log(1./(termCount+1))
result += math.log(percentage)
return result
wordcount += 1
#result += math.log(1./(termfrequenciesOfClasses[word]+1.))
#result += math.log((termfrequenciesOfClasses[word]+1.)/(termCount+1)) #gewichte häufig auftretende terme am stärksten
#result += math.log(1./((termfrequenciesOfClasses[word]+1.)/(termCount+1))) #gewichte häufig auftretende worter am wenigsten, wenigauftretende am stärksten + termcount -> was ist das?
result += termfrequenciesOfClasses[word]
#print "known word: "+word
#else:
#result += math.log(1./(termCount+1))
#result += math.log(1.)
#print "new word: "+word
#result += math.log(percentage)
result *= percentage
#result += math.log(wordcount)
result *= wordcount
#return result
return math.log(result)
def clean_word(self, word):
#print word
word = word.lower() #lowercase
word = word.strip() # remove lineendings etc
#return word
word = "".join(re.findall("[a-z]+", word)) #only characters
#return word
if len(word) <= 4: #only words longer 4
if len(word) <= 4: #only words longer 4
return ""
#return word
if self.isStopWord(word): #stopwordfilter
return ""
#print word
p = PorterStemmer() #stemming
word = p.stem(word, 0,len(word)-1)
return word
def isStopWord(self,word):
@ -212,6 +256,7 @@ if __name__ == '__main__':
maxRes = temp
mc.filesToPrediction[infile] = cl
f.close()
print currentPath + " " + mc.filesToPrediction[infile]
mc.writePredictionFile()