college/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py

# -*- coding: utf-8 -*-
# imports

import os
import shutil
import random
import sys
import math
import re
from PorterStemmer import PorterStemmer


# config variables
actualDir = os.path.dirname(os.path.realpath(__file__))
dataDir   = os.path.join(actualDir, '../data')
trainDir  = os.path.join(dataDir, 'u4_train')
testDir   = os.path.join(dataDir, 'u4_test')
stopwords = os.path.join(dataDir, 'stopwords/english')


'''
################################################################################################################################
           --> CLASS Trainingsset <--
################################################################################################################################
'''

class trainingsset:

    #def __init__(self):


    def createTrainingsset(self):
        self.splitTrainingsdataRandomly(self) # first split our data into trainings- and testdata


    # copies files randomly to new directories. Each directory will contain fileCount / 2 numbers of files
    # If fileCount is uneven /trainingsdata will contain one file more than /testdata
    def splitTrainingsdataRandomly(self):
        for dirpath, dirnames, filenames in os.walk(trainDir, topdown=False):
            newTrainDir = dirpath+'/trainingsdata'
            newTestDir  = dirpath+'/testdata'
            fileCount = len(filenames)

            if(fileCount > 0):
                #remove old dirs if they already exist
                if os.path.isdir(newTrainDir):
                    shutil.rmtree(newTrainDir)
                if os.path.isdir(newTestDir):
                    shutil.rmtree(newTestDir)

                # create new directories
                os.mkdir(newTestDir)
                os.mkdir(newTrainDir)
                numberOfFilesInTraining = 0
                numberOfFilesInTest     = 0

                for actualFile in filenames:
                    fileCopied = False

                    while(fileCopied == False):
                        randomBool = bool(random.getrandbits(1))
                        if(randomBool and numberOfFilesInTraining <= fileCount / 2):
                            numberOfFilesInTraining += 1
                            shutil.copy(dirpath+'/'+actualFile, dirpath+'/trainingsdata/'+actualFile)
                            fileCopied = True
                        else:
                            if numberOfFilesInTest < fileCount / 2:
                                numberOfFilesInTest += 1
                                fileCopied = True
                                shutil.copy(dirpath+'/'+actualFile, dirpath+'/testdata/'+actualFile)


'''
################################################################################################################################
           --> CLASS MulticlassClassifier <--
################################################################################################################################
'''
class multiclassClassifier:

        filesToPrediction = {}
        termfrequenciesOfClasses = {};
        countClasses = {}
        percentage = {}

        def writePredictionFile(self):
            with open(actualDir+'/../G22_predictions.txt', 'w') as f:
                for k in sorted(self.filesToPrediction.iterkeys()):
                    f.write(str(k)+'\t'+str(self.filesToPrediction[k])+'\n')
            f.closed
            return

        # calculates all necessary stuff for multiclass classifier
        def getTermfrequenciesOfClasses(self):
            listing = os.listdir(trainDir)
            for classes in listing: # classes
                self.termfrequenciesOfClasses[classes] = {}
            for classes in self.termfrequenciesOfClasses.keys():
                currentPath = trainDir+'/'+classes
                listing = os.listdir(currentPath)
                for infile in listing:
                    if self.countClasses.has_key(classes):
                        self.countClasses[classes] += 1
                    else:
                        self.countClasses[classes] = 1
                    currentPath = trainDir+'/'+classes+'/'+infile
                    # update termfrequency for specific class:
                    self.termfrequenciesOfClasses[classes] = self.updateDictonary(currentPath, self.termfrequenciesOfClasses[classes])


        # "incudludes" a file into the termfrequency dictonary
        def updateDictonary(self, pathToFile, dictonary):
            f = open(pathToFile, 'r')
            lines = f.readlines();
            for line in lines:
                thisline = line.split(" ");
                for word in thisline:
                    word = self.clean_word(word)
                    if word != "":
                        if dictonary.has_key(word):
                            dictonary[str(word)] += 1
                        else:
                            dictonary[str(word)] = 1
            f.close()
            return dictonary


        def bayes(self, text, termfrequenciesOfClasses, termCount, percentage, cl):
            result = 1.0
            wordcount = 0.0
            notwordcount = 0.0
            for line in text:
                thisline = line.split(" ");
                for word in thisline:
                    word = self.clean_word(word)
                    if word != "":
                        '''
                        Accuracy: 21.2121%
                        Precision per class: adventure:40.0% belles_lettres:22.2222% editorial:17.6471% fiction:36.3636% government:0.0% hobbies:11.1111% learned:0.0% lore:17.5439% mystery:0.0% news:23.4043% romance:0.0%
                        Precision Macroavg: 15.2993%
                        Precision Microavg: 21.2121%
                        Recall per class: adventure:20.0% belles_lettres:14.8148% editorial:30.0% fiction:36.3636% government:0.0% hobbies:7.6923% learned:0.0% lore:55.5556% mystery:0.0% news:68.75% romance:0.0%
                        Recall Microavg: 2.6217%

                        if termfrequenciesOfClasses.has_key(str(word)):
                            result += math.log(1./((termfrequenciesOfClasses[word]+1.)/(termCount+1))) #gewichte häufig auftretende worter am wenigsten, wenigauftretende am stärksten + termcount -> was ist das?
                        ...
                        return result

                        Accuracy: 21.8182%
                        Precision per class: adventure:40.0% belles_lettres:22.2222% editorial:20.0% fiction:36.3636% government:0.0% hobbies:20.0% learned:0.0% lore:17.5439% mystery:0.0% news:22.9167% romance:0.0%
                        Precision Macroavg: 16.2769%
                        Precision Microavg: 21.8182%
                        Recall per class: adventure:20.0% belles_lettres:14.8148% editorial:30.0% fiction:36.3636% government:0.0% hobbies:15.3846% learned:0.0% lore:55.5556% mystery:0.0% news:68.75% romance:0.0%
                        Recall Microavg: 2.7149%

                        if termfrequenciesOfClasses.has_key(str(word)):
                            wordcount += 1
                            result += math.log(1./((termfrequenciesOfClasses[word]+1.)/(termCount+1))) #gewichte häufig auftretende worter am wenigsten, wenigauftretende am stärksten + termcount -> was ist das?
                        ...
                        result += math.log(percentage)
                        result += math.log(wordcount)
                        return result

                        Accuracy: 33.9394%
                        Precision per class: adventure:0.0% belles_lettres:36.8421% editorial:0.0% fiction:0.0% government:20.0% hobbies:0.0% learned:38.9831% lore:0.0% mystery:36.8421% news:0.0% romance:52.9412%
                        Precision Macroavg: 16.8735%
                        Precision Microavg: 33.9394%
                        Recall per class: adventure:0.0% belles_lettres:25.9259% editorial:0.0% fiction:0.0% government:90.9091% hobbies:0.0% learned:76.6667% lore:0.0% mystery:87.5% news:0.0% romance:81.8182%
                        Recall Microavg: 4.8866%

                        if termfrequenciesOfClasses.has_key(str(word)):
                            wordcount += 1
                            result += termfrequenciesOfClasses[word]/(termCount+1)
                            #print "known word: "+word
                        else:
                            result -= 1./(termCount+1)
                            #print "new word: "+word
                        ...
                        result /= len(termfrequenciesOfClasses)
                        print cl +" "+str(result)
                        return math.log(result)

                        Accuracy: 37.5758%
                        Precision per class: adventure:66.6667% belles_lettres:36.5385% editorial:0.0% fiction:0.0% government:28.0% hobbies:0.0% learned:36.8421% lore:0.0% mystery:50.0% news:100.0% romance:37.5%
                        Precision Macroavg: 32.3225%
                        Precision Microavg: 37.5758%
                        Recall per class: adventure:20.0% belles_lettres:70.3704% editorial:0.0% fiction:0.0% government:63.6364% hobbies:0.0% learned:70.0% lore:0.0% mystery:50.0% news:18.75% romance:54.5455%
                        Recall Microavg: 5.6777%

                        if termfrequenciesOfClasses.has_key(str(word)):
                            wordcount += 1
                            result += termfrequenciesOfClasses[word]/(termCount+1)
                        else:
                            result -= 1./(termCount+1)
                        ...
                        result *= wordcount
                        result /= len(termfrequenciesOfClasses)
                        #return result
                        print cl +" "+str(result)
                        return math.log(result)

                        Accuracy: 40.6061%
                        Precision per class: adventure:40.0% belles_lettres:44.7368% editorial:0.0% fiction:0.0% government:23.6842% hobbies:66.6667% learned:40.0% lore:0.0% mystery:46.1538% news:100.0% romance:47.3684%
                        Precision Macroavg: 37.1464%
                        Precision Microavg: 40.6061%
                        Recall per class: adventure:20.0% belles_lettres:62.963% editorial:0.0% fiction:0.0% government:81.8182% hobbies:15.3846% learned:60.0% lore:0.0% mystery:75.0% news:25.0% romance:81.8182%
                        Recall Microavg: 6.3992%

                        if termfrequenciesOfClasses.has_key(str(word)):
                            wordcount += 1
                            result += (termfrequenciesOfClasses[word]/(termCount+1))*(1-percentage)
                        ...
                        result *= wordcount
                        result /= len(termfrequenciesOfClasses)
                        print cl +" "+str(result)
                        return math.log(result)

                        Accuracy: 46.0606%
                        Precision per class: adventure:25.0% belles_lettres:35.3846% editorial:0.0% fiction:31.25% government:40.0% hobbies:66.6667% learned:72.0% lore:20.0% mystery:66.6667% news:70.5882% romance:25.0%
                        Precision Macroavg: 41.1415%
                        Precision Microavg: 46.0606%
                        Recall per class: adventure:20.0% belles_lettres:85.1852% editorial:0.0% fiction:45.4545% government:36.3636% hobbies:46.1538% learned:60.0% lore:5.5556% mystery:50.0% news:75.0% romance:9.0909%
                        Recall Microavg: 7.8675%

                        if termfrequenciesOfClasses.has_key(str(word)):
                            wordcount += 1
                            result += (termfrequenciesOfClasses[word]/(termCount+1))
                        else:
                            notwordcount += 1
                            result += (1./(termCount+1))
                        ...
                        result *= (1-percentage)*wordcount
                        result /= percentage*notwordcount
                        print cl +" "+str(result)
                        return math.log(result)
                        '''
                        #result = 1.0
                        #for word in text:
                        if termfrequenciesOfClasses.has_key(str(word)):
                            wordcount += 1
                            #result += math.log(1./(termfrequenciesOfClasses[word]+1.))
                            #result += math.log((termfrequenciesOfClasses[word]+1.)/(termCount+1)) #gewichte häufig auftretende terme am stärksten
                            #result += math.log(1./((termfrequenciesOfClasses[word]+1.)/(termCount+1))) #gewichte häufig auftretende worter am wenigsten, wenigauftretende am stärksten + termcount -> was ist das?
                            result += (termfrequenciesOfClasses[word]/(termCount+1))
                            #print "known word: "+word
                        else:
                            notwordcount += 1
                            result += (1./(termCount+1))
                            #result += math.log(1./(termCount+1))
                            #result += math.log(1.)
                            #print "new word: "+word
            #result += math.log(percentage)
            #result /= percentage
            #result += math.log(wordcount)
            #result *= (wordcount/(wordcount + notwordcount))
            result *= (1-percentage)*wordcount
            result /= percentage*notwordcount
            #result /= len(termfrequenciesOfClasses)
            #return result
            #print cl +" "+str(result)
            return math.log(result)

        def clean_word(self, word):
            #print word
            word = word.lower()                                                 #lowercase
            word = word.strip()                                                 # remove lineendings etc
            #return word
            word = "".join(re.findall("[a-z]+", word))                          #only characters
            #return word
            if len(word) <= 4:                                                  #only words longer 4
                return ""
            #return word
            if self.isStopWord(word):                                           #stopwordfilter
                return ""
            #print word
            p = PorterStemmer()                                                 #stemming
            word = p.stem(word, 0,len(word)-1)
            return word

        def isStopWord(self,word):
            for line in open(stopwords,'r').readlines():
                if line.strip() == word:
                    return True
            return False


'''
################################################################################################################################
           --> Main method <--
################################################################################################################################
'''

# main method
if __name__ == '__main__':
    ts = trainingsset()
    #ts.splitTrainingsdataRandomly();   already done -> specific folder structure

    mc = multiclassClassifier()

    # calculates a dictonary depending on all testdata with the form:
    # dictonary[CLASSNAME][WORD] = Integer
    mc.getTermfrequenciesOfClasses()


    # calculates the percentage of P(C) for all given classes
    sumOfClasses = 0.0
    for v in mc.countClasses.values():
        sumOfClasses += v

    for classes in mc.countClasses.keys():
        mc.percentage[classes] = mc.countClasses[classes]/sumOfClasses


    #class_matches = []
    #listing = os.listdir(trainDir)
    #for classes in listing: # classes
    path = trainDir+'/'+classes+'/testdata'
    path = testDir
    listing = os.listdir(path)
    for infile in listing:
        currentPath = testDir+'/'+infile
        #print currentPath
        maxRes      = sys.maxint * -1
        # check all possible classes
        for cl in mc.percentage.keys():
            f = open(currentPath, 'r')
            temp = mc.bayes(f.readlines(), mc.termfrequenciesOfClasses[cl], sumOfClasses, mc.percentage[cl], cl)
            #class_matches.append([infile,cl,temp])
            #print class_matches
            if (temp >= maxRes):
                maxRes = temp
                mc.filesToPrediction[infile] = cl
            f.close()
        print currentPath + " " + mc.filesToPrediction[infile]


    mc.writePredictionFile()