web mining update

2013-06-10 10:53:44 +02:00 · 2013-06-10 10:53:44 +02:00 · 49d6eef95d
commit 49d6eef95d
parent 80e36cf86e
2 changed files with 44 additions and 20 deletions
--- a/Mining/Uebungen/4_Uebung/code/naive_bayes.py
+++ b/Mining/Uebungen/4_Uebung/code/naive_bayes.py
@ -6,6 +6,7 @@ import random



+
 # config variables
 actualDir = os.path.dirname(os.path.realpath(__file__))
 dataDir   = os.path.join(actualDir, '../data')
@ -23,18 +24,10 @@ class trainingsset:
    
    #def __init__(self):
    
-    classes = ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'learned', 'lore','mystery','news','romantic']
    
    def createTrainingsset(self):
        self.splitTrainingsdataRandomly(self) # first split our data into trainings- and testdata
-        
-    
-    def getClassesToTrain(self):
-        for dirpath, dirnames, filenames in os.walk(trainDir):
-            # TODO: implement
-            pass
-            
-            
+                        
        
    # Copies files randomly to new directories. Each directory will contain fileCount / 2 files.
    # If fileCount is odd, /trainingsdata will contain one more file than /testdata.
@ -82,8 +75,9 @@ class trainingsset:
 '''
 class multiclassClassifier:
        
-        filesToPrediction = {};
-        testFiles = {}
+
+        termfrequenciesOfClasses = {};
+        countClass = {}
        
        
        def writePredictionFile(self):
@ -93,13 +87,43 @@ class multiclassClassifier:
            f.closed
            return
        
-        # reads all testData from /u4_train to the list 
-        def getTestData(self):
-            listing = os.listdir(testDir)                
-            for infile in listing:
-                f = open(testDir+'/'+infile, 'r')
-                self.testFiles[infile] = f.readline()
-                f.close()
+        # builds, for each class, its term-frequency dictionary and its count of training files
+        def getTermfrequenciesOfClasses(self):
+            listing = os.listdir(trainDir)              
+            for classes in listing: # classes
+                self.termfrequenciesOfClasses[classes] = {}
+            for classes in self.termfrequenciesOfClasses.keys():
+                currentPath = trainDir+'/'+classes+'/trainingsdata'
+                listing = os.listdir(currentPath)  
+                for infile in listing:
+                    if self.countClass.has_key(classes):
+                        self.countClass[classes] += 1
+                    else:
+                        self.countClass[classes] = 1
+                    currentPath = trainDir+'/'+classes+'/trainingsdata/'+infile 
+                    # update termfrequency for specific class:
+                    self.termfrequenciesOfClasses[classes] = self.updateDictonary(currentPath, self.termfrequenciesOfClasses[classes])
+                    
+                    
+                                    
+        # "includes" a file in the term-frequency dictionary by counting its words
+        def updateDictonary(self, pathToFile, dictonary):
+            f = open(pathToFile, 'r')
+            lines = f.readlines();
+            for line in lines:
+                thisline = line.split(" ");
+                for word in thisline:
+                    word = word.lower()
+                    word = word.replace(".", " ")
+                    word = word.replace(",", " ")
+                    if dictonary.has_key(word):
+                        dictonary[str(word)] += 1
+                    else:
+                        dictonary[str(word)] = 1    
+            f.close()
+            return dictonary
+                
+                
        
        
        def bayes(self, listOfFiles):
@ -119,10 +143,10 @@ class multiclassClassifier:
 if __name__ == '__main__':
    ts = trainingsset()
    #ts.splitTrainingsdataRandomly();
-    ts.getClassesToTrain();
+    #ts.getClassesToTrain();
    
    mc = multiclassClassifier()
-    #mc.getTestData()
+    mc.getTermfrequenciesOfClasses()
    #mc.writePredictionFile()
    
    
--- a/Mining/Uebungen/4_Uebung/solution.odt
+++ b/Mining/Uebungen/4_Uebung/solution.odt