web mining update
This commit is contained in:
parent
80e36cf86e
commit
49d6eef95d
@ -6,6 +6,7 @@ import random
|
||||
|
||||
|
||||
|
||||
|
||||
# config variables
|
||||
actualDir = os.path.dirname(os.path.realpath(__file__))
|
||||
dataDir = os.path.join(actualDir, '../data')
|
||||
@ -23,18 +24,10 @@ class trainingsset:
|
||||
|
||||
#def __init__(self):
|
||||
|
||||
classes = ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'learned', 'lore','mystery','news','romantic']
|
||||
|
||||
def createTrainingsset(self):
|
||||
self.splitTrainingsdataRandomly(self) # first split our data into trainings- and testdata
|
||||
|
||||
|
||||
def getClassesToTrain(self):
|
||||
for dirpath, dirnames, filenames in os.walk(trainDir):
|
||||
# TODO: implement
|
||||
pass
|
||||
|
||||
|
||||
|
||||
|
||||
# Copies the files randomly into two new directories. Each directory will contain fileCount / 2 files.
|
||||
# If fileCount is odd, /trainingsdata will contain one more file than /testdata.
|
||||
@ -82,8 +75,9 @@ class trainingsset:
|
||||
'''
|
||||
class multiclassClassifier:
|
||||
|
||||
filesToPrediction = {};
|
||||
testFiles = {}
|
||||
|
||||
termfrequenciesOfClasses = {};
|
||||
countClass = {}
|
||||
|
||||
|
||||
def writePredictionFile(self):
|
||||
@ -93,13 +87,43 @@ class multiclassClassifier:
|
||||
f.closed
|
||||
return
|
||||
|
||||
# reads the first line of every test data file (from /u4_train) into the testFiles dictionary, keyed by file name
|
||||
def getTestData(self):
    """Load the first line of every file in ``testDir`` into ``self.testFiles``.

    Each directory entry name becomes a key of ``self.testFiles``; the value
    is the result of a single ``readline()`` call on that file, i.e. its
    first line including the trailing newline if there is one.

    NOTE(review): only the first line is read -- presumably every test file
    stores its whole text on one line; confirm against the data set.
    """
    for infile in os.listdir(testDir):
        # 'with' closes the handle even if reading the file raises
        with open(os.path.join(testDir, infile), 'r') as f:
            self.testFiles[infile] = f.readline()
|
||||
# builds the per-class term-frequency tables and the per-class file counts needed by the multiclass classifier
|
||||
def getTermfrequenciesOfClasses(self):
    """Build the term-frequency table and training-file count of every class.

    Every entry of ``trainDir`` is treated as a class name. For each class,
    all files below ``<trainDir>/<class>/trainingsdata`` are merged into
    ``self.termfrequenciesOfClasses[<class>]`` via ``updateDictonary``, and
    ``self.countClass[<class>]`` is incremented once per file.

    The term-frequency table of each listed class is reset to an empty dict
    first; ``self.countClass`` is not reset, so counts accumulate if this
    method is called more than once.
    """
    # start every class found on disk with an empty term-frequency table
    for className in os.listdir(trainDir):
        self.termfrequenciesOfClasses[className] = {}

    # snapshot the keys so the table entries can be reassigned while looping
    for className in list(self.termfrequenciesOfClasses):
        classDir = os.path.join(trainDir, className, 'trainingsdata')
        for infile in os.listdir(classDir):
            # dict.get replaces has_key(), which no longer exists in Python 3
            self.countClass[className] = self.countClass.get(className, 0) + 1
            # update the term frequencies of this class with the current file
            self.termfrequenciesOfClasses[className] = self.updateDictonary(
                os.path.join(classDir, infile),
                self.termfrequenciesOfClasses[className])
|
||||
|
||||
|
||||
|
||||
# merges the word counts of one file into the given term-frequency dictionary
|
||||
def updateDictonary(self, pathToFile, dictonary):
    """Add the word counts of one file to a term-frequency dictionary.

    Every line of the file is split on single spaces; each token is
    lower-cased, has '.' and ',' replaced by a space, and is then counted.

    :param pathToFile: path of the text file to read
    :param dictonary: mapping token -> count; it is updated in place
    :return: the same ``dictonary`` object, after the update

    NOTE(review): punctuation is replaced by a space rather than removed, so
    "end." is counted under the key "end " and not "end". The line break
    also stays attached to the last token of each line, and consecutive
    spaces produce empty-string tokens. This is kept unchanged here because
    other code may depend on the exact keys.
    """
    # 'with' closes the file even when reading or counting raises
    with open(pathToFile, 'r') as f:
        for line in f:
            for word in line.split(" "):
                word = word.lower().replace(".", " ").replace(",", " ")
                # dict.get replaces has_key(), which no longer exists in Python 3
                dictonary[word] = dictonary.get(word, 0) + 1
    return dictonary
|
||||
|
||||
|
||||
|
||||
|
||||
def bayes(self, listOfFiles):
|
||||
@ -119,10 +143,10 @@ class multiclassClassifier:
|
||||
if __name__ == '__main__':
|
||||
ts = trainingsset()
|
||||
#ts.splitTrainingsdataRandomly();
|
||||
ts.getClassesToTrain();
|
||||
#ts.getClassesToTrain();
|
||||
|
||||
mc = multiclassClassifier()
|
||||
#mc.getTestData()
|
||||
mc.getTermfrequenciesOfClasses()
|
||||
#mc.writePredictionFile()
|
||||
|
||||
|
||||
|
||||
BIN
ss2013/1_Web Mining/Uebungen/4_Uebung/solution.odt
Normal file
BIN
ss2013/1_Web Mining/Uebungen/4_Uebung/solution.odt
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user