# Provenance (from the patch this module was extracted from):
#   commit  30f1255f8b90f40ce295453e91820d7a8f33faa2
#   author  Vamsi Krishna Davuluri
#   date    Tue, 20 Oct 2009 17:26:21 +0000
#   subject NLTK version of the lib. A badly written irc testbot, and a
#           test file.
#   file    libRLP.py (new file, index 0000000..4d66954), via cgit v0.9.1
"""
The library for natural language processing, and categorizing
information using NLTK
"""
import logging
import math
import os

import nltk
from nltk import tokenize

logger = logging.getLogger(__name__)

# FAQ database read by Match when no explicit path is supplied.
DEFAULT_FAQ_PATH = "./FAQ/sugar.txt"

# A FAQ line is a question when it starts with this marker (any case).
_QUESTION_PREFIX = "q."

# POS tags whose words are kept as keywords.
_KEYWORD_TAGS = ("NN", "JJ", "NNP")


def _keywords(tagged):
    """Return the words of *tagged* whose POS tag is in _KEYWORD_TAGS."""
    return [tag[0] for tag in tagged if tag[1] in _KEYWORD_TAGS]


def _question_text(line):
    """Return *line* without its leading 'Q.' marker, if it has one."""
    if line[:2].lower() == _QUESTION_PREFIX:
        return line[2:]
    return line


class InputBreaker(object):
    """Split a text into sentences and keep the ones that look like queries.

    Attributes:
        fSentences: Initialised to an empty list; nothing in this module
            fills it.
        qEntitySentences: POS-tagged sentences (lists of ``(word, tag)``
            pairs) that passed the check in ``determineSentenceType``.
    """

    def __init__(self, text):
        """Tokenize *text* and classify every sentence found in it."""
        self.fSentences = []
        self.qEntitySentences = []
        self.analyzeText(text)

    def analyzeText(self, text):
        """Split *text* with the Punkt English model and classify each part.

        Args:
            text: The raw input string.
        """
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        for sentence in tokenizer.tokenize(text):
            self.determineSentenceType(sentence)

    def determineSentenceType(self, sentence):
        """POS-tag *sentence* and record it when it matches the heuristic.

        A sentence is kept when its first token is tagged ``NNP`` and its
        last ``PRP``, or its first is ``WP`` and its last ``NN``.

        Args:
            sentence: A single sentence string.
        """
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        if not tagged:
            # Nothing to inspect; indexing [0] / [-1] below would fail.
            return
        first_tag = tagged[0][1]
        last_tag = tagged[-1][1]
        if ((first_tag == 'NNP' and last_tag == 'PRP')
                or (first_tag == 'WP' and last_tag == 'NN')):
            self.qEntitySentences.append(tagged)


class Match(object):
    """Find the FAQ question that best matches POS-tagged user questions.

    Attributes:
        faq_path: Path of the FAQ text file that is searched.
        mostprobableQ: The raw FAQ line (trailing newline included) chosen
            for the most recent question that had a match, or ``""``.
        tomatchwords: Keywords from the last ``extractKeywords`` call.
    """

    def __init__(self, questions, faq_path=DEFAULT_FAQ_PATH):
        """Match every question in *questions* against the FAQ file.

        Args:
            questions: Iterable of POS-tagged sentences, each a sequence of
                ``(word, tag)`` pairs.
            faq_path: FAQ file to search; defaults to ``./FAQ/sugar.txt``.
        """
        self.faq_path = faq_path
        self.mostprobableQ = ""
        self.tomatchwords = []
        for question in questions:
            self.findOptimalQuestion(question)

    def findOptimalQuestion(self, question):
        """Scan the FAQ for the line that best matches *question*.

        Every FAQ line starting with ``Q.`` is scored by how many of the
        question's keywords occur in it (case-insensitive substring test).
        Lines with at least one hit are ranked by ``doMath`` and the winner
        is stored in ``self.mostprobableQ``. When nothing matches, the
        previous value of ``self.mostprobableQ`` is left untouched.

        Args:
            question: A POS-tagged sentence as ``(word, tag)`` pairs.
        """
        keywords = self.extractKeywords(question)
        # FAQ lines are compared lower-cased, so the keywords must be too;
        # otherwise capitalised words (all NNP keywords) could never match.
        lowered_keywords = [word.lower() for word in keywords]

        candidates = []
        # The file is reopened per question so that several questions can be
        # processed by one Match instance.
        with open(self.faq_path, "r") as faq:
            for line in faq:
                lowered_line = line.lower()
                if not lowered_line.startswith(_QUESTION_PREFIX):
                    continue
                hits = sum(1 for word in lowered_keywords
                           if word in lowered_line)
                if hits > 0:
                    candidates.append([keywords, hits, line])

        best = self.doMath(candidates)
        if best:
            self.mostprobableQ = best[2]
        logger.debug("most probable question: %r", self.mostprobableQ)

    def doMath(self, metrics):
        """Pick the best candidate from *metrics*.

        For each candidate the score is ``abs(diff - success)``, where
        ``diff`` is the absolute difference between the number of keywords
        in the FAQ question and in the user question. A candidate replaces
        the current best when its score is below the best score so far
        (which starts at 10), or when ``success < diff`` while the best
        score is non-zero and ``success`` is positive.

        Args:
            metrics: List of ``[user_keywords, success, faq_line]`` entries.

        Returns:
            The winning entry from *metrics*, or ``[]`` when none qualifies.
        """
        best = []
        best_score = 10
        for metric in metrics:
            user_keywords = metric[0]
            success = metric[1]
            tagged = nltk.pos_tag(
                nltk.word_tokenize(_question_text(metric[2])))
            # Uses the stateless helper so self.tomatchwords is not
            # overwritten with FAQ keywords while ranking.
            faq_keywords = _keywords(tagged)
            diff = abs(len(faq_keywords) - len(user_keywords))
            score = abs(diff - success)
            if score < best_score or (
                    success < diff and best_score != 0 and success > 0):
                best_score = score
                best = metric
        logger.debug("best score %s for %r", best_score, best)
        return best

    def extractKeywords(self, question):
        """Collect the words of *question* tagged ``NN``, ``JJ`` or ``NNP``.

        The result is also stored in ``self.tomatchwords``.

        Args:
            question: A POS-tagged sentence as ``(word, tag)`` pairs.

        Returns:
            The list of keyword strings, in sentence order.
        """
        self.tomatchwords = _keywords(question)
        logger.debug("keywords of %r: %r", question, self.tomatchwords)
        return self.tomatchwords

    def reportAnswer(self):
        """Return the FAQ text stored for ``self.mostprobableQ``.

        The FAQ content is split on the matched question line; of the text
        that follows it, the second ``'\\n'``-separated line is returned.

        Returns:
            That line, or ``""`` when no question has been matched, the
            question is not present in the file, or fewer than two lines
            follow it.
        """
        if not self.mostprobableQ:
            # str.split('') raises ValueError; there is nothing to look up.
            return ""
        with open(self.faq_path, "r") as faq:
            content = faq.read()
        sections = content.split(self.mostprobableQ)
        if len(sections) < 2:
            return ""
        following_lines = sections[1].split('\n')
        if len(following_lines) < 2:
            return ""
        return following_lines[1]