author    Vamsi Krishna Davuluri <iwikiwi@huecomundo.themachineninja.org>    2009-10-20 17:26:21 (GMT)
committer Vamsi Krishna Davuluri <iwikiwi@huecomundo.themachineninja.org>    2009-10-20 17:26:21 (GMT)
commit    30f1255f8b90f40ce295453e91820d7a8f33faa2 (patch)
tree      6b19c535a5c878c292c37b8b024d28a5fc21d324 /libRLP.py
NLTK version of the lib, plus a badly written IRC testbot and a test file.
Diffstat (limited to 'libRLP.py')
-rw-r--r--    libRLP.py    102
1 file changed, 102 insertions, 0 deletions
diff --git a/libRLP.py b/libRLP.py
new file mode 100644
index 0000000..4d66954
--- /dev/null
+++ b/libRLP.py
@@ -0,0 +1,102 @@
+'''
+Library for natural language processing and for categorizing
+information using NLTK.
+'''
+import nltk
+from nltk import tokenize
+
+import math
+import os
+
+class InputBreaker:  # splits raw text into sentences and keeps the question-like ones
+    def __init__(self, text):
+        self.fSentences = []
+        self.qEntitySentences = []  # POS-tagged sentences that look like questions
+        self.analyzeText(text)
+        #print self.qEntitySentences
+
+    def analyzeText(self, text):
+        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')  # Punkt sentence splitter
+        sents = tokenizer.tokenize(text)
+        for sentence in sents:
+            self.determineSentenceType(sentence)
+
+    def determineSentenceType(self, sentence):
+        words = nltk.word_tokenize(sentence)
+        tagit = nltk.pos_tag(words)  # [(word, POS tag), ...]
+        #print tagit
+        if (tagit[0][1] == 'NNP' and tagit[-1][1] == 'PRP') or (tagit[0][1] == 'WP' and tagit[-1][1] == 'NN'):
+            self.qEntitySentences.append(tagit)  # crude question heuristic based on first and last POS tags
+
+
+class Match:  # matches tagged user questions against the FAQ file
+    def __init__(self, questions):
+        #print questions
+        self.file = open("./FAQ/sugar.txt", "rb")  # FAQ entries; questions start with 'Q.'
+        self.mostprobableQ = ""
+        self.tomatchwords = []
+        for question in questions:
+            self.findOptimalQuestion(question)
+    def findOptimalQuestion(self, question):
+        self.file = open("./FAQ/sugar.txt", "rb")  # reopen so every question scans the full FAQ
+
+        listofmetricsforsentence = []
+        self.tomatchwords = self.extractKeywords(question)  # keywords from the user question
+        #print self.tomatchwords
+        for line in self.file.readlines():
+            linetosave = line
+            line = line.lower()
+
+            if line[0:2] == 'q.':  # only FAQ question lines are scored
+                successmeter = 0
+                for word in self.tomatchwords:
+                    if line.find(word) != -1:  # count keyword hits in this FAQ question
+                        successmeter += 1
+                    else:
+                        pass
+                if successmeter > 0:
+                    listofmetricsforsentence.append([self.tomatchwords, successmeter, linetosave, ])
+
+        #print listofmetricsforsentence
+        self.mostprobableQ = self.doMath(listofmetricsforsentence)[2]  # best-scoring FAQ question line
+        print self.mostprobableQ
+        self.file.close()
+
+    def doMath(self, metrics):
+        greatestmetric = []
+        metricno = 10  # running best (lowest) score
+        for metric in metrics:
+            userqtags = metric[0]
+            success = metric[1]  # number of keyword hits
+            words = nltk.word_tokenize(metric[2].split('Q.')[1])  # FAQ question text after its 'Q.' prefix
+            tagit = nltk.pos_tag(words)
+            databaseqtags = self.extractKeywords(tagit)  # keywords of the FAQ question itself
+            diff = math.sqrt((len(databaseqtags) - len(userqtags))*(len(databaseqtags) - len(userqtags)))
+            if diff > success:
+                tocheck = diff - success
+            else:
+                tocheck = success - diff
+            if tocheck < metricno or (success < diff and metricno != 0 and success > 0):
+                metricno = tocheck  # keep the candidate whose score gap is smallest so far
+                greatestmetric = metric
+        print metricno
+        print greatestmetric
+        return greatestmetric
+
+    def extractKeywords(self, question):
+        self.tomatchwords = []
+        print question
+        for tag in question:  # question is a list of (word, POS tag) pairs
+            if tag[1] == "NN" or tag[1] == "JJ" or tag[1] == "NNP":  # keep nouns and adjectives as keywords
+                self.tomatchwords.append(tag[0])
+        print self.tomatchwords
+        return self.tomatchwords
+
+    def reportAnswer(self):
+        self.file = open("./FAQ/sugar.txt", "rb")
+        content = self.file.read()
+        parts = content.split(self.mostprobableQ)  # split the FAQ on the matched question line
+        self.file.close()
+        return parts[1].split('\n')[1]
+
+
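As a rough illustration of how the two classes above are meant to fit together, here is a hypothetical driver script that is not part of this commit. It assumes Python 2 with NLTK and its punkt and POS-tagger data installed, and a ./FAQ/sugar.txt whose questions start with 'Q.' and are each followed by their answer on the next line; the module name libRLP and the sample input text are the only things taken from the commit, everything else is invented.

# hypothetical driver, not part of this commit -- assumes libRLP.py is importable,
# NLTK data (punkt tokenizer, POS tagger) is installed, and ./FAQ/sugar.txt exists
import libRLP

text = "Sugar is an activity-based desktop. What is a Sugar activity?"  # invented sample input
ib = libRLP.InputBreaker(text)                       # keeps the POS-tagged sentences that look like questions
if ib.qEntitySentences:
    matcher = libRLP.Match(ib.qEntitySentences)      # scores FAQ 'Q.' lines by keyword overlap
    if matcher.mostprobableQ:
        print matcher.reportAnswer()                 # line following the best-matching FAQ question
else:
    print "no question-like sentence found"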