author    Vamsi Krishna Davuluri <iwikiwi@huecomundo.themachineninja.org>    2009-10-20 17:26:21 (GMT)
committer Vamsi Krishna Davuluri <iwikiwi@huecomundo.themachineninja.org>    2009-10-20 17:26:21 (GMT)
commit    30f1255f8b90f40ce295453e91820d7a8f33faa2 (patch)
tree      6b19c535a5c878c292c37b8b024d28a5fc21d324 /libRLP.py
NLTK version of the lib, plus a badly written IRC testbot and a test file.
Diffstat (limited to 'libRLP.py')
-rw-r--r--    libRLP.py    102
1 file changed, 102 insertions, 0 deletions
diff --git a/libRLP.py b/libRLP.py
new file mode 100644
index 0000000..4d66954
--- /dev/null
+++ b/libRLP.py
@@ -0,0 +1,102 @@
+'''
+Library for natural language processing and for categorizing
+information using NLTK.
+'''
+import nltk
+from nltk import tokenize
+
+import math
+import os
+
+class InputBreaker:  # splits raw text into sentences and keeps the question-like ones
+    def __init__(self, text):
+        self.fSentences = []
+        self.qEntitySentences = []  # POS-tagged sentences that look like questions
+        self.analyzeText(text)
+        #print self.qEntitySentences
+
+    def analyzeText(self, text):
+        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')  # Punkt sentence splitter
+        sents = tokenizer.tokenize(text)
+        for sentence in sents:
+            self.determineSentenceType(sentence)
+
+    def determineSentenceType(self, sentence):
+        words = nltk.word_tokenize(sentence)
+        tagit = nltk.pos_tag(words)  # [(word, POS tag), ...]
+        #print tagit
+        if (tagit[0][1] == 'NNP' and tagit[-1][1] == 'PRP') or (tagit[0][1] == 'WP' and tagit[-1][1] == 'NN'):
+            self.qEntitySentences.append(tagit)  # crude question heuristic based on first and last POS tags
+
+
+class Match:  # matches tagged user questions against the FAQ file
+    def __init__(self, questions):
+        #print questions
+        self.file = open("./FAQ/sugar.txt", "rb")  # FAQ entries; questions start with 'Q.'
+        self.mostprobableQ = ""
+        self.tomatchwords = []
+        for question in questions:
+            self.findOptimalQuestion(question)
+    def findOptimalQuestion(self, question):
+        self.file = open("./FAQ/sugar.txt", "rb")  # reopen so every question scans the full FAQ
+
+        listofmetricsforsentence = []
+        self.tomatchwords = self.extractKeywords(question)  # keywords from the user question
+        #print self.tomatchwords
+        for line in self.file.readlines():
+            linetosave = line
+            line = line.lower()
+
+            if line[0:2] == 'q.':  # only FAQ question lines are scored
+                successmeter = 0
+                for word in self.tomatchwords:
+                    if line.find(word) != -1:  # count keyword hits in this FAQ question
+                        successmeter += 1
+                    else:
+                        pass
+                if successmeter > 0:
+                    listofmetricsforsentence.append([self.tomatchwords, successmeter, linetosave, ])
+
+        #print listofmetricsforsentence
+        self.mostprobableQ = self.doMath(listofmetricsforsentence)[2]  # best-scoring FAQ question line
+        print self.mostprobableQ
+        self.file.close()
+
+    def doMath(self, metrics):
+        greatestmetric = []
+        metricno = 10  # running best (lowest) score
+        for metric in metrics:
+            userqtags = metric[0]
+            success = metric[1]  # number of keyword hits
+            words = nltk.word_tokenize(metric[2].split('Q.')[1])  # FAQ question text after its 'Q.' prefix
+            tagit = nltk.pos_tag(words)
+            databaseqtags = self.extractKeywords(tagit)  # keywords of the FAQ question itself
+            diff = math.sqrt((len(databaseqtags) - len(userqtags))*(len(databaseqtags) - len(userqtags)))
+            if diff > success:
+                tocheck = diff - success
+            else:
+                tocheck = success - diff
+            if tocheck < metricno or (success < diff and metricno != 0 and success > 0):
+                metricno = tocheck  # keep the candidate whose score gap is smallest so far
+                greatestmetric = metric
+        print metricno
+        print greatestmetric
+        return greatestmetric
+
+    def extractKeywords(self, question):
+        self.tomatchwords = []
+        print question
+        for tag in question:  # question is a list of (word, POS tag) pairs
+            if tag[1] == "NN" or tag[1] == "JJ" or tag[1] == "NNP":  # keep nouns and adjectives as keywords
+                self.tomatchwords.append(tag[0])
+        print self.tomatchwords
+        return self.tomatchwords
+
+    def reportAnswer(self):
+        self.file = open("./FAQ/sugar.txt", "rb")
+        content = self.file.read()
+        parts = content.split(self.mostprobableQ)  # split the FAQ on the matched question line
+        self.file.close()
+        return parts[1].split('\n')[1]
+
+
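As a rough illustration of how the two classes above are meant to fit together, here is a hypothetical driver script that is not part of this commit. It assumes Python 2 with NLTK and its punkt and POS-tagger data installed, and a ./FAQ/sugar.txt whose questions start with 'Q.' and are each followed by their answer on the next line; the module name libRLP and the sample input text are the only things taken from the commit, everything else is invented.

# hypothetical driver, not part of this commit -- assumes libRLP.py is importable,
# NLTK data (punkt tokenizer, POS tagger) is installed, and ./FAQ/sugar.txt exists
import libRLP

text = "Sugar is an activity-based desktop. What is a Sugar activity?"  # invented sample input
ib = libRLP.InputBreaker(text)                       # keeps the POS-tagged sentences that look like questions
if ib.qEntitySentences:
    matcher = libRLP.Match(ib.qEntitySentences)      # scores FAQ 'Q.' lines by keyword overlap
    if matcher.mostprobableQ:
        print matcher.reportAnswer()                 # line following the best-matching FAQ question
else:
    print "no question-like sentence found"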