'''
Natural-language-processing helpers built on NLTK: sentence splitting,
POS tagging, and matching of user questions against an FAQ file.
'''
import nltk
from nltk import tokenize
import math
import os
class InputBreaker:
    """Split raw input text into sentences and POS-tag each one.

    Every tagged sentence is appended to ``self.qEntitySentences``
    (a list of ``[(word, tag), ...]`` lists).
    """

    def __init__(self, text):
        # NOTE(review): fSentences is never populated here — presumably a
        # placeholder for non-question sentences; confirm before removing.
        self.fSentences = []
        self.qEntitySentences = []
        self.analyzeText(text)
        print(self.qEntitySentences)

    def analyzeText(self, text):
        """Lower-case *text*, split it into sentences, and classify each one."""
        text = text.lower()
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        for sentence in tokenizer.tokenize(text):
            self.determineSentenceType(sentence)

    def determineSentenceType(self, sentence):
        """POS-tag *sentence* and record the tagged tokens.

        The disabled heuristic below once tried to keep only likely
        question sentences; currently every sentence is recorded.
        """
        tagit = nltk.pos_tag(nltk.word_tokenize(sentence))
        print(tagit)
        # Original (disabled) question-detection heuristic, kept for reference:
        # if (tagit[0][1] == 'NN*' and (tagit[-1][1] == 'PRP' or
        #         tagit[-2][1] == 'PRP')) or ((tagit[0][1] == 'NN' or
        #         tagit[0][1] == 'WP') and (tagit[-1][1] == 'NN' or
        #         tagit[-2][1] == 'NN')):
        self.qEntitySentences.append(tagit)
class Match:
    """Match user questions against the FAQ file and remember the best hit.

    ``questions`` is a list of POS-tagged sentences (``[(word, tag), ...]``).
    The best-matching FAQ question line is kept in ``self.mostprobableQ``;
    ``reportAnswer()`` returns the answer text that follows it.
    """

    def __init__(self, questions):
        print(questions)
        # Read the FAQ once up front. The original kept an open handle that
        # was exhausted by readlines() and then closed inside
        # findOptimalQuestion, so every question after the first failed.
        with open("FAQ/sugar.txt", "r") as faq:
            self.faqLines = faq.readlines()
        self.mostprobableQ = ""
        self.tomatchwords = []
        for question in questions:
            self.findOptimalQuestion(question)

    def findOptimalQuestion(self, question):
        """Score every 'Q.' line of the FAQ against *question*'s keywords."""
        candidates = []
        self.tomatchwords = self.extractKeywords(question)
        for rawline in self.faqLines:
            line = rawline.lower()
            if not line.startswith('q.'):
                continue
            hits = sum(1 for word in self.tomatchwords if word in line)
            if hits > 0:
                # [keywords, hit-count, original-case FAQ line]
                candidates.append([self.tomatchwords, hits, rawline])
        best = self.doMath(candidates)
        # Guard: the original crashed with IndexError when nothing matched.
        if best:
            self.mostprobableQ = best[2]
        print(self.mostprobableQ)

    def doMath(self, metrics):
        """Return the candidate metric with the best keyword-coverage score.

        The score is ``hits - len(user_keywords)`` (0 means every keyword
        matched); returns ``[]`` when *metrics* is empty.
        """
        greatestmetric = []
        bestscore = -10  # below any reachable score for small keyword lists
        for metric in metrics:
            userqtags, success, rawline = metric
            # Tag the FAQ question text (everything after the 'Q.' prefix).
            words = nltk.word_tokenize(rawline.split('Q.')[1])
            tagit = nltk.pos_tag(words)
            # Side effect kept from the original: resets self.tomatchwords.
            self.extractKeywords(tagit)
            score = success - len(userqtags)
            if score > bestscore:
                bestscore = score
                greatestmetric = metric
        return greatestmetric

    def extractKeywords(self, question):
        """Collect the words tagged NN, NNP or JJ from a (word, tag) sequence.

        Also stores the result in ``self.tomatchwords``.
        """
        self.tomatchwords = [word for word, tag in question
                             if tag in ("NN", "JJ", "NNP")]
        return self.tomatchwords

    def reportAnswer(self):
        """Return the answer text following the best-matching FAQ question."""
        with open("FAQ/sugar.txt", "r") as faq:
            content = faq.read()
        parts = content.split(self.mostprobableQ)
        return parts[1].split('Q.')[0]