Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/libRLP.py
blob: 4d66954199c50b987433a84cf83dac39bec210d8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
'''
Library for natural-language processing: splits input text into
sentences and categorizes them as questions using NLTK.
'''
import nltk
from nltk import tokenize

import math
import os

class InputBreaker:
    """Split raw text into sentences and keep the ones that look like
    questions about an entity, based on a simple POS-pattern heuristic."""

    def __init__(self, text):
        # Sentences classified as plain statements (not populated yet).
        self.fSentences = []
        # POS-tagged sentences matching the entity-question patterns.
        self.qEntitySentences = []
        self.analyzeText(text)

    def analyzeText(self, text):
        """Sentence-split *text* with the Punkt tokenizer and classify
        each sentence via determineSentenceType."""
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        for sentence in tokenizer.tokenize(text):
            self.determineSentenceType(sentence)

    def determineSentenceType(self, sentence):
        """POS-tag *sentence* and record it when it matches either
        pattern: starts NNP / ends PRP, or starts WP / ends NN."""
        words = nltk.word_tokenize(sentence)
        tagit = nltk.pos_tag(words)
        if not tagit:
            # Guard: an empty sentence would make tagit[0] raise IndexError.
            return
        if (tagit[0][1] == 'NNP' and tagit[-1][1] == 'PRP') or \
           (tagit[0][1] == 'WP' and tagit[-1][1] == 'NN'):
            self.qEntitySentences.append(tagit)
             

class Match:
    """Match POS-tagged user questions against the FAQ file and select
    the most probable FAQ question (lines starting with 'Q.')."""

    def __init__(self, questions):
        # Handle kept as an attribute for backward compatibility; each
        # findOptimalQuestion call re-opens the file itself.
        self.file = open("./FAQ/sugar.txt", "rb")
        self.mostprobableQ = ""
        self.tomatchwords = []
        for question in questions:
            self.findOptimalQuestion(question)

    def findOptimalQuestion(self, question):
        """Scan the FAQ for 'Q.' lines sharing keywords with *question*
        and store the best candidate in self.mostprobableQ."""
        # Re-open for every question: a previous call has already
        # consumed and closed the handle opened in __init__.
        self.file = open("./FAQ/sugar.txt", "rb")
        listofmetricsforsentence = []
        self.tomatchwords = self.extractKeywords(question)
        for line in self.file.readlines():
            linetosave = line
            line = line.lower()
            # NOTE(review): file is opened in binary mode; under Python 3
            # this compares bytes to str and never matches — the script
            # targets Python 2, where bytes and str are the same type.
            if line[0:2] == 'q.':
                successmeter = 0
                for word in self.tomatchwords:
                    if line.find(word) != -1:
                        successmeter += 1
                if successmeter > 0:
                    listofmetricsforsentence.append(
                        [self.tomatchwords, successmeter, linetosave])
        best = self.doMath(listofmetricsforsentence)
        # Guard: with no matching candidate doMath returns [], and
        # indexing [2] would raise IndexError.
        self.mostprobableQ = best[2] if best else ""
        print(self.mostprobableQ)
        self.file.close()

    def doMath(self, metrics):
        """Return the metric [keywords, hit-count, raw line] whose
        keyword-count difference vs. the user question scores lowest."""
        greatestmetric = []
        metricno = 10  # initial threshold a candidate must beat
        for metric in metrics:
            userqtags = metric[0]
            success = metric[1]
            # Tag the FAQ question text that follows the 'Q.' prefix.
            # assumes the stored line really contains 'Q.' — TODO confirm
            # against the FAQ file format.
            words = nltk.word_tokenize(metric[2].split('Q.')[1])
            databaseqtags = self.extractKeywords(nltk.pos_tag(words))
            # Difference in keyword counts between DB and user question.
            diff = abs(len(databaseqtags) - len(userqtags))
            tocheck = abs(diff - success)
            if tocheck < metricno or (success < diff and metricno != 0 and success > 0):
                metricno = tocheck
                greatestmetric = metric
        return greatestmetric

    def extractKeywords(self, question):
        """Return the words tagged NN, JJ or NNP from a POS-tagged
        question (a sequence of (word, tag) pairs)."""
        self.tomatchwords = [word for word, tag in question
                             if tag in ("NN", "JJ", "NNP")]
        return self.tomatchwords

    def reportAnswer(self):
        """Return the line immediately following the matched question
        in the FAQ file."""
        with open("./FAQ/sugar.txt", "rb") as faq:
            content = faq.read()
        # 'parts' instead of 'tuple' — avoid shadowing the builtin.
        parts = content.split(self.mostprobableQ)
        return parts[1].split('\n')[1]