'''
Natural-language-processing helpers built on NLTK: sentence splitting,
POS tagging, and matching of user questions against an FAQ file.
'''
import nltk
from nltk import tokenize
import math
import os
class InputBreaker:
    """Split raw input text into sentences and POS-tag each one.

    Every tagged sentence is appended to ``self.qEntitySentences``
    (a list of ``[(word, tag), ...]`` lists).
    """

    def __init__(self, text):
        # NOTE(review): fSentences is never populated here — presumably a
        # placeholder for non-question sentences; confirm before removing.
        self.fSentences = []
        self.qEntitySentences = []
        self.analyzeText(text)
        print(self.qEntitySentences)

    def analyzeText(self, text):
        """Lower-case *text*, split it into sentences, and classify each one."""
        text = text.lower()
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        for sentence in tokenizer.tokenize(text):
            self.determineSentenceType(sentence)

    def determineSentenceType(self, sentence):
        """POS-tag *sentence* and record the tagged tokens.

        The disabled heuristic below once tried to keep only likely
        question sentences; currently every sentence is recorded.
        """
        tagit = nltk.pos_tag(nltk.word_tokenize(sentence))
        print(tagit)
        # Original (disabled) question-detection heuristic, kept for reference:
        # if (tagit[0][1] == 'NN*' and (tagit[-1][1] == 'PRP' or
        #         tagit[-2][1] == 'PRP')) or ((tagit[0][1] == 'NN' or
        #         tagit[0][1] == 'WP') and (tagit[-1][1] == 'NN' or
        #         tagit[-2][1] == 'NN')):
        self.qEntitySentences.append(tagit)
class Match:
    """Match user questions against the FAQ file and remember the best hit.

    ``questions`` is a list of POS-tagged sentences (``[(word, tag), ...]``).
    The best-matching FAQ question line is kept in ``self.mostprobableQ``;
    ``reportAnswer()`` returns the answer text that follows it.
    """

    def __init__(self, questions):
        print(questions)
        # Read the FAQ once up front. The original kept an open handle that
        # was exhausted by readlines() and then closed inside
        # findOptimalQuestion, so every question after the first failed.
        with open("FAQ/sugar.txt", "r") as faq:
            self.faqLines = faq.readlines()
        self.mostprobableQ = ""
        self.tomatchwords = []
        for question in questions:
            self.findOptimalQuestion(question)

    def findOptimalQuestion(self, question):
        """Score every 'Q.' line of the FAQ against *question*'s keywords."""
        candidates = []
        self.tomatchwords = self.extractKeywords(question)
        for rawline in self.faqLines:
            line = rawline.lower()
            if not line.startswith('q.'):
                continue
            hits = sum(1 for word in self.tomatchwords if word in line)
            if hits > 0:
                # [keywords, hit-count, original-case FAQ line]
                candidates.append([self.tomatchwords, hits, rawline])
        best = self.doMath(candidates)
        # Guard: the original crashed with IndexError when nothing matched.
        if best:
            self.mostprobableQ = best[2]
        print(self.mostprobableQ)

    def doMath(self, metrics):
        """Return the candidate metric with the best keyword-coverage score.

        The score is ``hits - len(user_keywords)`` (0 means every keyword
        matched); returns ``[]`` when *metrics* is empty.
        """
        greatestmetric = []
        bestscore = -10  # below any reachable score for small keyword lists
        for metric in metrics:
            userqtags, success, rawline = metric
            # Tag the FAQ question text (everything after the 'Q.' prefix).
            words = nltk.word_tokenize(rawline.split('Q.')[1])
            tagit = nltk.pos_tag(words)
            # Side effect kept from the original: resets self.tomatchwords.
            self.extractKeywords(tagit)
            score = success - len(userqtags)
            if score > bestscore:
                bestscore = score
                greatestmetric = metric
        return greatestmetric

    def extractKeywords(self, question):
        """Collect the words tagged NN, NNP or JJ from a (word, tag) sequence.

        Also stores the result in ``self.tomatchwords``.
        """
        self.tomatchwords = [word for word, tag in question
                             if tag in ("NN", "JJ", "NNP")]
        return self.tomatchwords

    def reportAnswer(self):
        """Return the answer text following the best-matching FAQ question."""
        with open("FAQ/sugar.txt", "r") as faq:
            content = faq.read()
        parts = content.split(self.mostprobableQ)
        return parts[1].split('Q.')[0]