forked from corinnelhh/chatbot
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrainbot.py
More file actions
158 lines (135 loc) · 5.43 KB
/
trainbot.py
File metadata and controls
158 lines (135 loc) · 5.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import ast
import os
import pdb
import random
import sys

import nltk
from nltk import pos_tag
from nltk.tokenize import wordpunct_tokenize
class Trainbot(object):
    """Builds n-gram and part-of-speech lexicons from a plain-text corpus.

    The lexicons are written to (and re-loaded from) a directory named
    after the training file, stored as repr'd Python dicts:

    * ``bi_lexicon``:  word -> list of words observed following it
    * ``tri_lexicon``: "w1 w2" pair -> list of words observed following it
    * ``word_pos``:    word -> list of POS tags observed for it
    * ``pos_word``:    POS tag -> list of words observed with that tag
    """

    def __init__(self, training_file='Doctorow.txt'):
        self.training_file = training_file
        self.tri_lexicon = {}
        self.bi_lexicon = {}
        self.word_pos = {}
        self.pos_word = {}
        # Sentence-final punctuation survives filtering so sentence
        # boundaries are preserved in the n-grams.
        self.stop_puncts = ['.', '!', '?']
        # Tokens dropped before n-gram building: plain punctuation, fused
        # punctuation pairs produced by wordpunct_tokenize, and the
        # abbreviations "Mr"/"Mrs".
        self.puncts = [
            ',', ';', ':', '"', "'",
            '--', ",?", '."', ',"',
            "Mr", "Mrs", '?"', '-'
        ]

    def parse_training_input(self, text):
        """Yield lists of tokens from file object *text*, ~20 KB at a time.

        NOTE(review): a fixed-size read can split a word across two
        chunks, yielding two half-tokens at the boundary — confirm this
        noise is acceptable for the corpus sizes used.
        """
        while True:
            chunk = text.read(20048)
            if not chunk:
                break
            yield wordpunct_tokenize(chunk)

    def remove_non_final_punctuation(self, our_list):
        """Strip non-sentence-final punctuation tokens from *our_list*.

        Mutates and returns the same list; ``self.stop_puncts`` are kept.
        """
        # Iterate over a copy so in-place removal doesn't skip elements.
        for token in our_list[:]:
            if token in self.puncts:
                our_list.remove(token)
        return our_list

    def tag_input(self, our_string):
        """Tokenize *our_string* and return a list of (word, tag) tuples."""
        return pos_tag(wordpunct_tokenize(our_string))

    def _pos_lexicons(self):
        """Creates two pos dictionaries.

        One maps every word to a list of all its observed parts of speech
        (e.g. [noun, verb, noun, noun], implying it is a noun more often
        than a verb); the other is the same mapping inverted.
        """
        # 'with' guarantees the corpus file is closed (the original
        # leaked the handle).
        with open(self.training_file) as corpus:
            print("opened PART OF SPEECH DICT")
            counter = 0
            for words in self.parse_training_input(corpus):
                for word, pos in pos_tag(words):
                    counter += 1
                    self.word_pos.setdefault(word, []).append(pos)
                    self.pos_word.setdefault(pos, []).append(word)
                # Progress report, once per ~20 KB chunk.
                print("Building dict..." + str(counter))
        print("Done with POS DICT")

    def _fill_lexicon(self):
        """Build the bi-gram and tri-gram lexicons from the training file.

        Bug fix: the original nested the tri-gram membership test inside
        the bi-gram one, so a brand-new word pair whose first word had
        already been seen hit the else branch and raised KeyError.
        """
        with open(self.training_file) as corpus:
            for words in self.parse_training_input(corpus):
                words = self.remove_non_final_punctuation(words)
                # words[idx] and words[idx + 1] are the two tokens that
                # precede word == words[idx + 2].
                for idx, word in enumerate(words[2:]):
                    first_word = str(words[idx])
                    second_word = str(words[idx + 1])
                    word_pair = "{} {}".format(first_word, second_word)
                    self.bi_lexicon.setdefault(first_word, []).append(second_word)
                    self.tri_lexicon.setdefault(word_pair, []).append(word)

    def generate_pos_dict(self, prefix):
        """Build the POS lexicons and write them under *prefix*/ as repr'd dicts.

        Bug fix: the original wrote the module-level ``tb`` instance's
        dicts instead of ``self``'s, so this method only worked when run
        from this file's __main__ block; output files also leaked.
        """
        self._pos_lexicons()
        for name, lexicon in (('word_pos', self.word_pos),
                              ('pos_word', self.pos_word)):
            path = "%s/%s_%s_dict.txt" % (prefix, prefix, name)
            with open(path, 'w') as out:
                out.write(str(lexicon))

    def generate_gram_dict(self, prefix):
        """Build the n-gram lexicons and write them under *prefix*/.

        Bug fix: same ``tb`` -> ``self`` repair as generate_pos_dict, and
        the output files are now closed deterministically.
        """
        self._fill_lexicon()
        for name, lexicon in (('bi_gram', self.bi_lexicon),
                              ('tri_gram', self.tri_lexicon)):
            path = "%s/%s_%s_dict.txt" % (prefix, prefix, name)
            with open(path, 'w') as out:
                out.write(str(lexicon))

    def generate_all_dicts(self):
        """Generate every lexicon file, then move the corpus into its dir.

        NOTE(review): the prefix strips the last four characters of the
        filename, i.e. it assumes a three-letter extension like ".txt" —
        confirm callers never pass other extensions.
        """
        prefix = str(self.training_file)[:-4]
        if not os.path.exists(prefix):
            os.makedirs(prefix)
        self.generate_gram_dict(prefix)
        self.generate_pos_dict(prefix)
        os.rename(self.training_file, "%s/%s" % (prefix, self.training_file))

    def reformat_dict(self, dict_):
        """Collapse each value list into (count, item) tuples, sorted so
        the most frequent item comes first. Mutates and returns *dict_*.
        """
        for key, values in dict_.items():
            counts = {}
            for item in values:
                counts[item] = counts.get(item, 0) + 1
            # (count, item) tuples sort by frequency; ties break on item,
            # descending — same ordering as the original two-pass version.
            dict_[key] = sorted(
                ((count, item) for item, count in counts.items()),
                reverse=True)
        return dict_

    def load_lexicons(self):
        """Load cached lexicons from disk, generating them first if absent.

        Returns True when cached files existed and were loaded, False when
        they had to be generated (in which case nothing is loaded into
        memory yet, matching the original behavior).
        """
        prefix = str(self.training_file)[:-4]
        print(prefix)
        if not os.path.exists(prefix):
            self.generate_all_dicts()
            return False

        def _read(name):
            # Helper: read one repr'd-dict file back into a dict.
            path = "%s/%s_%s_dict.txt" % (prefix, prefix, name)
            with open(path, 'r') as fh:
                # literal_eval instead of eval: these files hold repr'd
                # dicts and must never execute arbitrary code.
                return ast.literal_eval(fh.read())

        self.bi_lexicon = _read('bi_gram')
        self.tri_lexicon = _read('tri_gram')
        self.word_pos = _read('word_pos')
        self.pos_word = _read('pos_word')
        return True
if __name__ == '__main__':
    # Train (or load) lexicons for the corpus named on the command line.
    if len(sys.argv) < 2:
        # Fix: the original crashed with an opaque IndexError when run
        # without an argument; fail with a usage message instead.
        sys.exit("usage: python trainbot.py <training_file>")
    tb = Trainbot(sys.argv[1])
    tb.load_lexicons()
    print("Done!")