-
Notifications
You must be signed in to change notification settings - Fork 1
/
ngram_model.py
44 lines (35 loc) · 1.32 KB
/
ngram_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import itertools
import config
import dataAccessor
class Ngrams:
    """N-grams of one sentence, for every size from 1 to config.MAX_NGRAM_SIZE.

    The sentence text is split on whitespace and prefixed with the marker
    token "_START_". For each size n the consecutive runs of n tokens are
    joined with single spaces. The resulting n-grams are handed to
    dataAccessor.DataAccessor, and whatever its get_values() returns is kept
    in ``self.values``.
    """

    def __init__(self, sentence, language):
        """Build the n-grams of *sentence* and fetch their values.

        Args:
            sentence: object whose get_content() returns the sentence text
                as a string.
            language: stored as-is and passed through to
                dataAccessor.DataAccessor (presumably a language
                identifier; confirm against DataAccessor).
        """
        self.sentence = sentence
        self.language = language
        # self.ngrams must exist before the DataAccessor call below,
        # because get_all_ngrams() reads it.
        self.ngrams = self.__build_ngrams()
        self.values = dataAccessor.DataAccessor(
            sentence, self.get_all_ngrams(), language
        ).get_values()

    def get_language(self):
        """Return the language given at construction."""
        return self.language

    def get_values(self):
        """Return the values obtained from DataAccessor.get_values()."""
        return self.values

    def get_all_ngrams(self):
        """Return one flat list of every n-gram, ordered by size then position."""
        return list(itertools.chain.from_iterable(self.ngrams))

    def get_ngrams(self, n):
        """Return the list of n-grams of size *n* (1-based).

        Raises:
            IndexError: if *n* is smaller than 1 or larger than the number
                of sizes that were built.
        """
        if n < 1:
            # n - 1 would be a negative index and silently return the
            # n-grams of a different size instead of failing.
            raise IndexError("n-gram size must be >= 1, got %r" % (n,))
        return self.ngrams[n - 1]

    def __build_ngrams(self):
        """Return a list whose element i holds the n-grams of size i + 1."""
        tokens = ["_START_"] + self.sentence.get_content().split()
        return [
            self.__build_ngrams_size_n(tokens, size)
            for size in range(1, config.MAX_NGRAM_SIZE + 1)
        ]

    @staticmethod
    def __build_ngrams_size_n(tokens, n):
        """Return every run of *n* consecutive tokens as a space-joined string.

        The result is empty when *tokens* holds fewer than *n* items.
        """
        # Number of start positions that still leave room for n tokens;
        # range() treats a non-positive count as empty.
        window_count = len(tokens) - (n - 1)
        return [
            Ngrams.__concatenate_ngram_from_tokens(start, n, tokens)
            for start in range(window_count)
        ]

    @staticmethod
    def __concatenate_ngram_from_tokens(word_position, n, tokens):
        """Join the *n* tokens starting at *word_position* with single spaces.

        The caller must ensure n >= 1 and that the run fits inside *tokens*.
        """
        return " ".join(tokens[word_position:word_position + n])