This repository has been archived by the owner on Aug 24, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
translator.py
114 lines (82 loc) · 3.76 KB
/
translator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""Sockeye model loading and inference"""
import sys
import sockeye
import mxnet as mx
import sentencepiece as spm
import json
from truecaser import applytc
from log import log
from collections import namedtuple
from sockeye.translate import inference
def _preprocess(sentence, index, lang_factor, style_factor,
        models, constraints):
    """Truecase and subword-segment one sentence, then wrap it into the
    JSON input format the sockeye translator consumes.

    The JSON payload carries the segmented text plus four parallel factor
    streams (output language, output style, and two constant filler
    factors), one factor token per subword piece. If an avoid-constraint
    list exists for this sentence index, it is attached under 'avoid'.
    """
    truecased = applytc.processLine(models.truecaser, sentence)
    pieces = models.segmenter.EncodeAsPieces(truecased)
    segmented = ' '.join(pieces)
    n_pieces = len(pieces)
    payload = {
        'text': segmented,
        'factors': [
            " ".join([lang_factor] * n_pieces),
            " ".join([style_factor] * n_pieces),
            " ".join(['f0'] * n_pieces),
            " ".join(['g0'] * n_pieces),
        ],
    }
    try:
        if constraints and constraints[index]:
            payload['avoid'] = constraints[index]
    except IndexError as e:
        # Surface the mismatched constraint list before re-raising.
        sys.stderr.write(str(constraints) + ", " + str(index))
        raise e
    log("PREPROC received '" + sentence + "', turned it into '" + segmented + "'")
    return json.dumps(payload)
def _doMany(many, func, args):
return [func(one, idx, *args) for idx, one in enumerate(many)]
def _postprocess(sentence, idx, models):
    """Detokenize a raw translation and uppercase its first character.

    Parameters
    ----------
    sentence : str
        Space-separated subword pieces emitted by the translator.
    idx : int
        Sentence index (unused; kept for the ``_doMany`` calling convention).
    models : namedtuple
        Loaded model bundle; only the sentencepiece ``segmenter`` is used.

    Returns
    -------
    str
        The detokenized sentence with an uppercased first character, or the
        detokenized sentence unchanged when it is empty.
    """
    de_segmented_sentence = models.segmenter.DecodePieces(sentence.split())
    try:
        de_truecased_sentence = de_segmented_sentence[0].upper() + de_segmented_sentence[1:]
    except IndexError:
        # Fix: the original bare `except:` also swallowed KeyboardInterrupt,
        # SystemExit, etc. Only an empty string can fail the indexing above.
        de_truecased_sentence = de_segmented_sentence
    log("POSTPROC received '" + sentence + "', turned it into '" + de_truecased_sentence + "'")
    return de_truecased_sentence
def _forward(sentences, models):
    """Translate preprocessed JSON sentence strings with the loaded
    sockeye translator and return (translation, score) pairs in order."""
    trans_inputs = []
    for sent_id, json_sentence in enumerate(sentences):
        trans_inputs.append(inference.make_input_from_json_string(
            sentence_id=sent_id,
            json_string=json_sentence,
            translator=models.translator))
    results = models.translator.translate(trans_inputs)
    return [(result.translation, result.score) for result in results]
def _loadTranslator(model_folders, ctx=None):
    """Load sockeye model ensemble(s) and wrap them in an inference Translator.

    Parameters
    ----------
    model_folders : list of str
        Paths to trained sockeye model directories to ensemble.
    ctx : mxnet context, optional
        Device to run inference on. Defaults to ``mx.gpu()``, created
        lazily inside the function body. (The original signature used
        ``ctx=mx.gpu()``, which Python evaluates once at import time —
        touching the GPU even when this module is only imported, and
        sharing one eagerly-created context across all calls.)

    Returns
    -------
    inference.Translator
        A translator configured with beam size 3, linear ensembling, and
        no lexicon restriction.
    """
    if ctx is None:
        ctx = mx.gpu()
    models, source_vocabs, target_vocab = inference.load_models(
        context=ctx,
        max_input_len=None,
        beam_size=3,
        batch_size=16,
        model_folders=model_folders,
        checkpoints=None,
        softmax_temperature=None,
        max_output_length_num_stds=2,
        decoder_return_logit_inputs=False,
        cache_output_layer_w_b=False)
    return inference.Translator(context=ctx,
        ensemble_mode="linear",
        bucket_source_width=10,
        length_penalty=inference.LengthPenalty(1.0, 0.0),
        beam_prune=0,
        beam_search_stop='all',
        models=models,
        source_vocabs=source_vocabs,
        target_vocab=target_vocab,
        restrict_lexicon=None,
        store_beam=False,
        strip_unknown_words=False)
def loadModels(translationModelPath, truecaserModelPath, segmenterModelPath):
    """Load translation, truecasing and segmentation models and
    return them bundled as a 'Models' named tuple with fields
    (translator, truecaser, segmenter)."""
    # Load in the same order as before: translator, truecaser, segmenter.
    translator = _loadTranslator([translationModelPath, ])
    truecaser = applytc.loadModel(truecaserModelPath)
    segmenter = spm.SentencePieceProcessor()
    segmenter.Load(segmenterModelPath)
    Models = namedtuple("Models", ["translator", "truecaser", "segmenter"])
    return Models(translator=translator, truecaser=truecaser, segmenter=segmenter)
def translate(models, sentences, outputLanguage, outputStyle, constraints):
    """Take list of sentences, output language and style as well as a list of constraints,
    and feed them through a set of loaded NMT models.
    Return list of translations, list of scores, list of preprocessed input sentences and list of raw translations prior to postprocessing."""
    preprocessed = _doMany(
        sentences, _preprocess,
        (outputLanguage, outputStyle, models, constraints))
    scored = _forward(preprocessed, models)
    raw_translations, scores = zip(*scored)
    finished = _doMany(raw_translations, _postprocess, (models,))
    return finished, scores, preprocessed, raw_translations