-
Notifications
You must be signed in to change notification settings - Fork 1
/
autocomplete.py
126 lines (104 loc) · 4.21 KB
/
autocomplete.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# -*- coding: utf-8 -*-
"""autocomplete.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/15nory2OiHSgRzAxwo5uHRCNgAc5HEkoH
"""
import csv
import re
import string as s
from typing import Dict, List, Union

import tqdm
from fast_autocomplete import AutoComplete
from fast_autocomplete.misc import read_csv_gen
class LayoutCorrector:
    """Re-type text that was entered with the wrong keyboard layout active.

    Two position-aligned strings describe the same physical keys in the
    Latin and the Cyrillic layout; the lookup dictionaries are built by
    zipping them together.

    NOTE: the attribute names are historical and swapped relative to their
    contents -- ``rus`` holds the Latin-layout characters and ``eng`` holds
    the Cyrillic-layout characters. They are kept unchanged so existing
    callers keep working; ``ru2en_dict`` / ``en2ru_dict`` map in the
    direction their names say.
    """

    def __init__(self):
        # Latin-layout keys (backslash is escaped explicitly; the value is
        # a backslash followed by a backtick).
        self.rus = "§1234567890-=qwertyuiop[]asdfghjkl;'\\`zxcvbnm,./"
        # Cyrillic-layout characters found on the same keys, in the same order.
        self.eng = ">1234567890-=йцукенгшщзхъфывапролджэё]ячсмитьбю/"
        self.ru2en_dict = dict(zip(self.eng, self.rus))
        self.en2ru_dict = dict(zip(self.rus, self.eng))
        # Characters that exist only on the Cyrillic side of the table.
        # Digits and other keys shared by both layouts carry no information
        # about which layout was active, so they must not be counted.
        self._ru_layout_only = frozenset(self.eng) - frozenset(self.rus)

    def en2ru(self, string) -> str:
        """Map Latin-layout characters to their Cyrillic-layout counterparts.

        Characters that are not in the table (spaces, capitals, other
        punctuation) are copied through unchanged instead of raising
        ``KeyError``.
        """
        return "".join(self.en2ru_dict.get(ch, ch) for ch in string)

    def ru2en(self, string) -> str:
        """Map Cyrillic-layout characters to their Latin-layout counterparts.

        Characters that are not in the table are copied through unchanged.
        """
        return "".join(self.ru2en_dict.get(ch, ch) for ch in string)

    def isEnglish(self, string) -> bool:
        """Return ``False`` when more than half of *string* is Cyrillic-layout text.

        Only characters unique to the Cyrillic side of the table are counted
        (case-insensitively); an empty string is reported as English.
        """
        hits = sum(1 for ch in string if ch.lower() in self._ru_layout_only)
        return hits <= len(string) * 0.5
class AutoCompleter:
    """Autocomplete over the values of one CSV column.

    On construction the CSV at ``path2csv`` is read, every value of the
    ``trainColumnName`` column is normalised with ``processString`` and
    indexed (both the full phrase and each of its words) in a
    ``fast_autocomplete.AutoComplete`` instance. ``search`` (or calling the
    object) looks a query up in that index and, when nothing is found,
    retries once with the query re-typed in the other keyboard layout.
    """

    # Returned by ``search`` when nothing matches. Kept as a plain string
    # (not an empty list) because existing callers print the result directly.
    NOT_FOUND_MESSAGE = "Товаров по данному запросу не найдено"

    # One or more consecutive non-word characters (punctuation, whitespace, ...).
    _NON_WORD_RUN = re.compile(r'\W+')

    def __init__(
        self,
        path2csv: str,
        trainColumnName="Название СТЕ",
    ) -> None:
        """Build the index from column ``trainColumnName`` of the CSV at ``path2csv``."""
        self.symbols = self.getValidSymbols()
        self.trainColumnName = trainColumnName
        self.words = self.getWords(path2csv)
        self.autoComplete = self.getAutoComplete(self.words, self.symbols)
        self.corrector = LayoutCorrector()

    def getValidSymbols(self):
        """Return the characters passed to ``AutoComplete`` as ``valid_chars_for_string``."""
        return s.ascii_lowercase + "абвгдеёжзийклмнопрстуфхцчшщъыьэюя" + "1234567890"

    def processString(self, query: str, do_print=False) -> str:
        """Normalise *query*: punctuation to spaces, lowercase, trimmed.

        Every run of non-word characters becomes a single space, so the
        result never contains doubled spaces and characters such as ``@``
        are already gone after this one substitution. Leading and trailing
        spaces are stripped so that trailing punctuation does not leave a
        dangling space.

        Args:
            query: raw text.
            do_print: when true, print the normalised string.

        Returns:
            The normalised string (possibly empty).
        """
        output_string = self._NON_WORD_RUN.sub(' ', query).lower().strip()
        if do_print:
            print(output_string)
        return output_string

    def getWords(self, path: str) -> Dict[str, dict]:
        """Read the CSV at *path* and collect the entries to index.

        Returns:
            A dict whose keys are every normalised phrase and every single
            word of it; each value is an empty dict.

        Raises:
            KeyError: if a row has no ``trainColumnName`` column.
        """
        csv_gen = read_csv_gen(path, csv_func=csv.DictReader)
        words = {}
        for line in tqdm.tqdm(csv_gen, desc="processing dataset"):
            raw_value = line[self.trainColumnName]
            if not raw_value:
                # Empty or missing cell: nothing to index.
                continue
            processed_string = self.processString(raw_value)
            if not processed_string:
                # The cell held punctuation/whitespace only.
                continue
            # split() without arguments never yields empty strings; the
            # words are added last-to-first, then the whole phrase.
            for word in reversed(processed_string.split()):
                words.setdefault(word, {})
            words.setdefault(processed_string, {})
        return words

    def getAutoComplete(self, words, symbols):
        """Create the ``AutoComplete`` index for *words*, accepting *symbols* as valid characters."""
        return AutoComplete(
            words=words,
            valid_chars_for_string=symbols
        )

    def _lookup(self, query: str, size, max_cost) -> List[str]:
        """Run one index search and keep the first element of each result entry."""
        matches = self.autoComplete.search(
            word=self.processString(query),
            size=size,
            max_cost=max_cost
        )
        return [item[0] for item in matches]

    def _swapLayout(self, query: str):
        """Return *query* re-typed in the other keyboard layout, or ``None``.

        The raw query is used (not the normalised one) because punctuation
        keys such as ``;`` or ``,`` stand for letters in the other layout.
        It is lowercased and translated token by token because the layout
        table only contains lowercase keys and no whitespace.
        """
        lowered = query.lower()
        if self.corrector.isEnglish(lowered):
            convert = self.corrector.en2ru
        else:
            convert = self.corrector.ru2en
        try:
            swapped = " ".join(convert(token) for token in lowered.split())
        except KeyError:
            # A character the layout table cannot translate: no retry.
            return None
        if not swapped or swapped == " ".join(lowered.split()):
            # Nothing changed, so searching again would give the same result.
            return None
        return swapped

    def search(self, query: str, size=5, max_cost=2, translated=False) -> Union[List[str], str]:
        """Look *query* up in the index.

        Args:
            query: text typed by the user.
            size: forwarded to ``AutoComplete.search``.
            max_cost: forwarded to ``AutoComplete.search``.
            translated: when true, the keyboard-layout retry is skipped.

        Returns:
            A list of suggestions, or ``NOT_FOUND_MESSAGE`` (a string) when
            neither the query nor its layout-swapped form matched.
        """
        output = self._lookup(query, size, max_cost)
        if not output and not translated:
            swapped = self._swapLayout(query)
            if swapped is not None:
                output = self._lookup(swapped, size, max_cost)
        if not output:
            return self.NOT_FOUND_MESSAGE
        return output

    def __call__(self, query: str, size=5, max_cost=2, translated=False) -> Union[List[str], str]:
        """Shorthand for ``search`` with the same arguments."""
        return self.search(query, size, max_cost, translated)
if __name__ == "__main__":
    import sys

    # Interactive demo: the CSV path may be given as the first command-line
    # argument; without one the original hard-coded location is used.
    csv_path = sys.argv[1] if len(sys.argv) > 1 else r'C:\Users\alina\Downloads\СТЕ_Иркутск.csv'
    completer = AutoCompleter(csv_path)
    try:
        while True:
            print(completer(input("search...")))
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C ends the session without a traceback.
        print()