Skip to content

Commit

Permalink
Feat/wiki parser RU (#114)
Browse files: browse the repository at this point in the history
* update

* codestyle

* add language parameter

* fix: language arg

* fix: language arg and revert generative in docker-compose

* fix tests

* codestyle

* fix: tests for ru

* fix: language value

* fix: ru test results

* fix: test pipe

* fix: sort types_2hop

* fix: black codestyle

* fix: tests for en wiki

* fix: quotes

* fix: codestyle

* fix: sort objects

* fix: test for wiki parser

* fix: codestyle

Co-authored-by: dilyararimovna <[email protected]>
  • Loading branch information
dmitrijeuseew and dilyararimovna committed Jun 28, 2022
1 parent e05b67d commit 1da2d6d
Show file tree
Hide file tree
Showing 7 changed files with 105 additions and 19 deletions.
4 changes: 3 additions & 1 deletion annotators/entity_linking_rus/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ def respond():
st_time = time.time()
inp = request.json
entity_substr_batch = inp.get("entity_substr", [[""]])
entity_tags_batch = inp.get("entity_tags", [""])
entity_tags_batch = inp.get(
"entity_tags", [["" for _ in entity_substr_list] for entity_substr_list in entity_substr_batch]
)
context_batch = inp.get("context", [[""]])
opt_context_batch = []
for entity_substr_list, hist_utt in zip(entity_substr_batch, context_batch):
Expand Down
2 changes: 2 additions & 0 deletions annotators/wiki_parser/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@ ARG CONFIG
ARG COMMIT
ARG PORT
ARG SRC_DIR
ARG LANGUAGE

ENV CONFIG=$CONFIG
ENV PORT=$PORT
ENV COMMIT=$COMMIT
ENV LANGUAGE=$LANGUAGE

COPY ./annotators/wiki_parser/requirements.txt /src/requirements.txt
RUN pip install -r /src/requirements.txt
Expand Down
105 changes: 91 additions & 14 deletions annotators/wiki_parser/test_wiki_parser.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,36 @@
import os
import requests


if os.getenv("LANGUAGE", "EN") == "RU":
lang = "@ru"
else:
lang = "@en"


def main():
url = "http://0.0.0.0:8077/model"

request_data = [
request_data_en = [
{
"parser_info": ["find_top_triplets"],
"query": [[{"entity_substr": "Jürgen Schmidhuber", "entity_ids": ["Q92735"]}]],
"query": [[{"entity_substr": "Jurgen Schmidhuber", "entity_ids": ["Q92735"]}]],
}
]

gold_results = [
request_data_ru = [
{
"parser_info": ["find_top_triplets"],
"query": [[{"entity_substr": "Юрген Шмидхубер", "entity_ids": ["Q92735"]}]],
}
]
gold_results_en = [
[
{
"animals_skill_entities_info": {},
"entities_info": {
"Jürgen Schmidhuber": {
"age": 58,
"Jurgen Schmidhuber": {
"age": 59,
"conf": 1.0,
"country of sitizenship": [["Q183", "Germany"]],
"date of birth": [['"+1963-01-17^^T"', "17 January 1963"]],
"entity_label": "Jürgen Schmidhuber",
Expand All @@ -27,23 +41,86 @@ def main():
["Q82594", "computer scientist"],
],
"plain_entity": "Q92735",
"pos": 0,
"token_conf": 1.0,
"types_2hop": [
["Q14565186", "cognitive scientist"],
["Q15976092", "artificial intelligence researcher"],
["Q1622272", "university teacher"],
["Q28640", "profession"],
["Q3400985", "academic"],
["Q37226", "teacher"],
["Q41835716", "faculty member"],
["Q5", "human"],
["Q66666607", "academic profession"],
["Q82594", "computer scientist"],
["Q901", "scientist"],
],
}
},
"topic_skill_entities_info": {},
"utt_num": 0,
"wiki_skill_entities_info": {},
}
]
]
gold_results_ru = [
[
{
"animals_skill_entities_info": {},
"entities_info": {
"Юрген Шмидхубер": {
"age": 59,
"conf": 1.0,
"country of sitizenship": [["Q183", "Германия"]],
"date of birth": [['"+1963-01-17^^T"', "17 January 1963"]],
"entity_label": "Шмидхубер, Юрген",
"instance of": [["Q5", "человек"]],
"occupation": [
["Q15976092", "исследователь искусственного интеллекта"],
["Q1622272", "преподаватель университета"],
["Q82594", "специалист в области информатики"],
],
"plain_entity": "Q92735",
"pos": 0,
"token_conf": 1.0,
"types_2hop": [
["Q15976092", "исследователь искусственного интеллекта"],
["Q1622272", "преподаватель университета"],
["Q28640", "профессия"],
["Q3400985", "научный работник"],
["Q37226", "учитель"],
["Q41835716", "преподаватель"],
["Q5", "человек"],
["Q66666607", "академическая профессия"],
["Q82594", "специалист в области информатики"],
["Q901", "учёный"],
],
}
},
"topic_skill_entities_info": {},
"utt_num": 0,
"wiki_skill_entities_info": {},
}
]
]

count = 0
for data, gold_result in zip(request_data, gold_results):
result = requests.post(url, json=data).json()
if result == gold_result:
count += 1
else:
print(f"Got {result}, but expected: {gold_result}")

if count == len(request_data):
if lang == "@ru":
for data, gold_result in zip(request_data_ru, gold_results_ru):
result = requests.post(url, json=data).json()
if result == gold_result:
count += 1
assert count == len(request_data_ru), print(f"Got {result}, but expected: {gold_result}")

print("Success")
elif lang == "@en":
for data, gold_result in zip(request_data_en, gold_results_en):
result = requests.post(url, json=data).json()
if result == gold_result:
count += 1
assert count == len(request_data_en), print(f"Got {result}, but expected: {gold_result}")

print("Success")


Expand Down
8 changes: 6 additions & 2 deletions annotators/wiki_parser/wiki_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,11 @@
"statement": "http://ws",
}
max_comb_num = 1e6
lang = "@en"

if os.getenv("LANGUAGE", "EN") == "RU":
lang = "@ru"
else:
lang = "@en"
wiki_filename = "/root/.deeppavlov/downloads/wikidata/wikidata_lite.hdt"
document = HDTDocument(wiki_filename)
USE_CACHE = True
Expand Down Expand Up @@ -363,7 +367,7 @@ def find_objects_info(objects, num_objects=25):
obj_label = find_label(obj, "")
if obj_label and obj_label not in {"Not Found", "anonymous"}:
objects_info.append((obj, obj_label))
return objects_info
return sorted(objects_info)


def find_intersection(entity1, entity2, rel, direction):
Expand Down
1 change: 1 addition & 0 deletions assistant_dists/dream_russian/docker-compose.override.yml
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ services:
PORT: 8077
SRC_DIR: annotators/wiki_parser
COMMIT: ff5b156d16a949c3ec99da7fb60ae907dec37a41
LANGUAGE: RU
context: ./
dockerfile: annotators/wiki_parser/Dockerfile
command: flask run -h 0.0.0.0 -p 8077
Expand Down
2 changes: 1 addition & 1 deletion assistant_dists/dream_russian/pipeline_conf.json
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@
"timeout": 1,
"url": "http://entity-detection:8103/respond"
},
"dialog_formatter": "state_formatters.dp_formatters:preproc_last_human_utt_dialog",
"dialog_formatter": "state_formatters.dp_formatters:entity_detection_formatter_dialog",
"response_formatter": "state_formatters.dp_formatters:simple_formatter_service",
"state_manager_method": "add_annotation",
"required_previous_services": [
Expand Down
2 changes: 1 addition & 1 deletion common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1117,7 +1117,7 @@ def get_common_tokens_in_lists_of_strings(list_of_strings_0, list_of_strings_1):
return common_substrings


SYMBOLS_EXCEPT_LETTERS_AND_DIGITS = re.compile(r"[^a-zA-Z0-9\-_ ]")
SYMBOLS_EXCEPT_LETTERS_AND_DIGITS = re.compile(r"[^a-zA-Zа-яА-ЯёЁ0-9\-_ ]")
DOUBLE_SPACES = re.compile(r"\s+")


Expand Down

0 comments on commit 1da2d6d

Please sign in to comment.