property extraction annotator #202

Merged
Merged 50 commits from feat/property_extraction into dev on Feb 21, 2023.
Changes shown below are from 20 of the 50 commits.

Commits (50)
f38010e
property extraction
dmitrijeuseew Sep 26, 2022
edcd39d
fixes
dmitrijeuseew Sep 27, 2022
ce37842
fixes
dmitrijeuseew Sep 27, 2022
766d852
Merge remote-tracking branch 'origin/dev' into feat/property_extraction
dilyararimovna Sep 28, 2022
26eee51
fixes
dmitrijeuseew Sep 28, 2022
44ddf46
add finegrained_types to property extraction
dmitrijeuseew Sep 29, 2022
17c7534
update
dmitrijeuseew Sep 29, 2022
db192c9
fixes
dmitrijeuseew Oct 3, 2022
5c0e407
fixes
dmitrijeuseew Oct 4, 2022
f6cbead
fix plural nouns
dmitrijeuseew Oct 4, 2022
e389411
add triplets
dmitrijeuseew Oct 5, 2022
436a075
add to yml files
dmitrijeuseew Oct 6, 2022
a4906aa
fix tests
dmitrijeuseew Oct 6, 2022
9648507
entity linking input from property extraction
dmitrijeuseew Oct 8, 2022
e1f03ae
Merge branch 'dev' into feat/property_extraction
dmitrijeuseew Oct 13, 2022
55e3c73
fixes
dmitrijeuseew Oct 13, 2022
6df8689
change gpu number
dmitrijeuseew Oct 13, 2022
b235720
model on cpu
dmitrijeuseew Oct 13, 2022
16f7f6d
add entity linking to proxy.yml
dmitrijeuseew Oct 13, 2022
ef744a2
fix tests
dmitrijeuseew Oct 13, 2022
ce11678
fix dp version and property extraction to cpu
dmitrijeuseew Oct 14, 2022
1bbc1e9
remove env variables
dmitrijeuseew Oct 17, 2022
6f36573
annotate bot utterances
dmitrijeuseew Dec 14, 2022
c604088
add timeout
dmitrijeuseew Dec 15, 2022
9c98d35
add property extraction to readme
dmitrijeuseew Dec 26, 2022
9b559f6
Merge remote-tracking branch 'origin/dev' into feat/property_extraction
dmitrijeuseew Dec 29, 2022
c778559
update
dmitrijeuseew Dec 29, 2022
8b7b92f
codestyle
dmitrijeuseew Dec 29, 2022
da6efd2
fix state formatter
dmitrijeuseew Dec 30, 2022
e590473
Merge remote-tracking branch 'origin/dev' into feat/property_extraction
dmitrijeuseew Jan 13, 2023
02c4dd2
update requirements
dmitrijeuseew Jan 15, 2023
d697c5d
fix requirements
dmitrijeuseew Jan 15, 2023
def0c71
fix requirements
dmitrijeuseew Jan 15, 2023
401df87
update el requirements
dmitrijeuseew Jan 15, 2023
be2711c
Merge branch 'dev' into feat/property_extraction
dilyararimovna Jan 19, 2023
6a85675
fix: revert entity detection
dilyararimovna Jan 19, 2023
0e3e533
Merge branch 'dev' into feat/property_extraction
dilyararimovna Jan 20, 2023
bf5ee59
Merge branch 'dev' into feat/property_extraction
dilyararimovna Jan 26, 2023
b3caa34
sentence rewrite
dmitrijeuseew Jan 30, 2023
54d61a0
update
dmitrijeuseew Feb 2, 2023
b72d641
Merge branch 'dev' into feat/property_extraction
dmitrijeuseew Feb 2, 2023
dbc1604
fix typo
dmitrijeuseew Feb 3, 2023
05fe4af
fix requirements
dmitrijeuseew Feb 3, 2023
617e719
Merge branch 'dev' into feat/property_extraction
dmitrijeuseew Feb 15, 2023
896b153
fix typo
dmitrijeuseew Feb 15, 2023
01c0222
change port
dmitrijeuseew Feb 15, 2023
7e8e68e
update el
dmitrijeuseew Feb 16, 2023
f58f7ea
fix tests
dmitrijeuseew Feb 16, 2023
8c8fa95
Merge branch 'dev' into feat/property_extraction
dilyararimovna Feb 21, 2023
1db56ba
fix: readme and paths
dilyararimovna Feb 21, 2023
4 changes: 2 additions & 2 deletions annotators/entity_linking/entity_linking_eng.json
@@ -15,7 +15,7 @@
{
"class_name": "src.entity_linking:EntityLinker",
"in": ["entity_substr", "entity_tags", "sentences"],
"out": ["entity_ids", "entity_conf", "entity_pages", "first_pars", "dbpedia_types"],
"out": ["entity_ids", "entity_conf", "entity_id_tags", "entity_pages", "first_pars", "dbpedia_types"],
"load_path": "{DOWNLOADS_PATH}/entity_linking_eng/el_eng_dream",
"add_info_filename": "{DOWNLOADS_PATH}/entity_linking_eng/el_eng_dream/add_info.db",
"tags_filename": "{MODELS_PATH}/finegrained_tags/tag.dict",
@@ -35,7 +35,7 @@
"lang": "en"
}
],
"out": ["entity_substr", "entity_ids", "entity_conf", "entity_pages", "first_pars", "dbpedia_types"]
"out": ["entity_substr", "entity_ids", "entity_conf", "entity_id_tags", "entity_pages", "first_pars", "dbpedia_types"]
},
"metadata": {
"variables": {
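Note on the config change above: the chainer now emits entity_id_tags (the fine-grained tag of each candidate entity id) between entity_conf and entity_pages, so any consumer unpacking the pipeline's outputs gains one element. A minimal sketch of calling the updated pipeline through the standard DeepPavlov API; the input values, and the (tag, confidence) format of entity_tags, are assumptions for illustration, not taken from the PR:

from deeppavlov import build_model

# Build the entity linking pipeline from the updated config.
el = build_model("annotators/entity_linking/entity_linking_eng.json", download=True)

(entity_substr, entity_ids, entity_conf,
 entity_id_tags,  # new output added by this PR
 entity_pages, first_pars, dbpedia_types) = el(
    [["forrest gump"]],             # entity_substr batch
    [[[("film", 0.9)]]],            # entity_tags batch: (tag, confidence) per substring
    [["I watched Forrest Gump."]],  # sentences batch
)
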
21 changes: 18 additions & 3 deletions annotators/entity_linking/server.py
@@ -51,6 +51,7 @@ def respond():
entity_substr_batch,
entity_ids_batch,
conf_batch,
entity_id_tags_batch,
entity_pages_batch,
first_pars_batch,
dbpedia_types_batch,
@@ -60,21 +61,35 @@
entity_substr_list,
entity_ids_list,
conf_list,
entity_id_tags_list,
entity_pages_list,
first_pars_list,
dbpedia_types_list,
) in zip(
entity_substr_batch, entity_ids_batch, conf_batch, entity_pages_batch, first_pars_batch, dbpedia_types_batch
entity_substr_batch,
entity_ids_batch,
conf_batch,
entity_id_tags_batch,
entity_pages_batch,
first_pars_batch,
dbpedia_types_batch,
):
entity_info_list = []
for entity_substr, entity_ids, confs, entity_pages, first_pars, dbpedia_types in zip(
entity_substr_list, entity_ids_list, conf_list, entity_pages_list, first_pars_list, dbpedia_types_list
for entity_substr, entity_ids, confs, entity_id_tags, entity_pages, first_pars, dbpedia_types in zip(
entity_substr_list,
entity_ids_list,
conf_list,
entity_id_tags_list,
entity_pages_list,
first_pars_list,
dbpedia_types_list,
):
entity_info = {}
entity_info["entity_substr"] = entity_substr
entity_info["entity_ids"] = entity_ids
entity_info["confidences"] = [float(elem[2]) for elem in confs]
entity_info["tokens_match_conf"] = [float(elem[0]) for elem in confs]
entity_info["entity_id_tags"] = entity_id_tags
entity_info["pages_titles"] = entity_pages
entity_info["first_paragraphs"] = first_pars
entity_info["dbpedia_types"] = dbpedia_types
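With the handler change above, each entity_info dict in the service response gains an entity_id_tags field alongside the existing keys. A hedged sketch of one element of the output, with illustrative values only:

# One entity_info dict as assembled in respond() above (values illustrative).
entity_info = {
    "entity_substr": "forrest gump",
    "entity_ids": ["Q134773", "Q3077690"],
    "confidences": [1.0, 0.93],            # float(elem[2]) for each candidate
    "tokens_match_conf": [1.0, 1.0],       # float(elem[0]) for each candidate
    "entity_id_tags": ["film", "literary_work"],  # new in this PR
    "pages_titles": ["Forrest Gump", "Forrest Gump (novel)"],
    "first_paragraphs": ["Forrest Gump is a 1994 American film ...", "..."],
    "dbpedia_types": [["dbo:Film"], ["dbo:Book"]],
}
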
86 changes: 61 additions & 25 deletions annotators/entity_linking/src/entity_linking.py
@@ -59,7 +59,6 @@ def __init__(
**kwargs,
) -> None:
"""

Args:
load_path: path to folder with inverted index files
entity_ranker: component deeppavlov.models.kbqa.rel_ranking_bert
@@ -93,6 +92,7 @@ def __init__(
self.full_paragraph = full_paragraph
self.re_tokenizer = re.compile(r"[\w']+|[^\w ]")
self.not_found_str = "not in wiki"
self.stemmer = nltk.PorterStemmer()
self.related_tags = {
"loc": ["gpe", "country", "city", "us_state", "river"],
"gpe": ["loc", "country", "city", "us_state"],
@@ -107,6 +107,16 @@
"politician": ["per"],
"writer": ["per"],
}
self.not_named_entities_tags = {
"animal",
"food",
"music_genre",
"misc",
"language",
"occupation",
"type_of_sport",
"product",
}
self.word_searcher = None
if self.words_dict_filename:
self.word_searcher = WordSearcher(self.words_dict_filename, self.ngrams_matrix_filename)
@@ -168,11 +178,11 @@ def __call__(
entity_offsets_list.append([st_offset, end_offset])
entity_offsets_batch.append(entity_offsets_list)

entity_ids_batch, entity_conf_batch, entity_pages_batch = [], [], []
entity_ids_batch, entity_conf_batch, entity_pages_batch, entity_id_tags_batch = [], [], [], []
for entity_substr_list, entity_offsets_list, entity_tags_list, sentences_list, sentences_offsets_list in zip(
entity_substr_batch, entity_offsets_batch, entity_tags_batch, sentences_batch, sentences_offsets_batch
):
entity_ids_list, entity_conf_list, entity_pages_list = self.link_entities(
entity_ids_list, entity_conf_list, entity_pages_list, entity_id_tags_list = self.link_entities(
entity_substr_list,
entity_offsets_list,
entity_tags_list,
Expand All @@ -186,9 +196,17 @@ def __call__(
entity_pages_list = [entity_pages[: self.num_entities_to_return] for entity_pages in entity_pages_list]
entity_ids_batch.append(entity_ids_list)
entity_conf_batch.append(entity_conf_list)
entity_id_tags_batch.append(entity_id_tags_list)
entity_pages_batch.append(entity_pages_list)
first_par_batch, dbpedia_types_batch = self.extract_add_info(entity_pages_batch)
return entity_ids_batch, entity_conf_batch, entity_pages_batch, first_par_batch, dbpedia_types_batch
return (
entity_ids_batch,
entity_conf_batch,
entity_id_tags_batch,
entity_pages_batch,
first_par_batch,
dbpedia_types_batch,
)

def extract_add_info(self, entity_pages_batch: List[List[List[str]]]):
first_par_batch, dbpedia_types_batch = [], []
@@ -201,8 +219,10 @@ def extract_add_info(self, entity_pages_batch: List[List[List[str]]]):
query = "SELECT * FROM entity_additional_info WHERE page_title='{}';".format(entity_page)
res = self.add_info_cur.execute(query)
fetch_res = res.fetchall()
first_par = fetch_res[0][1]
dbpedia_types_elem = fetch_res[0][2].split()
first_par, dbpedia_types_elem = "", []
if fetch_res:
first_par = fetch_res[0][1]
dbpedia_types_elem = fetch_res[0][2].split()
first_pars.append(first_par)
dbpedia_types.append(dbpedia_types_elem)
except Exception as e:
@@ -227,7 +247,8 @@ def link_entities(
f"entity_substr_list {entity_substr_list} entity_tags_list {entity_tags_list} "
f"entity_offsets_list {entity_offsets_list}"
)
entity_ids_list, conf_list, pages_list, pages_dict_list, descr_list = [], [], [], [], []
entity_ids_list, conf_list, pages_list, entity_id_tags_list, descr_list = [], [], [], [], []
pages_dict_list = []
if entity_substr_list:
entities_scores_list = []
cand_ent_scores_list = []
@@ -271,6 +292,10 @@
corr_words = self.word_searcher(entity_substr_split[0], set(clean_tags + corr_clean_tags))
if corr_words:
cand_ent_init = self.find_exact_match(corr_words[0], tags + corr_tags)
if len(entity_substr_split) == 1 and self.stemmer.stem(entity_substr) != entity_substr:
entity_substr_stemmed = self.stemmer.stem(entity_substr)
stem_cand_ent_init = self.find_exact_match(entity_substr_stemmed, tags)
cand_ent_init = {**cand_ent_init, **stem_cand_ent_init}
if not cand_ent_init and len(entity_substr_split) > 1:
cand_ent_init = self.find_fuzzy_match(entity_substr_split, tags)

@@ -284,15 +309,22 @@
cand_ent_scores = cand_ent_scores[: self.num_entities_for_bert_ranking]
cand_ent_scores_list.append(cand_ent_scores)
entity_ids = [elem[0] for elem in cand_ent_scores]
pages = [elem[5] for elem in cand_ent_scores]
entity_id_tags = [elem[5] for elem in cand_ent_scores]
pages = [elem[6] for elem in cand_ent_scores]
scores = [elem[1:5] for elem in cand_ent_scores]
entities_scores_list.append(
{entity_id: entity_scores for entity_id, entity_scores in zip(entity_ids, scores)}
)
entity_ids_list.append(entity_ids)
entity_id_tags_list.append(entity_id_tags)
pages_list.append(pages)
pages_dict_list.append({entity_id: page for entity_id, page in zip(entity_ids, pages)})
descr_list.append([elem[6] for elem in cand_ent_scores])
pages_dict_list.append(
{
entity_id: (page, entity_id_tag)
for entity_id, page, entity_id_tag in zip(entity_ids, pages, entity_id_tags)
}
)
descr_list.append([elem[7] for elem in cand_ent_scores])

if self.use_descriptions:
substr_lens = [len(entity_substr.split()) for entity_substr in entity_substr_list]
@@ -308,16 +340,19 @@
substr_lens,
)
pages_list = [
[pages_dict.get(entity_id, "") for entity_id in entity_ids]
[pages_dict.get(entity_id, ("", ""))[0] for entity_id in entity_ids]
for entity_ids, pages_dict in zip(entity_ids_list, pages_dict_list)
]

return entity_ids_list, conf_list, pages_list
entity_id_tags_list = [
[pages_dict.get(entity_id, ("", ""))[1] for entity_id in entity_ids]
for entity_ids, pages_dict in zip(entity_ids_list, pages_dict_list)
]
return entity_ids_list, conf_list, pages_list, entity_id_tags_list

def process_cand_ent(self, cand_ent_init, entities_and_ids, entity_substr_split, tag, tag_conf):
for entity_title, entity_id, entity_rels, anchor_cnt, _, page, descr in entities_and_ids:
for entity_title, entity_id, entity_rels, anchor_cnt, tag, page, descr in entities_and_ids:
substr_score = self.calc_substr_score(entity_title, entity_substr_split)
cand_ent_init[entity_id].add((substr_score, anchor_cnt, entity_rels, tag_conf, page, descr))
cand_ent_init[entity_id].add((substr_score, anchor_cnt, entity_rels, tag_conf, tag, page, descr))
return cand_ent_init

def find_exact_match(self, entity_substr, tags):
@@ -333,15 +368,16 @@
cand_ent_init = self.process_cand_ent(
cand_ent_init, entities_and_ids, entity_substr_split, tag, tag_conf
)
if tags and tags[0][0] == "misc" and not cand_ent_init:
if tags and ((tags[0][0] == "misc" and not cand_ent_init) or tags[0][1] < 0.7):
for tag in self.cursors:
query = "SELECT * FROM inverted_index WHERE title MATCH '{}';".format(entity_substr)
res = self.cursors[tag].execute(query)
entities_and_ids = res.fetchall()
if entities_and_ids:
cand_ent_init = self.process_cand_ent(
cand_ent_init, entities_and_ids, entity_substr_split, tag, tag_conf
)
if (tags[0][0] == "misc" and tag in self.not_named_entities_tags) or tags[0][0] != "misc":
query = "SELECT * FROM inverted_index WHERE title MATCH '{}';".format(entity_substr)
res = self.cursors[tag].execute(query)
entities_and_ids = res.fetchall()
if entities_and_ids:
cand_ent_init = self.process_cand_ent(
cand_ent_init, entities_and_ids, entity_substr_split, tag, tag_conf
)
return cand_ent_init

def find_fuzzy_match(self, entity_substr_split, tags):
@@ -481,8 +517,8 @@ def rank_by_description(
)
for entity, score in scores
]
log.info(f"len entities with scores {len(entities_with_scores)}")
if entity_tags and entity_tags[0][0] == "misc":
log.info(f"len entities with scores {len(entities_with_scores)} --- entity_tags {entity_tags}")
if entity_tags and (entity_tags[0][0] == "misc" or entity_tags[0][1] < 0.7):
entities_with_scores = sorted(entities_with_scores, key=lambda x: (x[1], x[2], x[4]), reverse=True)
else:
entities_with_scores = sorted(entities_with_scores, key=lambda x: (x[1], x[4], x[3]), reverse=True)
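Two behavioral changes in this file are easy to miss in the diff: (1) single-word substrings now also get an exact-match lookup under their Porter stem, and (2) the fallback search over all tag tables now also fires for low-confidence tags (tags[0][1] < 0.7), restricted to not_named_entities_tags when the top tag is misc. A standalone sketch of the stemming fallback, assuming NLTK is installed; find_exact_match here stands in for the class method of the same name, and the dict merge simplifies the original candidate structure:

import nltk

stemmer = nltk.PorterStemmer()

def candidates_with_stem_fallback(entity_substr, tags, find_exact_match):
    """Simplified mirror of the new logic: if a single-word substring
    changes under stemming, merge in candidates found for its stem."""
    cand_ent_init = find_exact_match(entity_substr, tags)
    if len(entity_substr.split()) == 1 and stemmer.stem(entity_substr) != entity_substr:
        stem_cands = find_exact_match(stemmer.stem(entity_substr), tags)
        cand_ent_init = {**cand_ent_init, **stem_cands}
    return cand_ent_init

# e.g. "dogs" stems to "dog", so entities indexed under "dog" are also considered.
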
20 changes: 20 additions & 0 deletions annotators/property_extraction/Dockerfile
@@ -0,0 +1,20 @@
FROM deeppavlov/base-gpu
[Review comment] Collaborator: Should the version be pinned?
[Review comment] Contributor Author: Pinned it.


RUN apt-get update && apt-get install git -y

ARG CONFIG
ARG PORT
ARG SRC_DIR
ARG SED_ARG=" | "
[Review comment] Collaborator: These are unused.


ENV CONFIG=$CONFIG
ENV PORT=$PORT

COPY ./annotators/property_extraction/requirements.txt /src/requirements.txt
RUN pip install -r /src/requirements.txt

COPY $SRC_DIR /src

WORKDIR /src

CMD gunicorn --workers=1 --timeout 500 server:app -b 0.0.0.0:8129
100 changes: 100 additions & 0 deletions annotators/property_extraction/property_classification_distilbert.json
@@ -0,0 +1,100 @@
{
"dataset_reader": {
"class_name": "sq_reader",
"data_path": "{DOWNLOADS_PATH}/dialogue_nli/dialogue_nli_cls.json"
},
"dataset_iterator": {
"class_name": "basic_classification_iterator",
"seed": 42
},
"chainer": {
"in": ["x"],
"in_y": ["y"],
"pipe": [
{
"class_name": "torch_transformers_preprocessor",
"vocab_file": "{TRANSFORMER}",
"do_lower_case": false,
"max_seq_length": 64,
"in": ["x"],
"out": ["bert_features"]
},
{
"id": "classes_vocab",
"class_name": "simple_vocab",
"fit_on": ["y"],
"save_path": "{MODEL_PATH}/classes.dict",
"load_path": "{MODEL_PATH}/classes.dict",
"in": ["y"],
"out": ["y_ids"]
},
{
"in": ["y_ids"],
"out": ["y_onehot"],
"class_name": "one_hotter",
"depth": "#classes_vocab.len",
"single_vector": true
},
{
"class_name": "torch_transformers_classifier",
"n_classes": "#classes_vocab.len",
"return_probas": true,
"pretrained_bert": "{TRANSFORMER}",
"save_path": "{MODEL_PATH}/model",
"load_path": "{MODEL_PATH}/model",
"optimizer": "AdamW",
"optimizer_parameters": {"lr": 1e-05},
"learning_rate_drop_patience": 5,
"learning_rate_drop_div": 2.0,
"in": ["bert_features"],
"in_y": ["y_ids"],
"out": ["y_pred_probas"]
},
{
"in": ["y_pred_probas"],
"out": ["y_pred_ids"],
"class_name": "proba2labels",
"max_proba": true
},
{
"in": ["y_pred_ids"],
"out": ["y_pred_labels"],
"ref": "classes_vocab"
}
],
"out": ["y_pred_labels"]
},
"train": {
"epochs": 100,
"batch_size": 64,
"metrics": [
"f1_macro",
"accuracy"
],
"validation_patience": 10,
"val_every_n_batches": 100,
"log_every_n_batches": 100,
"show_examples": false,
"evaluation_targets": ["valid", "test"],
"class_name": "torch_trainer"
},
"metadata": {
"variables": {
"TRANSFORMER": "distilbert-base-uncased",
"ROOT_PATH": "~/.deeppavlov",
"DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
"MODELS_PATH": "{ROOT_PATH}/models",
"MODEL_PATH": "{MODELS_PATH}/classifiers/property_classification"
},
"download": [
{
"url": "http://files.deeppavlov.ai/deeppavlov_data/generative_ie/property_classification.tar.gz",
"subdir": "{MODEL_PATH}"
},
{
"url": "http://files.deeppavlov.ai/deeppavlov_data/generative_ie/dialogue_nli_cls.tar.gz",
"subdir": "{DOWNLOADS_PATH}/dialogue_nli"
}
]
}
}
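
A short sketch of how this config would typically be exercised with the standard DeepPavlov API (build_model and train_model both accept a config path; download=True fetches the pretrained weights and the dialogue_nli_cls data listed in metadata; the sample utterance is illustrative):

from deeppavlov import build_model, train_model

CONFIG = "annotators/property_extraction/property_classification_distilbert.json"

# Inference with the pretrained property classifier.
model = build_model(CONFIG, download=True)
print(model(["i have two dogs and a cat"]))  # -> predicted relation label(s)

# Fine-tuning on the dialogue NLI classification data instead:
# model = train_model(CONFIG, download=True)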