property extraction annotator #202

Merged
Merged 50 commits from feat/property_extraction into dev on Feb 21, 2023.
Changes shown below are from 20 of the 50 commits.

Commits (50)
f38010e
property extraction
dmitrijeuseew Sep 26, 2022
edcd39d
fixes
dmitrijeuseew Sep 27, 2022
ce37842
fixes
dmitrijeuseew Sep 27, 2022
766d852
Merge remote-tracking branch 'origin/dev' into feat/property_extraction
dilyararimovna Sep 28, 2022
26eee51
fixes
dmitrijeuseew Sep 28, 2022
44ddf46
add finegrained_types to property extraction
dmitrijeuseew Sep 29, 2022
17c7534
update
dmitrijeuseew Sep 29, 2022
db192c9
fixes
dmitrijeuseew Oct 3, 2022
5c0e407
fixes
dmitrijeuseew Oct 4, 2022
f6cbead
fix plural nouns
dmitrijeuseew Oct 4, 2022
e389411
add triplets
dmitrijeuseew Oct 5, 2022
436a075
add to yml files
dmitrijeuseew Oct 6, 2022
a4906aa
fix tests
dmitrijeuseew Oct 6, 2022
9648507
entity linking input from property extraction
dmitrijeuseew Oct 8, 2022
e1f03ae
Merge branch 'dev' into feat/property_extraction
dmitrijeuseew Oct 13, 2022
55e3c73
fixes
dmitrijeuseew Oct 13, 2022
6df8689
change gpu number
dmitrijeuseew Oct 13, 2022
b235720
model on cpu
dmitrijeuseew Oct 13, 2022
16f7f6d
add entity linking to proxy.yml
dmitrijeuseew Oct 13, 2022
ef744a2
fix tests
dmitrijeuseew Oct 13, 2022
ce11678
fix dp version and property extraction to cpu
dmitrijeuseew Oct 14, 2022
1bbc1e9
remove env variables
dmitrijeuseew Oct 17, 2022
6f36573
annotate bot utterances
dmitrijeuseew Dec 14, 2022
c604088
add timeout
dmitrijeuseew Dec 15, 2022
9c98d35
add property extraction to readme
dmitrijeuseew Dec 26, 2022
9b559f6
Merge remote-tracking branch 'origin/dev' into feat/property_extraction
dmitrijeuseew Dec 29, 2022
c778559
update
dmitrijeuseew Dec 29, 2022
8b7b92f
codestyle
dmitrijeuseew Dec 29, 2022
da6efd2
fix state formatter
dmitrijeuseew Dec 30, 2022
e590473
Merge remote-tracking branch 'origin/dev' into feat/property_extraction
dmitrijeuseew Jan 13, 2023
02c4dd2
update requirements
dmitrijeuseew Jan 15, 2023
d697c5d
fix requirements
dmitrijeuseew Jan 15, 2023
def0c71
fix requirements
dmitrijeuseew Jan 15, 2023
401df87
update el requirements
dmitrijeuseew Jan 15, 2023
be2711c
Merge branch 'dev' into feat/property_extraction
dilyararimovna Jan 19, 2023
6a85675
fix: revert entity detection
dilyararimovna Jan 19, 2023
0e3e533
Merge branch 'dev' into feat/property_extraction
dilyararimovna Jan 20, 2023
bf5ee59
Merge branch 'dev' into feat/property_extraction
dilyararimovna Jan 26, 2023
b3caa34
sentence rewrite
dmitrijeuseew Jan 30, 2023
54d61a0
update
dmitrijeuseew Feb 2, 2023
b72d641
Merge branch 'dev' into feat/property_extraction
dmitrijeuseew Feb 2, 2023
dbc1604
fix typo
dmitrijeuseew Feb 3, 2023
05fe4af
fix requirements
dmitrijeuseew Feb 3, 2023
617e719
Merge branch 'dev' into feat/property_extraction
dmitrijeuseew Feb 15, 2023
896b153
fix typo
dmitrijeuseew Feb 15, 2023
01c0222
change port
dmitrijeuseew Feb 15, 2023
7e8e68e
update el
dmitrijeuseew Feb 16, 2023
f58f7ea
fix tests
dmitrijeuseew Feb 16, 2023
8c8fa95
Merge branch 'dev' into feat/property_extraction
dilyararimovna Feb 21, 2023
1db56ba
fix: readme and paths
dilyararimovna Feb 21, 2023
4 changes: 2 additions & 2 deletions annotators/entity_linking/entity_linking_eng.json
@@ -15,7 +15,7 @@
{
"class_name": "src.entity_linking:EntityLinker",
"in": ["entity_substr", "entity_tags", "sentences"],
"out": ["entity_ids", "entity_conf", "entity_pages", "first_pars", "dbpedia_types"],
"out": ["entity_ids", "entity_conf", "entity_id_tags", "entity_pages", "first_pars", "dbpedia_types"],
"load_path": "{DOWNLOADS_PATH}/entity_linking_eng/el_eng_dream",
"add_info_filename": "{DOWNLOADS_PATH}/entity_linking_eng/el_eng_dream/add_info.db",
"tags_filename": "{MODELS_PATH}/finegrained_tags/tag.dict",
@@ -35,7 +35,7 @@
"lang": "en"
}
],
"out": ["entity_substr", "entity_ids", "entity_conf", "entity_pages", "first_pars", "dbpedia_types"]
"out": ["entity_substr", "entity_ids", "entity_conf", "entity_id_tags", "entity_pages", "first_pars", "dbpedia_types"]
},
"metadata": {
"variables": {
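Note on the config change above: the chainer now emits entity_id_tags (the fine-grained tag of each candidate entity id) between entity_conf and entity_pages, so any consumer unpacking the pipeline's outputs gains one element. A minimal sketch of calling the updated pipeline through the standard DeepPavlov API; the input values, and the (tag, confidence) format of entity_tags, are assumptions for illustration, not taken from the PR:

from deeppavlov import build_model

# Build the entity linking pipeline from the updated config.
el = build_model("annotators/entity_linking/entity_linking_eng.json", download=True)

(entity_substr, entity_ids, entity_conf,
 entity_id_tags,  # new output added by this PR
 entity_pages, first_pars, dbpedia_types) = el(
    [["forrest gump"]],             # entity_substr batch
    [[[("film", 0.9)]]],            # entity_tags batch: (tag, confidence) per substring
    [["I watched Forrest Gump."]],  # sentences batch
)
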
21 changes: 18 additions & 3 deletions annotators/entity_linking/server.py
@@ -51,6 +51,7 @@ def respond():
entity_substr_batch,
entity_ids_batch,
conf_batch,
entity_id_tags_batch,
entity_pages_batch,
first_pars_batch,
dbpedia_types_batch,
@@ -60,21 +61,35 @@
entity_substr_list,
entity_ids_list,
conf_list,
entity_id_tags_list,
entity_pages_list,
first_pars_list,
dbpedia_types_list,
) in zip(
entity_substr_batch, entity_ids_batch, conf_batch, entity_pages_batch, first_pars_batch, dbpedia_types_batch
entity_substr_batch,
entity_ids_batch,
conf_batch,
entity_id_tags_batch,
entity_pages_batch,
first_pars_batch,
dbpedia_types_batch,
):
entity_info_list = []
for entity_substr, entity_ids, confs, entity_pages, first_pars, dbpedia_types in zip(
entity_substr_list, entity_ids_list, conf_list, entity_pages_list, first_pars_list, dbpedia_types_list
for entity_substr, entity_ids, confs, entity_id_tags, entity_pages, first_pars, dbpedia_types in zip(
entity_substr_list,
entity_ids_list,
conf_list,
entity_id_tags_list,
entity_pages_list,
first_pars_list,
dbpedia_types_list,
):
entity_info = {}
entity_info["entity_substr"] = entity_substr
entity_info["entity_ids"] = entity_ids
entity_info["confidences"] = [float(elem[2]) for elem in confs]
entity_info["tokens_match_conf"] = [float(elem[0]) for elem in confs]
entity_info["entity_id_tags"] = entity_id_tags
entity_info["pages_titles"] = entity_pages
entity_info["first_paragraphs"] = first_pars
entity_info["dbpedia_types"] = dbpedia_types
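With the handler change above, each entity_info dict in the service response gains an entity_id_tags field alongside the existing keys. A hedged sketch of one element of the output, with illustrative values only:

# One entity_info dict as assembled in respond() above (values illustrative).
entity_info = {
    "entity_substr": "forrest gump",
    "entity_ids": ["Q134773", "Q3077690"],
    "confidences": [1.0, 0.93],            # float(elem[2]) for each candidate
    "tokens_match_conf": [1.0, 1.0],       # float(elem[0]) for each candidate
    "entity_id_tags": ["film", "literary_work"],  # new in this PR
    "pages_titles": ["Forrest Gump", "Forrest Gump (novel)"],
    "first_paragraphs": ["Forrest Gump is a 1994 American film ...", "..."],
    "dbpedia_types": [["dbo:Film"], ["dbo:Book"]],
}
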
86 changes: 61 additions & 25 deletions annotators/entity_linking/src/entity_linking.py
@@ -59,7 +59,6 @@ def __init__(
**kwargs,
) -> None:
"""

Args:
load_path: path to folder with inverted index files
entity_ranker: component deeppavlov.models.kbqa.rel_ranking_bert
@@ -93,6 +92,7 @@ def __init__(
self.full_paragraph = full_paragraph
self.re_tokenizer = re.compile(r"[\w']+|[^\w ]")
self.not_found_str = "not in wiki"
self.stemmer = nltk.PorterStemmer()
self.related_tags = {
"loc": ["gpe", "country", "city", "us_state", "river"],
"gpe": ["loc", "country", "city", "us_state"],
@@ -107,6 +107,16 @@
"politician": ["per"],
"writer": ["per"],
}
self.not_named_entities_tags = {
"animal",
"food",
"music_genre",
"misc",
"language",
"occupation",
"type_of_sport",
"product",
}
self.word_searcher = None
if self.words_dict_filename:
self.word_searcher = WordSearcher(self.words_dict_filename, self.ngrams_matrix_filename)
@@ -168,11 +178,11 @@ def __call__(
entity_offsets_list.append([st_offset, end_offset])
entity_offsets_batch.append(entity_offsets_list)

entity_ids_batch, entity_conf_batch, entity_pages_batch = [], [], []
entity_ids_batch, entity_conf_batch, entity_pages_batch, entity_id_tags_batch = [], [], [], []
for entity_substr_list, entity_offsets_list, entity_tags_list, sentences_list, sentences_offsets_list in zip(
entity_substr_batch, entity_offsets_batch, entity_tags_batch, sentences_batch, sentences_offsets_batch
):
entity_ids_list, entity_conf_list, entity_pages_list = self.link_entities(
entity_ids_list, entity_conf_list, entity_pages_list, entity_id_tags_list = self.link_entities(
entity_substr_list,
entity_offsets_list,
entity_tags_list,
Expand All @@ -186,9 +196,17 @@ def __call__(
entity_pages_list = [entity_pages[: self.num_entities_to_return] for entity_pages in entity_pages_list]
entity_ids_batch.append(entity_ids_list)
entity_conf_batch.append(entity_conf_list)
entity_id_tags_batch.append(entity_id_tags_list)
entity_pages_batch.append(entity_pages_list)
first_par_batch, dbpedia_types_batch = self.extract_add_info(entity_pages_batch)
return entity_ids_batch, entity_conf_batch, entity_pages_batch, first_par_batch, dbpedia_types_batch
return (
entity_ids_batch,
entity_conf_batch,
entity_id_tags_batch,
entity_pages_batch,
first_par_batch,
dbpedia_types_batch,
)

def extract_add_info(self, entity_pages_batch: List[List[List[str]]]):
first_par_batch, dbpedia_types_batch = [], []
@@ -201,8 +219,10 @@ def extract_add_info(self, entity_pages_batch: List[List[List[str]]]):
query = "SELECT * FROM entity_additional_info WHERE page_title='{}';".format(entity_page)
res = self.add_info_cur.execute(query)
fetch_res = res.fetchall()
first_par = fetch_res[0][1]
dbpedia_types_elem = fetch_res[0][2].split()
first_par, dbpedia_types_elem = "", []
if fetch_res:
first_par = fetch_res[0][1]
dbpedia_types_elem = fetch_res[0][2].split()
first_pars.append(first_par)
dbpedia_types.append(dbpedia_types_elem)
except Exception as e:
@@ -227,7 +247,8 @@ def link_entities(
f"entity_substr_list {entity_substr_list} entity_tags_list {entity_tags_list} "
f"entity_offsets_list {entity_offsets_list}"
)
entity_ids_list, conf_list, pages_list, pages_dict_list, descr_list = [], [], [], [], []
entity_ids_list, conf_list, pages_list, entity_id_tags_list, descr_list = [], [], [], [], []
pages_dict_list = []
if entity_substr_list:
entities_scores_list = []
cand_ent_scores_list = []
@@ -271,6 +292,10 @@
corr_words = self.word_searcher(entity_substr_split[0], set(clean_tags + corr_clean_tags))
if corr_words:
cand_ent_init = self.find_exact_match(corr_words[0], tags + corr_tags)
if len(entity_substr_split) == 1 and self.stemmer.stem(entity_substr) != entity_substr:
entity_substr_stemmed = self.stemmer.stem(entity_substr)
stem_cand_ent_init = self.find_exact_match(entity_substr_stemmed, tags)
cand_ent_init = {**cand_ent_init, **stem_cand_ent_init}
if not cand_ent_init and len(entity_substr_split) > 1:
cand_ent_init = self.find_fuzzy_match(entity_substr_split, tags)

@@ -284,15 +309,22 @@
cand_ent_scores = cand_ent_scores[: self.num_entities_for_bert_ranking]
cand_ent_scores_list.append(cand_ent_scores)
entity_ids = [elem[0] for elem in cand_ent_scores]
pages = [elem[5] for elem in cand_ent_scores]
entity_id_tags = [elem[5] for elem in cand_ent_scores]
pages = [elem[6] for elem in cand_ent_scores]
scores = [elem[1:5] for elem in cand_ent_scores]
entities_scores_list.append(
{entity_id: entity_scores for entity_id, entity_scores in zip(entity_ids, scores)}
)
entity_ids_list.append(entity_ids)
entity_id_tags_list.append(entity_id_tags)
pages_list.append(pages)
pages_dict_list.append({entity_id: page for entity_id, page in zip(entity_ids, pages)})
descr_list.append([elem[6] for elem in cand_ent_scores])
pages_dict_list.append(
{
entity_id: (page, entity_id_tag)
for entity_id, page, entity_id_tag in zip(entity_ids, pages, entity_id_tags)
}
)
descr_list.append([elem[7] for elem in cand_ent_scores])

if self.use_descriptions:
substr_lens = [len(entity_substr.split()) for entity_substr in entity_substr_list]
@@ -308,16 +340,19 @@
substr_lens,
)
pages_list = [
[pages_dict.get(entity_id, "") for entity_id in entity_ids]
[pages_dict.get(entity_id, ("", ""))[0] for entity_id in entity_ids]
for entity_ids, pages_dict in zip(entity_ids_list, pages_dict_list)
]

return entity_ids_list, conf_list, pages_list
entity_id_tags_list = [
[pages_dict.get(entity_id, ("", ""))[1] for entity_id in entity_ids]
for entity_ids, pages_dict in zip(entity_ids_list, pages_dict_list)
]
return entity_ids_list, conf_list, pages_list, entity_id_tags_list

def process_cand_ent(self, cand_ent_init, entities_and_ids, entity_substr_split, tag, tag_conf):
for entity_title, entity_id, entity_rels, anchor_cnt, _, page, descr in entities_and_ids:
for entity_title, entity_id, entity_rels, anchor_cnt, tag, page, descr in entities_and_ids:
substr_score = self.calc_substr_score(entity_title, entity_substr_split)
cand_ent_init[entity_id].add((substr_score, anchor_cnt, entity_rels, tag_conf, page, descr))
cand_ent_init[entity_id].add((substr_score, anchor_cnt, entity_rels, tag_conf, tag, page, descr))
return cand_ent_init

def find_exact_match(self, entity_substr, tags):
@@ -333,15 +368,16 @@
cand_ent_init = self.process_cand_ent(
cand_ent_init, entities_and_ids, entity_substr_split, tag, tag_conf
)
if tags and tags[0][0] == "misc" and not cand_ent_init:
if tags and ((tags[0][0] == "misc" and not cand_ent_init) or tags[0][1] < 0.7):
for tag in self.cursors:
query = "SELECT * FROM inverted_index WHERE title MATCH '{}';".format(entity_substr)
res = self.cursors[tag].execute(query)
entities_and_ids = res.fetchall()
if entities_and_ids:
cand_ent_init = self.process_cand_ent(
cand_ent_init, entities_and_ids, entity_substr_split, tag, tag_conf
)
if (tags[0][0] == "misc" and tag in self.not_named_entities_tags) or tags[0][0] != "misc":
query = "SELECT * FROM inverted_index WHERE title MATCH '{}';".format(entity_substr)
res = self.cursors[tag].execute(query)
entities_and_ids = res.fetchall()
if entities_and_ids:
cand_ent_init = self.process_cand_ent(
cand_ent_init, entities_and_ids, entity_substr_split, tag, tag_conf
)
return cand_ent_init

def find_fuzzy_match(self, entity_substr_split, tags):
@@ -481,8 +517,8 @@ def rank_by_description(
)
for entity, score in scores
]
log.info(f"len entities with scores {len(entities_with_scores)}")
if entity_tags and entity_tags[0][0] == "misc":
log.info(f"len entities with scores {len(entities_with_scores)} --- entity_tags {entity_tags}")
if entity_tags and (entity_tags[0][0] == "misc" or entity_tags[0][1] < 0.7):
entities_with_scores = sorted(entities_with_scores, key=lambda x: (x[1], x[2], x[4]), reverse=True)
else:
entities_with_scores = sorted(entities_with_scores, key=lambda x: (x[1], x[4], x[3]), reverse=True)
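Two behavioral changes in this file are easy to miss in the diff: (1) single-word substrings now also get an exact-match lookup under their Porter stem, and (2) the fallback search over all tag tables now also fires for low-confidence tags (tags[0][1] < 0.7), restricted to not_named_entities_tags when the top tag is misc. A standalone sketch of the stemming fallback, assuming NLTK is installed; find_exact_match here stands in for the class method of the same name, and the dict merge simplifies the original candidate structure:

import nltk

stemmer = nltk.PorterStemmer()

def candidates_with_stem_fallback(entity_substr, tags, find_exact_match):
    """Simplified mirror of the new logic: if a single-word substring
    changes under stemming, merge in candidates found for its stem."""
    cand_ent_init = find_exact_match(entity_substr, tags)
    if len(entity_substr.split()) == 1 and stemmer.stem(entity_substr) != entity_substr:
        stem_cands = find_exact_match(stemmer.stem(entity_substr), tags)
        cand_ent_init = {**cand_ent_init, **stem_cands}
    return cand_ent_init

# e.g. "dogs" stems to "dog", so entities indexed under "dog" are also considered.
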
20 changes: 20 additions & 0 deletions annotators/property_extraction/Dockerfile
@@ -0,0 +1,20 @@
FROM deeppavlov/base-gpu
[Review comment] Collaborator: Should the version be pinned?
[Review comment] Contributor Author: Pinned it.


RUN apt-get update && apt-get install git -y

ARG CONFIG
ARG PORT
ARG SRC_DIR
ARG SED_ARG=" | "
[Review comment] Collaborator: These are unused.


ENV CONFIG=$CONFIG
ENV PORT=$PORT

COPY ./annotators/property_extraction/requirements.txt /src/requirements.txt
RUN pip install -r /src/requirements.txt

COPY $SRC_DIR /src

WORKDIR /src

CMD gunicorn --workers=1 --timeout 500 server:app -b 0.0.0.0:8129
100 changes: 100 additions & 0 deletions annotators/property_extraction/property_classification_distilbert.json
@@ -0,0 +1,100 @@
{
"dataset_reader": {
"class_name": "sq_reader",
"data_path": "{DOWNLOADS_PATH}/dialogue_nli/dialogue_nli_cls.json"
},
"dataset_iterator": {
"class_name": "basic_classification_iterator",
"seed": 42
},
"chainer": {
"in": ["x"],
"in_y": ["y"],
"pipe": [
{
"class_name": "torch_transformers_preprocessor",
"vocab_file": "{TRANSFORMER}",
"do_lower_case": false,
"max_seq_length": 64,
"in": ["x"],
"out": ["bert_features"]
},
{
"id": "classes_vocab",
"class_name": "simple_vocab",
"fit_on": ["y"],
"save_path": "{MODEL_PATH}/classes.dict",
"load_path": "{MODEL_PATH}/classes.dict",
"in": ["y"],
"out": ["y_ids"]
},
{
"in": ["y_ids"],
"out": ["y_onehot"],
"class_name": "one_hotter",
"depth": "#classes_vocab.len",
"single_vector": true
},
{
"class_name": "torch_transformers_classifier",
"n_classes": "#classes_vocab.len",
"return_probas": true,
"pretrained_bert": "{TRANSFORMER}",
"save_path": "{MODEL_PATH}/model",
"load_path": "{MODEL_PATH}/model",
"optimizer": "AdamW",
"optimizer_parameters": {"lr": 1e-05},
"learning_rate_drop_patience": 5,
"learning_rate_drop_div": 2.0,
"in": ["bert_features"],
"in_y": ["y_ids"],
"out": ["y_pred_probas"]
},
{
"in": ["y_pred_probas"],
"out": ["y_pred_ids"],
"class_name": "proba2labels",
"max_proba": true
},
{
"in": ["y_pred_ids"],
"out": ["y_pred_labels"],
"ref": "classes_vocab"
}
],
"out": ["y_pred_labels"]
},
"train": {
"epochs": 100,
"batch_size": 64,
"metrics": [
"f1_macro",
"accuracy"
],
"validation_patience": 10,
"val_every_n_batches": 100,
"log_every_n_batches": 100,
"show_examples": false,
"evaluation_targets": ["valid", "test"],
"class_name": "torch_trainer"
},
"metadata": {
"variables": {
"TRANSFORMER": "distilbert-base-uncased",
"ROOT_PATH": "~/.deeppavlov",
"DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
"MODELS_PATH": "{ROOT_PATH}/models",
"MODEL_PATH": "{MODELS_PATH}/classifiers/property_classification"
},
"download": [
{
"url": "http://files.deeppavlov.ai/deeppavlov_data/generative_ie/property_classification.tar.gz",
"subdir": "{MODEL_PATH}"
},
{
"url": "http://files.deeppavlov.ai/deeppavlov_data/generative_ie/dialogue_nli_cls.tar.gz",
"subdir": "{DOWNLOADS_PATH}/dialogue_nli"
}
]
}
}
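
A short sketch of how this config would typically be exercised with the standard DeepPavlov API (build_model and train_model both accept a config path; download=True fetches the pretrained weights and the dialogue_nli_cls data listed in metadata; the sample utterance is illustrative):

from deeppavlov import build_model, train_model

CONFIG = "annotators/property_extraction/property_classification_distilbert.json"

# Inference with the pretrained property classifier.
model = build_model(CONFIG, download=True)
print(model(["i have two dogs and a cat"]))  # -> predicted relation label(s)

# Fine-tuning on the dialogue NLI classification data instead:
# model = train_model(CONFIG, download=True)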