Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add biomedical entity normalization #3180

Closed
wants to merge 30 commits into master from bio-entity-normalization
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
641a3c0
Initial version (already adapted to recent Flair API changes)
Mar 14, 2023
9779abf
Revise mention text pre-processing: define general interface and adap…
Mar 14, 2023
8da7d75
Refactor entity linking model structure
Mar 15, 2023
e34c831
Update documentation
Mar 22, 2023
f54925c
Introduce separate methods for pre-processing (1) entity mentions fro…
Mar 23, 2023
90a0acb
Merge branch 'master' into bio-entity-normalization
alanakbik Apr 21, 2023
f1f51fd
Fix formatting
alanakbik Apr 21, 2023
f2f21d3
feat(test): biomedical entity linking
Apr 26, 2023
82c1b8b
fix(requirements): add faiss
Apr 26, 2023
2e3cda3
fix(test): hold on w/ automatic tests for now
Apr 26, 2023
adb231e
fix(bionel): start major refactoring
Apr 26, 2023
c80f1be
fix(bionel): major refactor
Apr 27, 2023
d10d297
fix(bionel): assign entity type
May 2, 2023
25ba2dd
fix(biencoder): set sparse encoder and weight
May 2, 2023
4525d3b
fix(bionel): address comments
May 11, 2023
3a5913d
fix(candidate_generator): container for search result
May 12, 2023
734d895
fix(predict): default annotation layer iff not provided by use
May 19, 2023
d79f871
fix(label): scores can be >= or <=
May 19, 2023
118fb95
fix(candidate): parametrize database name
May 19, 2023
1fcfddf
feat(candidate_generator): cache sparse encoder
May 22, 2023
9322c1b
fix(candidate_generator): minor improvements
May 23, 2023
071f51e
feat(linking_candidate): pretty print
May 24, 2023
a23f360
fix(candidate_generator): check sparse encoder for sparse search
May 24, 2023
ce29290
chore: crystal clear dictionary name
Jun 1, 2023
0d65336
feat(candidate_generator): add sparse index
Jun 1, 2023
02812f0
fix(candidate_generator): KISS: sparse search w/ scipy sparse matrices
Jun 2, 2023
ca6eee8
Minor update to comments and documentation
Jul 12, 2023
6c8f219
Fix tests and type annotations
Jul 12, 2023
2fa43cc
Merge branch 'master' into bio-entity-normalization
Jul 12, 2023
d90d92d
Merge
Jul 12, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions flair/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
CLL,
CRAFT,
CRAFT_V4,
CTD_CHEMICAL_DICTIONARY,
CTD_DISEASE_DICTIONARY,
DECA,
FSU,
GELLUS,
Expand Down Expand Up @@ -90,10 +92,8 @@
LOCTEXT,
MIRNA,
NCBI_DISEASE,
NEL_CTD_CHEMICAL_DICT,
NEL_CTD_DISEASE_DICT,
NEL_NCBI_HUMAN_GENE_DICT,
NEL_NCBI_TAXONOMY_DICT,
NCBI_GENE_HUMAN_DICTIONARY,
NCBI_TAXONOMY_DICTIONARY,
OSIRIS,
PDR,
S800,
Expand Down
24 changes: 12 additions & 12 deletions flair/datasets/biomedical.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,7 +505,7 @@ def get_subset(self, dataset: InternalBioNerDataset, split: str, split_dir: Path
)


class AbstractBioNelDictionary(ABC):
class AbstractBiomedicalEntityLinkingDictionary(ABC):
"""
Base class for dictionaries for named entity linking.
Dictionary contains all entities in the corpus and their associated ids.
Expand All @@ -530,7 +530,7 @@ def __init__(

# check if there is a parsed_dict file in cache
if not self.dataset_file.exists():
logger.info("Preprocess and cache dictionary `%s` file: %s", (dataset_name, self.dataset_file))
logger.info("Preprocess and cache dictionary `%s` file: %s", dataset_name, self.dataset_file)
data_file = self.download_dictionary(data_folder)

with open(self.dataset_file, "w", encoding="utf-8") as f:
Expand Down Expand Up @@ -565,13 +565,13 @@ def stream(self) -> Iterator[Tuple[str, str]]:
line = line.strip()
if line == "":
continue
assert "||" in line, "Preprocessed BioNelDictionary must have lines in the format: `cui||name`"
assert "||" in line, "Preprocessed EntityLinkingDictionary must have lines in the format: `cui||name`"
cui, name = line.split("||")
name = name.lower()
yield (name, cui)


class PreprocessedBioNelDictionary(AbstractBioNelDictionary):
class ParsedBiomedicalEntityLinkingDictionary(AbstractBiomedicalEntityLinkingDictionary):
"""
Base dictionary with data already in preprocessed format
"""
Expand All @@ -590,7 +590,7 @@ def download_dictionary(self):
def parse_dictionary(self):
pass

class NEL_CTD_DISEASE_DICT(AbstractBioNelDictionary):
class CTD_DISEASE_DICTIONARY(AbstractBiomedicalEntityLinkingDictionary):
"""
Dictionary for Named Entity Linking on Diseases
"""
Expand All @@ -601,7 +601,7 @@ def __init__(
):
"""
:param base_path: Path to the corpus on your machine"""
super(NEL_CTD_DISEASE_DICT, self).__init__(base_path=base_path)
super(CTD_DISEASE_DICTIONARY, self).__init__(base_path=base_path)

def get_database_names(self):
return ["MESH", "DO:DOID", "OMIM"]
Expand Down Expand Up @@ -658,7 +658,7 @@ def parse_dictionary(self, original_file: Path) -> Iterator[Tuple[str, str]]:
yield e


class NEL_CTD_CHEMICAL_DICT(AbstractBioNelDictionary):
class CTD_CHEMICAL_DICTIONARY(AbstractBiomedicalEntityLinkingDictionary):
"""
Dictionary for Named Entity Linking on Chemicals
"""
Expand All @@ -669,7 +669,7 @@ def __init__(
):
"""
:param base_path: Path to the corpus on your machine"""
super(NEL_CTD_CHEMICAL_DICT, self).__init__(base_path=base_path)
super(CTD_CHEMICAL_DICTIONARY, self).__init__(base_path=base_path)

def get_database_names(self):
return ["MESH"]
Expand Down Expand Up @@ -724,7 +724,7 @@ def parse_dictionary(self, original_file: Path) -> Iterator[Tuple[str, str]]:



class NEL_NCBI_HUMAN_GENE_DICT(AbstractBioNelDictionary):
class NCBI_GENE_HUMAN_DICTIONARY(AbstractBiomedicalEntityLinkingDictionary):
"""
Dictionary for Named Entity Linking on Genes
"""
Expand All @@ -735,7 +735,7 @@ def __init__(
):
"""
:param base_path: Path to the corpus on your machine"""
super(NEL_NCBI_HUMAN_GENE_DICT, self).__init__(base_path=base_path)
super(NCBI_GENE_HUMAN_DICTIONARY, self).__init__(base_path=base_path)

def _is_invalid_name(self, name: str) -> bool:
"""
Expand Down Expand Up @@ -812,7 +812,7 @@ def parse_dictionary(self, original_file: Path) -> Iterator[Tuple[str, str]]:
yield e


class NEL_NCBI_TAXONOMY_DICT(AbstractBioNelDictionary):
class NCBI_TAXONOMY_DICTIONARY(AbstractBiomedicalEntityLinkingDictionary):
"""
Dictionary for Named Entity Linking on Organisms
"""
Expand All @@ -823,7 +823,7 @@ def __init__(
):
"""
:param base_path: Path to the corpus on your machine"""
super(NEL_NCBI_TAXONOMY_DICT, self).__init__(base_path=base_path)
super(NCBI_TAXONOMY_DICTIONARY, self).__init__(base_path=base_path)

def get_database_names(self):
return ["NCBI Taxonomy"]
Expand Down
Loading