Add lemmatization to gpt helper (#387)
SavenkovIgor authored Aug 11, 2024
1 parent f1c9b98 commit 3485d08
Showing 4 changed files with 117 additions and 11 deletions.
9 changes: 8 additions & 1 deletion .vscode/settings.json
@@ -1,7 +1,14 @@
 {
   "cSpell.words": [
-    "dotenv",
     "gtest",
     "openai"
   ],
+  "cSpell.ignoreWords": [
+    "dotenv",
+    "Lemmatizer",
+    "lemmatize",
+    "lemmatized",
+    "punkt",
+    "wordnet"
+  ]
 }
2 changes: 1 addition & 1 deletion source/staticDataStorage/data/Global.json
@@ -4207,7 +4207,7 @@
 },
 {
   "area": "chem",
-  "termDef": "Gamma-Aminobutyric Acid - a simple {amino acid} with a {carbon skeleton} of {butane}. 4-Aminobutyric Acid"
+  "termDef": "Gamma-Aminobutyric Acid - a simple {amino acid} with a {carbon skeleton} of {butane}. 4-Aminobutyric {Acid}"
 },
 {
   "area": "chem",
1 change: 1 addition & 0 deletions tools/gpt-helper/requirements.txt
@@ -2,3 +2,4 @@ jupyter
 jupytext
 openai
 python-dotenv
+nltk
116 changes: 107 additions & 9 deletions tools/gpt-helper/term_helper.ipynb
@@ -2,21 +2,55 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "413b2b8e",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package punkt to /home/user/nltk_data...\n",
+      "[nltk_data] Package punkt is already up-to-date!\n",
+      "[nltk_data] Downloading package punkt_tab to /home/user/nltk_data...\n",
+      "[nltk_data] Package punkt_tab is already up-to-date!\n",
+      "[nltk_data] Downloading package wordnet to /home/user/nltk_data...\n",
+      "[nltk_data] Package wordnet is already up-to-date!\n"
+     ]
+    }
+   ],
    "source": [
-    "import openai\n",
-    "import os\n",
     "import json\n",
+    "import os\n",
     "import re\n",
     "\n",
-    "from typing import List, Dict, Any\n",
-    "from pathlib import Path\n",
     "from dataclasses import dataclass\n",
+    "from pathlib import Path\n",
+    "from typing import Any, Dict, List, Tuple\n",
+    "\n",
+    "import nltk\n",
+    "import openai\n",
+    "from dotenv import find_dotenv, load_dotenv\n",
+    "from nltk.stem import PorterStemmer, WordNetLemmatizer\n",
+    "from nltk.tokenize import word_tokenize\n",
+    "\n",
+    "nltk.download('punkt')\n",
+    "nltk.download('punkt_tab')\n",
+    "nltk.download('wordnet')\n",
+    "\n",
+    "lemmatizer = WordNetLemmatizer()\n",
+    "\n",
+    "syntax_tokens: List[str] = [\"{\", \"}\", \"(\", \")\", \"[\", \"]\", \"+\", \"-\", \",\", \".\"]\n",
+    "\n",
+    "def lemmatize(text: str, drop_tokens: List[str] = syntax_tokens) -> str:\n",
+    "    tokens = word_tokenize(text)\n",
+    "    lemmatized_words: List[str] = [lemmatizer.lemmatize(word) for word in tokens]\n",
+    "    ret: List[str] = []\n",
+    "    for word in lemmatized_words:\n",
+    "        if word not in drop_tokens:\n",
+    "            ret.append(word)\n",
+    "\n",
+    "    return ' '.join(ret)\n",
-    "\n",
-    "from dotenv import load_dotenv, find_dotenv\n",
     "_ = load_dotenv(find_dotenv())\n",
     "\n",
     "openai.api_key = os.getenv('OPENAI_API_KEY')\n",
@@ -52,6 +86,16 @@
" def has_definition(self) -> bool:\n",
" return len(self.definition) > 0\n",
"\n",
" @property\n",
" def lemmatized_term(self) -> str:\n",
" return lemmatize(self.term)\n",
"\n",
" def lemmatized_definition(self, remove_links: bool = True) -> str:\n",
" definition = self.definition\n",
" if remove_links:\n",
" definition = re.sub(r\"\\{.*\\}\", \"\", self.definition)\n",
" return lemmatize(definition)\n",
"\n",
" def term_def_without_uuids(self) -> str:\n",
" # Replaces |[a-f0-9-]{36}\\} with \"\\}\"\n",
" return re.sub(r\"\\|[a-f0-9-]{36}\\}\", \"}\", self.termDef)\n",
@@ -222,7 +266,7 @@
"data_path: Path = Path(\"../../source/staticDataStorage/data/\").resolve()\n",
"\n",
"# bio_storage: StaticStorage = StaticStorage(data_path / \"Biochemistry.json\")\n",
"glb_storage: StaticStorage = StaticStorage(data_path / \"Global.json\")\n"
"glb_storage: StaticStorage = StaticStorage(data_path / \"Global.json\")"
]
},
{
@@ -395,6 +439,60 @@
"glb_storage.validate()\n",
"glb_storage.save()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b7b09917",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processed [0/1426] terms\n",
"Processed [100/1426] terms\n",
"Processed [200/1426] terms\n",
"Processed [300/1426] terms\n",
"Processed [400/1426] terms\n",
"Processed [500/1426] terms\n",
"Found unmarked term: H3 in term: Oleic Acid\n",
"Processed [600/1426] terms\n",
"Found unmarked term: Gene in term: Carbonyl Group\n",
"Processed [700/1426] terms\n",
"Found unmarked term: Gene in term: Aldehyde Group\n",
"Found unmarked term: H4 in term: Methane\n",
"Processed [800/1426] terms\n",
"Processed [900/1426] terms\n",
"Processed [1000/1426] terms\n",
"Found unmarked term: Acid in term: Gamma-Aminobutyric Acid\n",
"Processed [1100/1426] terms\n",
"Found unmarked term: H4 in term: Ethylene\n",
"Processed [1200/1426] terms\n",
"Processed [1300/1426] terms\n",
"Processed [1400/1426] terms\n"
]
}
],
"source": [
"lemmatized_terms: List[str] = []\n",
"\n",
"for term in glb_storage.terms:\n",
" lemmatized_terms.append(term.lemmatized_term)\n",
"\n",
"term_data_list = glb_storage.terms\n",
"terms_count = len(term_data_list)\n",
"\n",
"for i, term_data in enumerate(term_data_list):\n",
" lemmatized_definition = term_data.lemmatized_definition()\n",
"\n",
" if i % 100 == 0:\n",
" print(f\"Processed [{i}/{terms_count}] terms\")\n",
"\n",
" for l_term in lemmatized_terms:\n",
" if l_term in lemmatized_definition:\n",
" print(f'Found unmarked term: {l_term} in term: {term_data.term}')"
]
}
],
"metadata": {
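
The scan in the last cell is a plain substring test over lemmatized strings. A minimal sketch of why the lemmatization matters (illustrative strings, reusing the lemmatize helper defined earlier in the notebook):

    print("property" in "physical properties")                        # False: raw forms differ
    print(lemmatize("property") in lemmatize("physical properties"))  # True: both sides reduce to "property"

Because the test is substring-based, short terms can still over-match, so hits such as "H3" and "Gene" in the output above presumably need manual review before link braces are added, as was done for {Acid} in the Global.json change in this commit.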
