Add lemmatization to gpt helper (#387)
SavenkovIgor authored Aug 11, 2024
1 parent f1c9b98 commit 3485d08
Showing 4 changed files with 117 additions and 11 deletions.
9 changes: 8 additions & 1 deletion .vscode/settings.json
@@ -1,7 +1,14 @@
 {
   "cSpell.words": [
-    "dotenv",
     "gtest",
     "openai"
   ],
+  "cSpell.ignoreWords": [
+    "dotenv",
+    "Lemmatizer",
+    "lemmatize",
+    "lemmatized",
+    "punkt",
+    "wordnet"
+  ]
 }
2 changes: 1 addition & 1 deletion source/staticDataStorage/data/Global.json
@@ -4207,7 +4207,7 @@
 },
 {
   "area": "chem",
-  "termDef": "Gamma-Aminobutyric Acid - a simple {amino acid} with a {carbon skeleton} of {butane}. 4-Aminobutyric Acid"
+  "termDef": "Gamma-Aminobutyric Acid - a simple {amino acid} with a {carbon skeleton} of {butane}. 4-Aminobutyric {Acid}"
 },
 {
   "area": "chem",
1 change: 1 addition & 0 deletions tools/gpt-helper/requirements.txt
@@ -2,3 +2,4 @@ jupyter
 jupytext
 openai
 python-dotenv
+nltk
116 changes: 107 additions & 9 deletions tools/gpt-helper/term_helper.ipynb
@@ -2,21 +2,55 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "413b2b8e",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package punkt to /home/user/nltk_data...\n",
+      "[nltk_data] Package punkt is already up-to-date!\n",
+      "[nltk_data] Downloading package punkt_tab to /home/user/nltk_data...\n",
+      "[nltk_data] Package punkt_tab is already up-to-date!\n",
+      "[nltk_data] Downloading package wordnet to /home/user/nltk_data...\n",
+      "[nltk_data] Package wordnet is already up-to-date!\n"
+     ]
+    }
+   ],
    "source": [
-    "import openai\n",
-    "import os\n",
     "import json\n",
+    "import os\n",
     "import re\n",
     "\n",
-    "from typing import List, Dict, Any\n",
-    "from pathlib import Path\n",
     "from dataclasses import dataclass\n",
+    "from pathlib import Path\n",
+    "from typing import Any, Dict, List, Tuple\n",
+    "\n",
+    "import nltk\n",
+    "import openai\n",
+    "from dotenv import find_dotenv, load_dotenv\n",
+    "from nltk.stem import PorterStemmer, WordNetLemmatizer\n",
+    "from nltk.tokenize import word_tokenize\n",
+    "\n",
+    "nltk.download('punkt')\n",
+    "nltk.download('punkt_tab')\n",
+    "nltk.download('wordnet')\n",
+    "\n",
+    "lemmatizer = WordNetLemmatizer()\n",
+    "\n",
+    "syntax_tokens: List[str] = [\"{\", \"}\", \"(\", \")\", \"[\", \"]\", \"+\", \"-\", \",\", \".\"]\n",
+    "\n",
+    "def lemmatize(text: str, drop_tokens: List[str] = syntax_tokens) -> str:\n",
+    "    tokens = word_tokenize(text)\n",
+    "    lemmatized_words: List[str] = [lemmatizer.lemmatize(word) for word in tokens]\n",
+    "    ret: List[str] = []\n",
+    "    for word in lemmatized_words:\n",
+    "        if word not in drop_tokens:\n",
+    "            ret.append(word)\n",
+    "\n",
+    "    return ' '.join(ret)\n",
-    "\n",
-    "from dotenv import load_dotenv, find_dotenv\n",
     "_ = load_dotenv(find_dotenv())\n",
     "\n",
     "openai.api_key = os.getenv('OPENAI_API_KEY')\n",
@@ -52,6 +86,16 @@
" def has_definition(self) -> bool:\n",
" return len(self.definition) > 0\n",
"\n",
" @property\n",
" def lemmatized_term(self) -> str:\n",
" return lemmatize(self.term)\n",
"\n",
" def lemmatized_definition(self, remove_links: bool = True) -> str:\n",
" definition = self.definition\n",
" if remove_links:\n",
" definition = re.sub(r\"\\{.*\\}\", \"\", self.definition)\n",
" return lemmatize(definition)\n",
"\n",
" def term_def_without_uuids(self) -> str:\n",
" # Replaces |[a-f0-9-]{36}\\} with \"\\}\"\n",
" return re.sub(r\"\\|[a-f0-9-]{36}\\}\", \"}\", self.termDef)\n",
@@ -222,7 +266,7 @@
"data_path: Path = Path(\"../../source/staticDataStorage/data/\").resolve()\n",
"\n",
"# bio_storage: StaticStorage = StaticStorage(data_path / \"Biochemistry.json\")\n",
"glb_storage: StaticStorage = StaticStorage(data_path / \"Global.json\")\n"
"glb_storage: StaticStorage = StaticStorage(data_path / \"Global.json\")"
]
},
{
@@ -395,6 +439,60 @@
"glb_storage.validate()\n",
"glb_storage.save()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b7b09917",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processed [0/1426] terms\n",
"Processed [100/1426] terms\n",
"Processed [200/1426] terms\n",
"Processed [300/1426] terms\n",
"Processed [400/1426] terms\n",
"Processed [500/1426] terms\n",
"Found unmarked term: H3 in term: Oleic Acid\n",
"Processed [600/1426] terms\n",
"Found unmarked term: Gene in term: Carbonyl Group\n",
"Processed [700/1426] terms\n",
"Found unmarked term: Gene in term: Aldehyde Group\n",
"Found unmarked term: H4 in term: Methane\n",
"Processed [800/1426] terms\n",
"Processed [900/1426] terms\n",
"Processed [1000/1426] terms\n",
"Found unmarked term: Acid in term: Gamma-Aminobutyric Acid\n",
"Processed [1100/1426] terms\n",
"Found unmarked term: H4 in term: Ethylene\n",
"Processed [1200/1426] terms\n",
"Processed [1300/1426] terms\n",
"Processed [1400/1426] terms\n"
]
}
],
"source": [
"lemmatized_terms: List[str] = []\n",
"\n",
"for term in glb_storage.terms:\n",
" lemmatized_terms.append(term.lemmatized_term)\n",
"\n",
"term_data_list = glb_storage.terms\n",
"terms_count = len(term_data_list)\n",
"\n",
"for i, term_data in enumerate(term_data_list):\n",
" lemmatized_definition = term_data.lemmatized_definition()\n",
"\n",
" if i % 100 == 0:\n",
" print(f\"Processed [{i}/{terms_count}] terms\")\n",
"\n",
" for l_term in lemmatized_terms:\n",
" if l_term in lemmatized_definition:\n",
" print(f'Found unmarked term: {l_term} in term: {term_data.term}')"
]
}
],
"metadata": {
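
The scan in the last cell is a plain substring test over lemmatized strings. A minimal sketch of why the lemmatization matters (illustrative strings, reusing the lemmatize helper defined earlier in the notebook):

    print("property" in "physical properties")                        # False: raw forms differ
    print(lemmatize("property") in lemmatize("physical properties"))  # True: both sides reduce to "property"

Because the test is substring-based, short terms can still over-match, so hits such as "H3" and "Gene" in the output above presumably need manual review before link braces are added, as was done for {Acid} in the Global.json change in this commit.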
