Extract more methods from the scripts
p-goulart committed Jan 15, 2024
1 parent 50021c8 commit 117b47b
Showing 5 changed files with 225 additions and 191 deletions.
88 changes: 88 additions & 0 deletions lib/dic_chunk.py
@@ -0,0 +1,88 @@
from os import path, remove
from tempfile import NamedTemporaryFile
from typing import List

from lib.constants import LATIN_1_ENCODING
from lib.logger import LOGGER
from lib.shell_command import ShellCommand


class DicChunk:
"""This class represents a single chunk of a Hunspell dictionary file.
Attributes:
filepath (str): the path to the chunk
        compounds (bool): whether this file contains compounds; if True, this chunk will *not* be tokenised
"""
def __init__(self, filepath: str, compounds: bool = False):
self.filepath = filepath
self.compounds = compounds

def __str__(self) -> str:
basename = path.basename(self.filepath)
if self.compounds:
return path.join('compounds', basename)
return basename

def rm(self) -> None:
"""Remove the chunk file."""
LOGGER.debug(f"Removing {self} ...")
        remove(self.filepath)  # os.remove: each chunk is a single file, so shutil.rmtree would fail here

@classmethod
    def from_hunspell_dic(cls, dic_path: str, chunk_size: int, target_dir: str, sample_size: int) -> List['DicChunk']:
"""Splits a dictionary file into smaller files (chunks) of a given number of lines.
Args:
dic_path (str): the path to the Hunspell .dic file
chunk_size (int): the number of lines per chunk
target_dir (str): the directory where the chunks will be saved
sample_size (int): the number of lines to read from the dictionary file; if 0 or negative, read all lines
Returns:
A list of DicChunk objects, each representing a chunk of the dictionary file
"""
        compounds = 'compounds' in dic_path
with open(dic_path, 'r', encoding=LATIN_1_ENCODING) as dic_file:
            lines = dic_file.readlines()[1:]  # Skip the first line, which holds the Hunspell entry count
lines = [line for line in lines if not line.startswith("#")] # Filter out comment lines
if sample_size > 0:
lines = lines[0:sample_size]
total_lines = len(lines)
str_chunks: List[List[str]] = [lines[i:i + chunk_size] for i in range(0, total_lines, chunk_size)]
        chunks: List['DicChunk'] = []
for index, chunk in enumerate(str_chunks):
if compounds:
tmp_dir = path.join(target_dir, 'compounds')
else:
tmp_dir = target_dir
filename = path.basename(dic_path).replace('.dic', f'_chunk{index}.dic')
chunk_path = path.join(tmp_dir, filename)
with open(chunk_path, 'w', encoding=LATIN_1_ENCODING) as chunk_file:
# Prepend the count of lines in this chunk and then write all lines
chunk_file.write(f"{len(chunk)}\n")
chunk_file.writelines(chunk)
chunks.append(cls(chunk_path, compounds))
return chunks

def unmunch(self, aff_path: str, delete_tmp: bool = False) -> NamedTemporaryFile:
"""Create all forms from Hunspell dictionaries.
Args:
aff_path: the path to the .aff file
delete_tmp: whether to delete the temporary file after use
Returns:
the temp file containing the unmunched dictionary
"""
unmunched_tmp = NamedTemporaryFile(delete=delete_tmp, mode='wb')
LOGGER.debug(f"Unmunching {self} into {unmunched_tmp.name} ...")
cmd_unmunch = f"unmunch {self.filepath} {aff_path}"
unmunch_result = ShellCommand(cmd_unmunch).run()
unmunched_tmp.write(unmunch_result)
unmunched_tmp.flush()
if delete_tmp:
self.rm()
return unmunched_tmp
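
Taken together, chunking and unmunching are meant to be used along these lines. This is a minimal sketch, not code from the commit: the paths and the chunk size are hypothetical, and it assumes the unmunch tool is available on the PATH.

    from lib.dic_chunk import DicChunk

    # Hypothetical paths; target_dir must already contain a 'compounds'
    # subdirectory if any compound dictionaries are being split.
    chunks = DicChunk.from_hunspell_dic(
        dic_path='hunspell/pt_PT.dic',
        chunk_size=20000,   # lines per chunk
        target_dir='/tmp/chunks',
        sample_size=0,      # 0 or negative: read the whole dictionary
    )
    unmunched = [chunk.unmunch('hunspell/pt_PT.aff', delete_tmp=True) for chunk in chunks]
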
108 changes: 108 additions & 0 deletions lib/languagetool_utils.py
@@ -0,0 +1,108 @@
from tempfile import NamedTemporaryFile
from typing import List

from lib.constants import LATIN_1_ENCODING, LT_VER, LT_JAR_PATH, LT_DIR, RESULT_POS_DICT_FILEPATH
from lib.logger import LOGGER
from lib.shell_command import ShellCommand
from lib.variant import Variant


class LanguageToolUtils:
def __init__(self, variant: Variant, delete_tmp: bool = False):
self.variant = variant
self.delete_tmp = delete_tmp

def tokenise(self, unmunched_file: NamedTemporaryFile) -> NamedTemporaryFile:
"""Tokenise each line of an unmunched file, write it to another temp file and return it.
The written data looks weird, since the output of the LT word tokeniser inserts newlines between tokens.
Original line after unmunch:
"far-se-á"
Lines after tokenisation:
"far"
""
"se"
""
"á"
This may look iffy, but later in the process we will sort and dedupe these files, so don't panic.
Args:
unmunched_file: the NamedTemporaryFile object for the unmunched file we'll be tokenising
Returns:
            a NamedTemporaryFile with the result of tokenisation written to it; note that the deliberate move from
            Latin-1 encoding to UTF-8 (see convert_to_utf8 in lib/utils.py) does not happen at this stage.
"""
tokenised_tmp = NamedTemporaryFile(delete=self.delete_tmp, mode='w')
LOGGER.debug(f"Tokenising {unmunched_file.name} into {tokenised_tmp.name} ...")
tokenise_cmd = (
f"java -cp {LT_JAR_PATH}:"
f"{LT_DIR}/languagetool-dev/target/languagetool-dev-{LT_VER}-jar-with-dependencies.jar "
f"org.languagetool.dev.archive.WordTokenizer {self.variant.lang}"
)
with open(unmunched_file.name, 'r', encoding=LATIN_1_ENCODING) as u:
unmunched_str = u.read()
unmunched_file.close()
tokenisation_result = ShellCommand(tokenise_cmd).run_with_input(unmunched_str)
tokenised_tmp.write(tokenisation_result)
tokenised_tmp.flush()
return tokenised_tmp

def build_spelling_binary(self, tokenised_temps: List[NamedTemporaryFile]) -> None:
"""Merge many unmunched and tokenised files into *one* plaintext file and used that to build a Morfologik
SPELLING dictionary.
The files must be merged and converted into UTF-8 before we can do anything with them. Once we have a single
'master' temp file per variant, we can pass that file as an input parameter to the Java tool that builds
spelling dictionaries.
If the shell command is successful, we will have a new output file saved to the appropriate result directory.
This will be a binary file ready to be released and used by Morfologik.
        Args:
            tokenised_temps: the tokenised temp files to merge
        Returns:
            None
"""
LOGGER.info(f"Building binary for {self.variant}...")
        # Open the merged temp file with UTF-8 encoding
        megatemp = NamedTemporaryFile(delete=self.delete_tmp, mode='w', encoding='utf-8')
lines = set()
for tmp in tokenised_temps:
with open(tmp.name, 'r', encoding='utf-8') as t:
lines.update(t.read().split("\n"))
megatemp.write("\n".join(sorted(lines)))
LOGGER.debug(f"Found {len(lines)} unique unmunched and tokenised forms for {self.variant}.")
cmd_build = (
f"java -cp {LT_JAR_PATH} "
f"org.languagetool.tools.SpellDictionaryBuilder "
f"-i {megatemp.name} "
f"-info {self.variant.info('source')} "
f"-freq {self.variant.freq()} "
f"-o {self.variant.dict()}"
)
ShellCommand(cmd_build).run()
LOGGER.info(f"Done compiling {self.variant} dictionary!")
self.variant.copy_spell_info()
megatemp.close()

def build_pos_binary(self) -> None:
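        """Build the Morfologik part-of-speech (POS) binary dictionary for this variant."""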
cmd_build = (
f"java -cp {LT_JAR_PATH} "
f"org.languagetool.tools.POSDictionaryBuilder "
f"-i {RESULT_POS_DICT_FILEPATH} "
f"-info {self.variant.pos_info_java_input_path()} "
f"-o {self.variant.pos_dict_java_output_path()}"
)
ShellCommand(cmd_build).run()
self.variant.copy_pos_info()

def build_synth_binary(self) -> None:
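        """Build the Morfologik synthesiser binary dictionary for this variant."""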
cmd_build = (
f"java -cp {LT_JAR_PATH} "
f"org.languagetool.tools.SynthDictionaryBuilder "
f"-i {RESULT_POS_DICT_FILEPATH} "
f"-info {self.variant.synth_info_java_input_path()} "
f"-o {self.variant.synth_dict_java_output_path()}"
)
ShellCommand(cmd_build).run()
self.variant.copy_synth_info()
self.variant.rename_synth_tag_files()
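
For context, here is roughly how the class above composes with DicChunk and convert_to_utf8. A hedged sketch rather than the repo's actual driver script: the Variant constructor argument and the exact point at which convert_to_utf8 is applied are assumptions.

    from lib.dic_chunk import DicChunk
    from lib.languagetool_utils import LanguageToolUtils
    from lib.utils import convert_to_utf8
    from lib.variant import Variant

    variant = Variant('pt-PT')  # assumed constructor signature
    lt = LanguageToolUtils(variant, delete_tmp=True)

    chunks = DicChunk.from_hunspell_dic('hunspell/pt_PT.dic', 20000, '/tmp/chunks', 0)
    tokenised = []
    for chunk in chunks:
        unmunched = chunk.unmunch('hunspell/pt_PT.aff', delete_tmp=True)
        # Tokenise, then convert the Latin-1 temp to UTF-8 before merging.
        tokenised.append(convert_to_utf8(lt.tokenise(unmunched)))
    lt.build_spelling_binary(tokenised)
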
15 changes: 14 additions & 1 deletion lib/utils.py
@@ -1,7 +1,10 @@
import codecs
import shutil
from os import chdir, path
from tempfile import NamedTemporaryFile
from typing import Optional

-from lib.constants import REPO_DIR, LT_DIR, JAVA_RESULTS_DIR
+from lib.constants import REPO_DIR, LT_DIR, JAVA_RESULTS_DIR, LATIN_1_ENCODING
from lib.shell_command import ShellCommand
from lib.logger import LOGGER

@@ -26,3 +29,13 @@ def install_dictionaries(custom_version: Optional[str]):
LOGGER.info(f"Installing environment-defined version \"{env['PT_DICT_VERSION']}\"")
ShellCommand("mvn clean install", env=env).run()
chdir(REPO_DIR) # Go back to the repo directory


def convert_to_utf8(tmp_file: NamedTemporaryFile, delete_tmp: bool = False) -> NamedTemporaryFile:
"""Takes a Latin-1-encoded temp and returns another temp with the same contents but in UTF-8."""
utf8_tmp = NamedTemporaryFile(mode='w+', encoding='utf-8', delete=delete_tmp)
LOGGER.debug(f"Converting {tmp_file.name} into UTF-8, into {utf8_tmp.name} ...")
with codecs.open(tmp_file.name, 'r', encoding=LATIN_1_ENCODING) as file:
shutil.copyfileobj(file, utf8_tmp)
utf8_tmp.seek(0)
return utf8_tmp
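
A quick usage sketch for the new helper; the sample word and the file handling are illustrative, and it assumes LATIN_1_ENCODING names Python's Latin-1 codec.

    from tempfile import NamedTemporaryFile
    from lib.utils import convert_to_utf8

    latin1_tmp = NamedTemporaryFile(mode='w', encoding='latin-1', delete=False)
    latin1_tmp.write("coração\n")
    latin1_tmp.flush()

    utf8_tmp = convert_to_utf8(latin1_tmp)   # same contents, re-encoded
    print(utf8_tmp.read())                   # the helper seeks back to the start before returning
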