Extract more methods from the scripts
p-goulart committed Jan 15, 2024
1 parent 50021c8 commit 117b47b
Showing 5 changed files with 225 additions and 191 deletions.
88 changes: 88 additions & 0 deletions lib/dic_chunk.py
@@ -0,0 +1,88 @@
from os import path, remove
from tempfile import NamedTemporaryFile
from typing import List

from lib.constants import LATIN_1_ENCODING
from lib.logger import LOGGER
from lib.shell_command import ShellCommand


class DicChunk:
"""This class represents a single chunk of a Hunspell dictionary file.
Attributes:
filepath (str): the path to the chunk
        compounds (bool): whether this file contains compounds; if True, this chunk will *not* be tokenised
"""
def __init__(self, filepath: str, compounds: bool = False):
self.filepath = filepath
self.compounds = compounds

def __str__(self) -> str:
basename = path.basename(self.filepath)
if self.compounds:
return path.join('compounds', basename)
return basename

def rm(self) -> None:
"""Remove the chunk file."""
LOGGER.debug(f"Removing {self} ...")
        remove(self.filepath)  # os.remove: each chunk is a single file, so shutil.rmtree would fail here

@classmethod
    def from_hunspell_dic(cls, dic_path: str, chunk_size: int, target_dir: str, sample_size: int) -> List['DicChunk']:
"""Splits a dictionary file into smaller files (chunks) of a given number of lines.
Args:
dic_path (str): the path to the Hunspell .dic file
chunk_size (int): the number of lines per chunk
target_dir (str): the directory where the chunks will be saved
sample_size (int): the number of lines to read from the dictionary file; if 0 or negative, read all lines
Returns:
A list of DicChunk objects, each representing a chunk of the dictionary file
"""
        compounds = 'compounds' in dic_path
with open(dic_path, 'r', encoding=LATIN_1_ENCODING) as dic_file:
            lines = dic_file.readlines()[1:]  # Skip the first line, which holds the Hunspell entry count
lines = [line for line in lines if not line.startswith("#")] # Filter out comment lines
if sample_size > 0:
lines = lines[0:sample_size]
total_lines = len(lines)
str_chunks: List[List[str]] = [lines[i:i + chunk_size] for i in range(0, total_lines, chunk_size)]
        chunks: List['DicChunk'] = []
for index, chunk in enumerate(str_chunks):
if compounds:
tmp_dir = path.join(target_dir, 'compounds')
else:
tmp_dir = target_dir
filename = path.basename(dic_path).replace('.dic', f'_chunk{index}.dic')
chunk_path = path.join(tmp_dir, filename)
with open(chunk_path, 'w', encoding=LATIN_1_ENCODING) as chunk_file:
# Prepend the count of lines in this chunk and then write all lines
chunk_file.write(f"{len(chunk)}\n")
chunk_file.writelines(chunk)
chunks.append(cls(chunk_path, compounds))
return chunks

def unmunch(self, aff_path: str, delete_tmp: bool = False) -> NamedTemporaryFile:
"""Create all forms from Hunspell dictionaries.
Args:
aff_path: the path to the .aff file
delete_tmp: whether to delete the temporary file after use
Returns:
the temp file containing the unmunched dictionary
"""
unmunched_tmp = NamedTemporaryFile(delete=delete_tmp, mode='wb')
LOGGER.debug(f"Unmunching {self} into {unmunched_tmp.name} ...")
cmd_unmunch = f"unmunch {self.filepath} {aff_path}"
unmunch_result = ShellCommand(cmd_unmunch).run()
unmunched_tmp.write(unmunch_result)
unmunched_tmp.flush()
if delete_tmp:
self.rm()
return unmunched_tmp
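
Taken together, chunking and unmunching are meant to be used along these lines. This is a minimal sketch, not code from the commit: the paths and the chunk size are hypothetical, and it assumes the unmunch tool is available on the PATH.

    from lib.dic_chunk import DicChunk

    # Hypothetical paths; target_dir must already contain a 'compounds'
    # subdirectory if any compound dictionaries are being split.
    chunks = DicChunk.from_hunspell_dic(
        dic_path='hunspell/pt_PT.dic',
        chunk_size=20000,   # lines per chunk
        target_dir='/tmp/chunks',
        sample_size=0,      # 0 or negative: read the whole dictionary
    )
    unmunched = [chunk.unmunch('hunspell/pt_PT.aff', delete_tmp=True) for chunk in chunks]
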
108 changes: 108 additions & 0 deletions lib/languagetool_utils.py
@@ -0,0 +1,108 @@
from tempfile import NamedTemporaryFile
from typing import List

from lib.constants import LATIN_1_ENCODING, LT_VER, LT_JAR_PATH, LT_DIR, RESULT_POS_DICT_FILEPATH
from lib.logger import LOGGER
from lib.shell_command import ShellCommand
from lib.variant import Variant


class LanguageToolUtils:
def __init__(self, variant: Variant, delete_tmp: bool = False):
self.variant = variant
self.delete_tmp = delete_tmp

def tokenise(self, unmunched_file: NamedTemporaryFile) -> NamedTemporaryFile:
"""Tokenise each line of an unmunched file, write it to another temp file and return it.
The written data looks weird, since the output of the LT word tokeniser inserts newlines between tokens.
Original line after unmunch:
"far-se-á"
Lines after tokenisation:
"far"
""
"se"
""
"á"
This may look iffy, but later in the process we will sort and dedupe these files, so don't panic.
Args:
unmunched_file: the NamedTemporaryFile object for the unmunched file we'll be tokenising
Returns:
            a NamedTemporaryFile with the result of tokenisation written to it; note that the deliberate move from
            Latin-1 encoding to UTF-8 (see convert_to_utf8 in lib/utils.py) does not happen at this stage.
"""
tokenised_tmp = NamedTemporaryFile(delete=self.delete_tmp, mode='w')
LOGGER.debug(f"Tokenising {unmunched_file.name} into {tokenised_tmp.name} ...")
tokenise_cmd = (
f"java -cp {LT_JAR_PATH}:"
f"{LT_DIR}/languagetool-dev/target/languagetool-dev-{LT_VER}-jar-with-dependencies.jar "
f"org.languagetool.dev.archive.WordTokenizer {self.variant.lang}"
)
with open(unmunched_file.name, 'r', encoding=LATIN_1_ENCODING) as u:
unmunched_str = u.read()
unmunched_file.close()
tokenisation_result = ShellCommand(tokenise_cmd).run_with_input(unmunched_str)
tokenised_tmp.write(tokenisation_result)
tokenised_tmp.flush()
return tokenised_tmp

def build_spelling_binary(self, tokenised_temps: List[NamedTemporaryFile]) -> None:
"""Merge many unmunched and tokenised files into *one* plaintext file and used that to build a Morfologik
SPELLING dictionary.
The files must be merged and converted into UTF-8 before we can do anything with them. Once we have a single
'master' temp file per variant, we can pass that file as an input parameter to the Java tool that builds
spelling dictionaries.
If the shell command is successful, we will have a new output file saved to the appropriate result directory.
This will be a binary file ready to be released and used by Morfologik.
        Args:
            tokenised_temps: the tokenised temp files to merge
        Returns:
            None
"""
LOGGER.info(f"Building binary for {self.variant}...")
        # Open the merged temp file with UTF-8 encoding
        megatemp = NamedTemporaryFile(delete=self.delete_tmp, mode='w', encoding='utf-8')
lines = set()
for tmp in tokenised_temps:
with open(tmp.name, 'r', encoding='utf-8') as t:
lines.update(t.read().split("\n"))
megatemp.write("\n".join(sorted(lines)))
LOGGER.debug(f"Found {len(lines)} unique unmunched and tokenised forms for {self.variant}.")
cmd_build = (
f"java -cp {LT_JAR_PATH} "
f"org.languagetool.tools.SpellDictionaryBuilder "
f"-i {megatemp.name} "
f"-info {self.variant.info('source')} "
f"-freq {self.variant.freq()} "
f"-o {self.variant.dict()}"
)
ShellCommand(cmd_build).run()
LOGGER.info(f"Done compiling {self.variant} dictionary!")
self.variant.copy_spell_info()
megatemp.close()

def build_pos_binary(self) -> None:
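        """Build the Morfologik part-of-speech (POS) binary dictionary for this variant."""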
cmd_build = (
f"java -cp {LT_JAR_PATH} "
f"org.languagetool.tools.POSDictionaryBuilder "
f"-i {RESULT_POS_DICT_FILEPATH} "
f"-info {self.variant.pos_info_java_input_path()} "
f"-o {self.variant.pos_dict_java_output_path()}"
)
ShellCommand(cmd_build).run()
self.variant.copy_pos_info()

def build_synth_binary(self) -> None:
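        """Build the Morfologik synthesiser binary dictionary for this variant."""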
cmd_build = (
f"java -cp {LT_JAR_PATH} "
f"org.languagetool.tools.SynthDictionaryBuilder "
f"-i {RESULT_POS_DICT_FILEPATH} "
f"-info {self.variant.synth_info_java_input_path()} "
f"-o {self.variant.synth_dict_java_output_path()}"
)
ShellCommand(cmd_build).run()
self.variant.copy_synth_info()
self.variant.rename_synth_tag_files()
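
For context, here is roughly how the class above composes with DicChunk and convert_to_utf8. A hedged sketch rather than the repo's actual driver script: the Variant constructor argument and the exact point at which convert_to_utf8 is applied are assumptions.

    from lib.dic_chunk import DicChunk
    from lib.languagetool_utils import LanguageToolUtils
    from lib.utils import convert_to_utf8
    from lib.variant import Variant

    variant = Variant('pt-PT')  # assumed constructor signature
    lt = LanguageToolUtils(variant, delete_tmp=True)

    chunks = DicChunk.from_hunspell_dic('hunspell/pt_PT.dic', 20000, '/tmp/chunks', 0)
    tokenised = []
    for chunk in chunks:
        unmunched = chunk.unmunch('hunspell/pt_PT.aff', delete_tmp=True)
        # Tokenise, then convert the Latin-1 temp to UTF-8 before merging.
        tokenised.append(convert_to_utf8(lt.tokenise(unmunched)))
    lt.build_spelling_binary(tokenised)
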
15 changes: 14 additions & 1 deletion lib/utils.py
@@ -1,7 +1,10 @@
import codecs
import shutil
from os import chdir, path
from tempfile import NamedTemporaryFile
from typing import Optional

-from lib.constants import REPO_DIR, LT_DIR, JAVA_RESULTS_DIR
+from lib.constants import REPO_DIR, LT_DIR, JAVA_RESULTS_DIR, LATIN_1_ENCODING
from lib.shell_command import ShellCommand
from lib.logger import LOGGER

@@ -26,3 +29,13 @@ def install_dictionaries(custom_version: Optional[str]):
LOGGER.info(f"Installing environment-defined version \"{env['PT_DICT_VERSION']}\"")
ShellCommand("mvn clean install", env=env).run()
chdir(REPO_DIR) # Go back to the repo directory


def convert_to_utf8(tmp_file: NamedTemporaryFile, delete_tmp: bool = False) -> NamedTemporaryFile:
"""Takes a Latin-1-encoded temp and returns another temp with the same contents but in UTF-8."""
utf8_tmp = NamedTemporaryFile(mode='w+', encoding='utf-8', delete=delete_tmp)
LOGGER.debug(f"Converting {tmp_file.name} into UTF-8, into {utf8_tmp.name} ...")
with codecs.open(tmp_file.name, 'r', encoding=LATIN_1_ENCODING) as file:
shutil.copyfileobj(file, utf8_tmp)
utf8_tmp.seek(0)
return utf8_tmp
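
A quick usage sketch for the new helper; the sample word and the file handling are illustrative, and it assumes LATIN_1_ENCODING names Python's Latin-1 codec.

    from tempfile import NamedTemporaryFile
    from lib.utils import convert_to_utf8

    latin1_tmp = NamedTemporaryFile(mode='w', encoding='latin-1', delete=False)
    latin1_tmp.write("coração\n")
    latin1_tmp.flush()

    utf8_tmp = convert_to_utf8(latin1_tmp)   # same contents, re-encoded
    print(utf8_tmp.read())                   # the helper seeks back to the start before returning
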