Skip to content

Commit

Permalink
refactor(excel2json-lists): change find missing translations (#1112)
Browse files Browse the repository at this point in the history
  • Loading branch information
Nora-Olivia-Ammann authored Aug 15, 2024
1 parent 4670085 commit ab54d96
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 78 deletions.
53 changes: 23 additions & 30 deletions src/dsp_tools/commands/excel2json/new_lists/compliance_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,10 @@ def make_all_excel_compliance_checks(sheet_list: list[ExcelSheet]) -> None:
# These functions must be called in this order,
# because some of the later checks only work if the earlier ones have passed.
_check_duplicates_all_excels(sheet_list)
_check_for_unique_list_names(sheet_list)
_make_shape_compliance_all_excels(sheet_list)
_make_all_content_compliance_checks_all_excels(sheet_list)
_check_for_missing_translations_all_excels(sheet_list)
_check_for_unique_list_names(sheet_list)
_check_for_erroneous_entries_all_excels(sheet_list)


def _check_duplicates_all_excels(sheet_list: list[ExcelSheet]) -> None:
Expand Down Expand Up @@ -215,12 +216,6 @@ def make_col_names(lang: str) -> set[str]:
return {}


def _make_all_content_compliance_checks_all_excels(sheet_list: list[ExcelSheet]) -> None:
    """
    Run the content checks on all the sheets.

    The missing-translation check runs first, followed by the erroneous-entries check.

    Args:
        sheet_list: the sheets to check
    """
    content_checks = (
        _check_for_missing_translations_all_excels,
        _check_for_erroneous_entries_all_excels,
    )
    # The tuple order is the execution order.
    for run_check in content_checks:
        run_check(sheet_list)


def _check_for_missing_translations_all_excels(sheet_list: list[ExcelSheet]) -> None:
problems: list[SheetProblem] = [
p for sheet in sheet_list if (p := _check_for_missing_translations_one_sheet(sheet)) is not None
Expand All @@ -236,23 +231,31 @@ def _check_for_missing_translations_one_sheet(sheet: ExcelSheet) -> MissingTrans
languages = get_all_languages_for_columns(sheet.df.columns)
all_cols = _compose_all_combinatoric_column_titles(col_endings, languages)
problems = []
for column_group in all_cols.node_cols:
problems.extend(_check_for_missing_translations_one_column_level(column_group.columns, sheet.df))
problems.extend(_check_for_missing_translations_one_column_level(all_cols.list_cols.columns, sheet.df))
for i, row in sheet.df.iterrows():
if problem := _check_missing_translations_one_row(int(str(i)), row, all_cols):
problems.append(problem)
if problems:
return MissingTranslationsSheetProblem(sheet.excel_name, sheet.sheet_name, problems)
return None


def _check_for_missing_translations_one_column_level(
    columns: list[str], df: pd.DataFrame
) -> list[MissingNodeTranslationProblem]:
    """
    Collect the missing-translation problems of one column level over all rows of a sheet.

    A column level is the hierarchical level of the nodes,
    e.g. ["en_1", "de_1", "fr_1", "it_1", "rm_1"].

    Args:
        columns: the column names that make up the level
        df: the dataframe whose rows are checked one by one

    Returns:
        One problem per row for which the per-node check reported something; empty if none did.
    """
    return [
        found
        for idx, series in df.iterrows()
        # The row label is converted via str() to int before being handed to the per-node check.
        if (found := _check_for_missing_translations_one_node(series, columns, int(str(idx))))
    ]
def _check_missing_translations_one_row(
    row_index: int, row: pd.Series[Any], columns: Columns
) -> MissingNodeTranslationProblem | None:
    """
    Check one row for missing translations in every column group.

    Args:
        row_index: index of the row, stored in the returned problem
        row: the row to check
        columns: the node column groups and the list column group

    Returns:
        A problem listing all the empty columns reported for this row,
        or None if no column group reported any.
    """
    # Node groups are inspected first and the list columns last; this fixes the order of the result.
    groups = [node_group.columns for node_group in columns.node_cols]
    groups.append(columns.list_cols.columns)

    empty_cols: list[str] = []
    for group in groups:
        empty_cols += _check_for_missing_translations_one_column_group(row, group)

    if not empty_cols:
        return None
    return MissingNodeTranslationProblem(empty_columns=empty_cols, index_num=row_index)


def _check_for_missing_translations_one_column_group(row: pd.Series[Any], columns: list[str]) -> list[str]:
    """
    Find the columns of one group that are empty while other columns of the same group are filled.

    Args:
        row: the row to check
        columns: the column names belonging to the group

    Returns:
        The names of the empty columns if the group is only partially filled;
        an empty list if the group is completely filled or completely empty.
    """
    flags = row[columns].isna()
    # Only a partially filled group is a problem: all-empty and all-filled are both fine.
    if flags.all() or not flags.any():
        return []
    return [str(name) for name, empty in flags.items() if empty]


def _compose_all_combinatoric_column_titles(nums: list[str], languages: set[str]) -> Columns:
Expand All @@ -263,16 +266,6 @@ def _compose_all_combinatoric_column_titles(nums: list[str], languages: set[str]
return Columns(list_cols=list_columns, node_cols=node_cols)


def _check_for_missing_translations_one_node(
    row: pd.Series[Any], columns: list[str], row_index: int
) -> MissingNodeTranslationProblem | None:
    """
    Check whether one node (one row within one column level) is only partially translated.

    Args:
        row: the row to check
        columns: the column names of the level
        row_index: index of the row, stored in the returned problem

    Returns:
        A problem naming the empty columns if some, but not all, of the columns are empty;
        otherwise None.
    """
    is_empty = row[columns].isna()
    empty_cols = [str(col) for col, flag in is_empty.items() if flag]
    # Nothing empty, or everything empty: neither case is reported.
    if not empty_cols or len(empty_cols) == len(is_empty):
        return None
    return MissingNodeTranslationProblem(empty_cols, row_index)


def _check_for_erroneous_entries_all_excels(sheet_list: list[ExcelSheet]) -> None:
problems: list[SheetProblem] = [
p for sheet in sheet_list if (p := _check_for_erroneous_entries_one_list(sheet)) is not None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ class MissingNodeTranslationProblem:
index_num: int

def execute_error_protocol(self) -> str:
return f"Row Number: {self.index_num + 2} | Column(s): {', '.join(self.empty_columns)}"
return f"Row Number: {self.index_num + 2} | Column(s): {', '.join(sorted(self.empty_columns))}"


@dataclass(frozen=True)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import pandas as pd
import pytest

from dsp_tools.commands.excel2json.new_lists.models.deserialise import ColumnNodes
from dsp_tools.commands.excel2json.new_lists.models.deserialise import Columns
from dsp_tools.commands.excel2json.new_lists.models.deserialise import ColumnsList
from dsp_tools.commands.excel2json.new_lists.models.deserialise import ExcelSheet


Expand Down Expand Up @@ -34,6 +37,14 @@ def f2_s2_good_en_de() -> ExcelSheet:
return ExcelSheet(excel_name="file2", sheet_name="sheet2", df=df)


@pytest.fixture()
def cols_en_de_1_3() -> Columns:
    """Columns with the "en"/"de" list columns and the "en"/"de" node columns for levels 1 to 3."""
    node_levels = [
        ColumnNodes(level_num=1, columns=["en_1", "de_1"]),
        ColumnNodes(level_num=2, columns=["en_2", "de_2"]),
        ColumnNodes(level_num=3, columns=["en_3", "de_3"]),
    ]
    list_level = ColumnsList(["en_list", "de_list"])
    return Columns(list_cols=list_level, node_cols=node_levels)


@pytest.fixture()
def f1_s1_good_id_filled() -> ExcelSheet:
df = pd.DataFrame(
Expand Down Expand Up @@ -217,11 +228,11 @@ def f2_s2_missing_translations() -> ExcelSheet:
df = pd.DataFrame(
{
"id (optional)": [1, 2, 3, 4, 5, 6, 7],
"en_list": ["list3", "list3", "list3", "list3", "list3", "list3", "list3"],
"en_list": ["list3", "list3", "list3", "list3", pd.NA, "list3", "list3"],
"de_list": [pd.NA, "list3", "list3", "list3", "list3", "list3", "list3"],
"en_1": [pd.NA, pd.NA, "node1", "node1", "node1", "node2", "node3"],
"de_1": [pd.NA, "node1", "node1", "node1", "node1", "node2", pd.NA],
"en_2": [pd.NA, pd.NA, "node1.1", "node1.1", "node1.2", pd.NA, pd.NA],
"en_1": [pd.NA, pd.NA, "node1", "node1", pd.NA, "node1", "node3"],
"de_1": [pd.NA, "node1", "node1", "node1", "node1", "node2", "node3"],
"en_2": [pd.NA, pd.NA, "node1.1", "node1.1", pd.NA, pd.NA, pd.NA],
"de_2": [pd.NA, pd.NA, "node1.1", "node1.1", "node1.2", pd.NA, pd.NA],
"en_3": [pd.NA, pd.NA, pd.NA, "node1.1.1", pd.NA, pd.NA, pd.NA],
"de_3": [pd.NA, pd.NA, pd.NA, "node1.1.1", pd.NA, pd.NA, pd.NA],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,20 +14,21 @@
from dsp_tools.commands.excel2json.new_lists.compliance_checks import _check_for_erroneous_entries_one_list
from dsp_tools.commands.excel2json.new_lists.compliance_checks import _check_for_erroneous_node_info_one_df
from dsp_tools.commands.excel2json.new_lists.compliance_checks import _check_for_missing_translations_all_excels
from dsp_tools.commands.excel2json.new_lists.compliance_checks import _check_for_missing_translations_one_column_level
from dsp_tools.commands.excel2json.new_lists.compliance_checks import _check_for_missing_translations_one_node
from dsp_tools.commands.excel2json.new_lists.compliance_checks import _check_for_missing_translations_one_column_group
from dsp_tools.commands.excel2json.new_lists.compliance_checks import _check_for_missing_translations_one_sheet
from dsp_tools.commands.excel2json.new_lists.compliance_checks import _check_for_unique_list_names
from dsp_tools.commands.excel2json.new_lists.compliance_checks import (
_check_if_all_translations_in_all_column_levels_present_one_sheet,
)
from dsp_tools.commands.excel2json.new_lists.compliance_checks import _check_if_minimum_number_of_cols_present_one_sheet
from dsp_tools.commands.excel2json.new_lists.compliance_checks import _check_minimum_rows
from dsp_tools.commands.excel2json.new_lists.compliance_checks import _check_missing_translations_one_row
from dsp_tools.commands.excel2json.new_lists.compliance_checks import _check_warn_unusual_columns_one_sheet
from dsp_tools.commands.excel2json.new_lists.compliance_checks import _compose_all_combinatoric_column_titles
from dsp_tools.commands.excel2json.new_lists.compliance_checks import _make_shape_compliance_all_excels
from dsp_tools.commands.excel2json.new_lists.compliance_checks import _make_shape_compliance_one_sheet
from dsp_tools.commands.excel2json.new_lists.compliance_checks import make_all_excel_compliance_checks
from dsp_tools.commands.excel2json.new_lists.models.deserialise import Columns
from dsp_tools.commands.excel2json.new_lists.models.deserialise import ExcelSheet
from dsp_tools.commands.excel2json.new_lists.models.input_error import DuplicatesCustomIDInProblem
from dsp_tools.commands.excel2json.new_lists.models.input_error import DuplicatesInSheetProblem
Expand Down Expand Up @@ -89,7 +90,7 @@ def test_content_compliance(self, f1_s1_good_en: ExcelSheet, f2_s2_missing_trans
"For the following nodes, the translations are missing:\n"
" - Row Number: 2 | Column(s): de_list\n"
" - Row Number: 3 | Column(s): en_1\n"
" - Row Number: 8 | Column(s): de_1"
" - Row Number: 6 | Column(s): en_1, en_2, en_list"
)
with pytest.raises(InputError, match=expected):
make_all_excel_compliance_checks(all_sheets)
Expand Down Expand Up @@ -380,15 +381,15 @@ def test_problem(self, f1_s1_good_en: ExcelSheet, f2_s2_missing_translations: Ex
"For the following nodes, the translations are missing:\n"
" - Row Number: 2 | Column(s): de_list\n"
" - Row Number: 3 | Column(s): en_1\n"
" - Row Number: 8 | Column(s): de_1"
" - Row Number: 6 | Column(s): en_1, en_2, en_list"
)
with pytest.raises(InputError, match=expected):
_check_for_missing_translations_all_excels(all_sheets)


class TestAllNodesTranslatedIntoAllLanguages:
def test_good(self, f1_s1_good_id_filled: ExcelSheet) -> None:
_check_for_missing_translations_one_sheet(f1_s1_good_id_filled)
assert not _check_for_missing_translations_one_sheet(f1_s1_good_id_filled)

def test_missing_translation(self, f1_s1_missing_translation_id_filled: ExcelSheet) -> None:
expected = [
Expand All @@ -406,47 +407,31 @@ def test_missing_translation(self, f1_s1_missing_translation_id_filled: ExcelShe
assert res.index_num == expct.index_num


class TestCheckOneHierarchy:
def test_good(self) -> None:
test_df = pd.DataFrame(
{
"en_1": ["exist1_en", "exist2_en", "exist3_en"],
"de_1": ["exist1_de", "exist2_de", "exist3_de"],
"fr_1": ["exist1_fr", "exist2_fr", "exist3_fr"],
}
)
assert not _check_for_missing_translations_one_column_level(["en_1", "de_1", "fr_1"], test_df)
class TestCheckMissingTranslationsOneRow:
def test_good(self, f2_s2_good_en_de: ExcelSheet, cols_en_de_1_3: Columns) -> None:
for i, row in f2_s2_good_en_de.df.iterrows():
assert not _check_missing_translations_one_row(int(str(i)), row, cols_en_de_1_3)

def test_missing_translation(self) -> None:
test_df = pd.DataFrame(
{
"en_1": ["exist1_en", pd.NA, "exist3_en"],
"de_1": ["exist1_de", pd.NA, "exist3_de"],
"fr_1": ["exist1_fr", "exist2_fr", "exist3_fr"],
}
)
res = _check_for_missing_translations_one_column_level(["en_1", "de_1", "fr_1"], test_df)
assert len(res) == 1
prbl = res[0]
assert prbl.empty_columns == ["en_1", "de_1"]
assert prbl.index_num == 1
def test_one_missing(self, f2_s2_missing_translations: ExcelSheet, cols_en_de_1_3: Columns) -> None:
result = _check_missing_translations_one_row(1, f2_s2_missing_translations.df.loc[1], cols_en_de_1_3) # type: ignore[arg-type]
assert isinstance(result, MissingNodeTranslationProblem)
assert set(result.empty_columns) == {"en_1"}
assert result.index_num == 1

def test_three_missing(self, f2_s2_missing_translations: ExcelSheet, cols_en_de_1_3: Columns) -> None:
result = _check_missing_translations_one_row(4, f2_s2_missing_translations.df.loc[4], cols_en_de_1_3) # type: ignore[arg-type]
assert isinstance(result, MissingNodeTranslationProblem)
assert set(result.empty_columns) == {"en_list", "en_1", "en_2"}
assert result.index_num == 4

class TestCheckOneNodeForTranslation:
def test_good(self) -> None:
test_series = pd.Series(["exist_en", "exist_de"], index=["en_1", "de_1"])
assert not _check_for_missing_translations_one_node(test_series, ["en_1", "de_1"], 1)

def test_good_empty(self) -> None:
test_series = pd.Series([pd.NA, pd.NA], index=["en_1", "de_1"])
assert not _check_for_missing_translations_one_node(test_series, ["en_1", "de_1"], 1)

def test_missing_translation(self) -> None:
test_series = pd.Series(["exist_en", pd.NA], index=["en_1", "de_1"])
result = _check_for_missing_translations_one_node(test_series, ["en_1", "de_1"], 1)
result = cast(MissingNodeTranslationProblem, result)
assert result.empty_columns == ["de_1"]
assert result.index_num == 1
def test_one_group_good(self, f2_s2_good_en_de: ExcelSheet) -> None:
for i, row in f2_s2_good_en_de.df.iterrows():
assert not _check_for_missing_translations_one_column_group(row, ["en_list", "de_list"])

def test_one_group_missing(self) -> None:
series = pd.Series(data=[pd.NA, "content"], index=["en_1", "de_1"])
result = _check_for_missing_translations_one_column_group(series, ["en_1", "de_1"])
assert result == ["en_1"]


def test_make_columns() -> None:
Expand Down

0 comments on commit ab54d96

Please sign in to comment.