Skip to content

Commit

Permalink
Preprocess - Use ISO language codes for Udpipe
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Feb 5, 2024
1 parent e733e96 commit 3a6f9b7
Show file tree
Hide file tree
Showing 2 changed files with 206 additions and 39 deletions.
69 changes: 36 additions & 33 deletions orangecontrib/text/widgets/owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from orangecontrib.text.language import ISO2LANG, LANG2ISO
from orangecontrib.text.misc import nltk_data_dir
from orangecontrib.text.preprocess import *
from orangecontrib.text.preprocess.normalize import UDPipeStopIteration
from orangecontrib.text.preprocess.normalize import UDPipeStopIteration, UDPipeModels
from orangecontrib.text.tag import AveragedPerceptronTagger, MaxEntTagger, \
POSTagger

Expand Down Expand Up @@ -87,19 +87,20 @@ def __init__(
Box's initial value (as an ISO code).
"""
super().__init__(parent)
self.setMinimumWidth(80)
self.__add_items(items, include_none)
self.set_current_language(value)
self.currentIndexChanged.connect(self.__index_changed)
self.callback = callback
self.setMinimumWidth(80)
items = [(ISO2LANG[itm], itm) for itm in items]
self.add_items(items, include_none, value)
self.currentIndexChanged.connect(self.index_changed)

def __add_items(self, items: Iterable[str], include_non: bool):
if include_non:
def add_items(self, items: Iterable[Tuple[str, str]], include_none: bool, language: str):
if include_none:
self.addItem(_DEFAULT_NONE, None)
for itm in sorted(items, key=ISO2LANG.get):
self.addItem(ISO2LANG[itm], itm)
for itm in sorted(items):
self.addItem(*itm)
self.set_current_language(language)

def __index_changed(self, index: QModelIndex):
def index_changed(self, index: QModelIndex):
self.callback(self.itemData(index))

def set_current_language(self, iso_language: Optional[str]):
Expand All @@ -115,34 +116,35 @@ def set_current_language(self, iso_language: Optional[str]):
self.setCurrentIndex(index)


class UDPipeComboBox(QComboBox):
def __init__(self, master: BaseEditor, value: str, default: str,
callback: Callable):
super().__init__(master)
self.__items = [] # type: List
class UDPipeComboBox(LanguageComboBox):
def __init__(
self, master: BaseEditor, value: str, default: str, callback: Callable
):
self.__items: List = []
self.__default_lang = default
self.add_items(value)
self.currentTextChanged.connect(callback)
self.setMinimumWidth(80)
super().__init__(master, [], value, False, callback)

@property
def items(self) -> List:
return UDPipeLemmatizer().models.supported_languages
return UDPipeModels().supported_languages

def add_items(self, value: str):
def add_items(self, _, include_none: bool, language: str):
self.__items = self.items
self.addItems(self.__items)
if value in self.__items:
self.setCurrentText(value)
elif self.__default_lang in self.__items:
self.setCurrentText(self.__default_lang)
super().add_items(self.__items, include_none, language)

def set_current_language(self, iso_language: Optional[str]):
iso_items = {iso for _, iso in self.__items}
if iso_language in iso_items:
super().set_current_language(iso_language)
elif self.__default_lang in iso_items:
super().set_current_language(self.__default_lang)
elif self.__items:
self.setCurrentIndex(0)

def showPopup(self):
if self.__items != self.items:
self.clear()
self.add_items(self.currentText())
self.add_items(None, False, self.itemData(self.currentIndex()))
super().showPopup()


Expand Down Expand Up @@ -475,14 +477,13 @@ class NormalizationModule(SingleMethodModule):
UDPipe: UDPipeLemmatizer,
Lemmagen: LemmagenLemmatizer}
DEFAULT_METHOD = Porter
DEFAULT_UDPIPE_LANG = "English" # todo: remove when udpipe use iso
DEFAULT_LANGUAGE = "en"
DEFAULT_USE_TOKE = False

def __init__(self, parent=None, **kwargs):
super().__init__(parent, **kwargs)
self.__snowball_lang = self.DEFAULT_LANGUAGE
self.__udpipe_lang = self.DEFAULT_UDPIPE_LANG
self.__udpipe_lang = self.DEFAULT_LANGUAGE
self.__lemmagen_lang = self.DEFAULT_LANGUAGE
self.__use_tokenizer = self.DEFAULT_USE_TOKE

Expand All @@ -494,7 +495,7 @@ def __init__(self, parent=None, **kwargs):
self.__set_snowball_lang
)
self.__combo_udl = UDPipeComboBox(
self, self.__udpipe_lang, self.DEFAULT_UDPIPE_LANG, self.__set_udpipe_lang
self, self.__udpipe_lang, self.DEFAULT_LANGUAGE, self.__set_udpipe_lang
)
self.__check_use = QCheckBox("UDPipe tokenizer",
checked=self.DEFAULT_USE_TOKE)
Expand Down Expand Up @@ -538,7 +539,7 @@ def setParameters(self, params: Dict):
super().setParameters(params)
snowball_lang = params.get("snowball_language", self.DEFAULT_LANGUAGE)
self.__set_snowball_lang(snowball_lang)
udpipe_lang = params.get("udpipe_language", self.DEFAULT_UDPIPE_LANG)
udpipe_lang = params.get("udpipe_language", self.DEFAULT_LANGUAGE)
self.__set_udpipe_lang(udpipe_lang)
use_tokenizer = params.get("udpipe_tokenizer", self.DEFAULT_USE_TOKE)
self.__set_use_tokenizer(use_tokenizer)
Expand All @@ -560,7 +561,7 @@ def __set_snowball_lang(self, language: str):
def __set_udpipe_lang(self, language: str):
if self.__udpipe_lang != language:
self.__udpipe_lang = language
self.__combo_udl.setCurrentText(language)
self.__combo_udl.set_current_language(language)
self.changed.emit()
if self.method == self.UDPipe:
self.edited.emit()
Expand Down Expand Up @@ -593,13 +594,12 @@ def parameters(self) -> Dict:
def createinstance(params: Dict) -> BaseNormalizer:
method = params.get("method", NormalizationModule.DEFAULT_METHOD)
args = {}
def_udpipe = NormalizationModule.DEFAULT_UDPIPE_LANG
def_lang = NormalizationModule.DEFAULT_LANGUAGE
if method == NormalizationModule.Snowball:
args = {"language": params.get("snowball_language", def_lang)}
elif method == NormalizationModule.UDPipe:
def_use = NormalizationModule.DEFAULT_USE_TOKE
args = {"language": params.get("udpipe_language", def_udpipe),
args = {"language": params.get("udpipe_language", def_lang),
"use_tokenizer": params.get("udpipe_tokenizer", def_use)}
elif method == NormalizationModule.Lemmagen:
args = {"language": params.get("lemmagen_language", def_lang)}
Expand Down Expand Up @@ -1395,6 +1395,9 @@ def str_into_paths(label):
for key in ("lemmagen_language", "snowball_language"):
if key in pp:
pp[key] = LANG2ISO[pp[key]]
up_lang = "udpipe_language"
if up_lang in pp:
pp[up_lang] = UDPipeModels().language_to_iso(pp[up_lang])


if __name__ == "__main__":
Expand Down
176 changes: 170 additions & 6 deletions orangecontrib/text/widgets/tests/test_owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
POSTaggingModule,
LanguageComboBox,
_DEFAULT_NONE,
UDPipeComboBox,
)


Expand Down Expand Up @@ -127,7 +128,7 @@ def test_udpipe_offline(self):
@patch("orangecontrib.text.preprocess.normalize.UDPipeModels.online",
PropertyMock(return_value=False))
@patch("orangecontrib.text.preprocess.normalize.UDPipeModels.model_files",
PropertyMock(return_value=[]))
PropertyMock(return_value={}))
@patch("orangecontrib.text.widgets.owpreprocess.OWPreprocess.start", Mock())
def test_udpipe_no_models(self):
widget = self.create_widget(OWPreprocess)
Expand Down Expand Up @@ -201,12 +202,12 @@ def test_migrate_settings_normalize(self):
settings = {"__version__": 1,
"normalizer": {"enabled": True, "method_index": 2,
"snowball_language": "French",
"udpipe_language": "German",
"udpipe_language": "Portuguese",
"udpipe_tokenizer": True}}
widget = self.create_widget(OWPreprocess, stored_settings=settings)
params = [("preprocess.normalize",
{"method": 2, "snowball_language": "fr",
"udpipe_language": "German", "udpipe_tokenizer": True})]
"udpipe_language": "pt", "udpipe_tokenizer": True})]
self.assertEqual(widget.storedsettings["preprocessors"], params)

def test_migrate_settings_filter(self):
Expand Down Expand Up @@ -358,6 +359,133 @@ def test_migrate_snowball_language_settings(self):
normalize_settings = widget.storedsettings["preprocessors"][0][1]
self.assertEqual("en", normalize_settings["snowball_language"])

def test_migrate_udpipe_language_settings(self):
    """Check that stored UDPipe language names are migrated to ISO codes.

    Each scenario stores a version-3 settings dict whose normalize
    preprocessor carries a language name, builds the widget from it and
    inspects the ``udpipe_language`` value the widget ends up with.
    """
    # (value stored under "udpipe_language", value expected after migration)
    scenarios = (
        ("Slovenian", "sl"),
        ("English (lines)", "en_lines"),
        # a name that cannot be mapped is expected to become None
        ("Abc", None),
    )
    for stored_name, expected in scenarios:
        normalize = ("preprocess.normalize", {"udpipe_language": stored_name})
        stored = {
            "__version__": 3,
            "storedsettings": {"preprocessors": [normalize]},
        }
        widget = self.create_widget(OWPreprocess, stored_settings=stored)
        migrated = widget.storedsettings["preprocessors"][0][1]
        if expected is None:
            self.assertIsNone(migrated["udpipe_language"])
        else:
            self.assertEqual(expected, migrated["udpipe_language"])

@unittest.skip("Very slow test")
def test_migrate_udpipe_language_settings_slow(self):
    """Check the migration to ISO language codes for a long list of names.

    Skipped by default because it is very slow. To run it successfully,
    remove the patch on the TestOWPreprocessMigrateSettings class.
    """
    # old stored language name -> value expected after migration
    expected_codes = {
        "Ancient greek proiel": "grc_proiel",
        "Ancient greek": "grc",
        "Arabic": "ar",
        "Basque": "eu",
        "Belarusian": "be",
        "Bulgarian": "bg",
        "Catalan": "ca",
        "Chinese": "zh",
        "Coptic": "cop",
        "Croatian": "hr",
        "Czech cac": "cs_cac",
        "Czech cltt": "cs_cltt",
        "Czech": "cs",
        "Danish": "da",
        "Dutch lassysmall": "nl_lassysmall",
        "Dutch": "nl",
        "English lines": "en_lines",
        "English partut": "en_partut",
        "English": "en",
        "Estonian": "et",
        "Finnish ftb": "fi_ftb",
        "Finnish": "fi",
        "French partut": "fr_partut",
        "French sequoia": "fr_sequoia",
        "French": "fr",
        "Galician treegal": "gl_treegal",
        "Galician": "gl",
        "German": "de",
        "Gothic": "got",
        "Greek": "el",
        "Hebrew": "he",
        "Hindi": "hi",
        "Hungarian": "hu",
        "Indonesian": "id",
        "Irish": "ga",
        "Italian": "it",
        "Japanese": "ja",
        "Kazakh": "kk",
        "Korean": "ko",
        "Latin ittb": "la_ittb",
        "Latin proiel": "la_proiel",
        "Latin": "la",
        "Latvian": "lv",
        "Lithuanian": "lt",
        "Norwegian bokmaal": "nb",
        "Norwegian nynorsk": "nn",
        "Old church slavonic": "cu",
        "Persian": "fa",
        "Polish": "pl",
        "Portuguese br": "pt_br",
        "Portuguese": "pt",
        "Romanian": "ro",
        "Russian syntagrus": "ru_syntagrus",
        "Russian": "ru",
        "Sanskrit": "sa",
        "Slovak": "sk",
        "Slovenian sst": "sl_sst",
        "Slovenian": "sl",
        "Spanish ancora": "es_ancora",
        "Spanish": "es",
        "Swedish lines": "sv_lines",
        "Swedish": "sv",
        "Tamil": "ta",
        "Turkish": "tr",
        "Ukrainian": "uk",
        "Urdu": "ur",
        "Uyghur": "ug",
        "Vietnamese": "vi",
    }
    for stored_name, iso_code in expected_codes.items():
        normalize = ("preprocess.normalize", {"udpipe_language": stored_name})
        stored = {
            "__version__": 3,
            "storedsettings": {"preprocessors": [normalize]},
        }
        widget = self.create_widget(OWPreprocess, stored_settings=stored)
        migrated = widget.storedsettings["preprocessors"][0][1]
        self.assertEqual(iso_code, migrated["udpipe_language"])


class TestTransformationModule(WidgetTest):
def setUp(self):
Expand Down Expand Up @@ -500,7 +628,7 @@ def test_parameters(self):
params = {
"method": NormalizationModule.Porter,
"snowball_language": "en",
"udpipe_language": "English",
"udpipe_language": "en",
"lemmagen_language": "en",
"udpipe_tokenizer": False,
}
Expand All @@ -510,7 +638,7 @@ def test_set_parameters(self):
params = {
"method": NormalizationModule.UDPipe,
"snowball_language": "nl",
"udpipe_language": "Slovenian",
"udpipe_language": "sl",
"lemmagen_language": "bg",
"udpipe_tokenizer": True,
}
Expand Down Expand Up @@ -549,7 +677,7 @@ def test_repr(self):
@patch("orangecontrib.text.preprocess.normalize.UDPipeModels.online",
PropertyMock(return_value=False))
@patch("orangecontrib.text.preprocess.normalize.UDPipeModels.model_files",
PropertyMock(return_value=[]))
PropertyMock(return_value={}))
def test_udpipe_no_models(self):
editor = NormalizationModule()
button = editor._SingleMethodModule__group.button(editor.UDPipe)
Expand Down Expand Up @@ -837,5 +965,41 @@ def test_change_item(self):
mock.assert_called_once_with(None)


@patch(SF_LIST, new=Mock(return_value=SERVER_FILES))
class TestUDPipeComboBox(WidgetTest):
    """Tests for UDPipeComboBox with the server file listing mocked out."""

    # item texts the combo box is expected to show, in display order
    ITEMS = [
        "English",
        "English (lines)",
        "English (partut)",
        "Lithuanian",
        "Portuguese",
        "Slovenian",
        "Slovenian (sst)",
    ]

    @staticmethod
    def _make_combo(callback):
        """Build a combo box with value "pt", default "en" and *callback*."""
        return UDPipeComboBox(None, "pt", "en", callback)

    @staticmethod
    def _texts(combo):
        """Return the visible text of every item in *combo*, in order."""
        return [combo.itemText(row) for row in range(combo.count())]

    def test_basic_setup(self):
        """The box lists all expected items and selects the initial value."""
        combo = self._make_combo(Mock())
        self.assertEqual(7, combo.count())
        self.assertEqual(self.ITEMS, self._texts(combo))
        self.assertEqual("Portuguese", combo.currentText())

    def test_set_current_language(self):
        """set_current_language selects the code, the default, or row 0."""
        combo = self._make_combo(Mock())
        self.assertEqual("Portuguese", combo.currentText())

        combo.set_current_language("sl")
        self.assertEqual("Slovenian", combo.currentText())

        combo.set_current_language("abc")  # should set to default
        self.assertEqual("English", combo.currentText())

        # when no default language in the dropdown set to first
        combo.removeItem(0)
        # NOTE(review): drops index 3 of the cached list, presumably the
        # default language's entry -- confirm against the list's ordering
        cached = combo._UDPipeComboBox__items
        combo._UDPipeComboBox__items = cached[:3] + cached[4:]
        combo.set_current_language("abc")
        self.assertEqual("English (lines)", combo.currentText())

    def test_change_item(self):
        """Activating an item calls the callback with that item's ISO code."""
        callback = Mock()
        combo = self._make_combo(callback)
        self.assertEqual(self.ITEMS, self._texts(combo))
        callback.assert_not_called()
        simulate.combobox_activate_item(combo, "Slovenian")
        callback.assert_called_once_with("sl")


if __name__ == "__main__":
unittest.main()

0 comments on commit 3a6f9b7

Please sign in to comment.