Merge pull request #961 from PrimozGodec/language-keyword
[ENH] Keywords - language from corpus
VesnaT authored Apr 2, 2024
2 parents cdb0901 + 153e06f commit 6aad2ad
Showing 5 changed files with 195 additions and 64 deletions.
61 changes: 15 additions & 46 deletions orangecontrib/text/keywords/__init__.py
@@ -17,51 +17,18 @@
from orangecontrib.text.keywords.rake import Rake
from orangecontrib.text.language import ISO2LANG
from orangecontrib.text.preprocess import StopwordsFilter

# all available languages for RAKE
from orangecontrib.text.vectorization import BowVectorizer


# todo: refactor when refactoring language for keywords module
# this is a temporary solution since supported_languages now returns lang ISO codes
RAKE_LANGUAGES = [ISO2LANG[la] for la in StopwordsFilter.supported_languages()]
# all available languages for RAKE
RAKE_LANGUAGES = StopwordsFilter.supported_languages()
# all available languages for YAKE!
YAKE_LANGUAGE_MAPPING = {
"Arabic": "ar",
"Armenian": "hy",
"Breton": "br",
"Bulgarian": "bg",
"Chinese": "zh",
"Croatian": "hr",
"Czech": "cz",
"Danish": "da",
"Dutch": "nl",
"English": "en",
"Estonian": "et",
"Finnish": "fi",
"French": "fr",
"German": "de",
"Greek": "el",
"Hindi": "hi",
"Hungarian": "hu",
"Indonesian": "id",
"Italian": "it",
"Japanese": "ja",
"Latvian": "lv",
"Lithuanian": "lt",
"Norwegian": "no",
"Persian": "fa",
"Polish": "pl",
"Portuguese": "pt",
"Romanian": "ro",
"Russian": "ru",
"Slovak": "sk",
"Slovenian": "sl",
"Spanish": "es",
"Swedish": "sv",
"Turkish": "tr",
"Ukrainian": "uk"
}
# fmt: off
YAKE_LANGUAGES = [
"ar", "hy", "br", "bg", "zh", "hr", "cs", "da", "nl", "en", "et", "fi",
"fr", "de", "el", "hi", "hu", "id", "it", "ja", "lv", "lt", "no", "fa",
"pl", "pt", "ro", "ru", "sk", "sl", "es", "sv", "tr", "uk"
]
# fmt: on


def tfidf_keywords(
@@ -110,7 +77,7 @@ def tfidf_keywords(

def yake_keywords(
texts: List[str],
language: str = "English",
language: str = "en",
max_len: int = 1,
progress_callback: Callable = None
) -> List[List[Tuple[str, float]]]:
@@ -135,7 +102,6 @@ def yake_keywords(
if progress_callback is None:
progress_callback = dummy_callback

language = YAKE_LANGUAGE_MAPPING[language]
extractor = yake.KeywordExtractor(lan=language, n=max_len)

keywords = []
@@ -148,7 +114,7 @@

def rake_keywords(
texts: List[str],
language: str = "English",
language: str = "en",
max_len: int = 1,
progress_callback: Callable = None
) -> List[List[Tuple[str, float]]]:
@@ -174,9 +140,12 @@
if progress_callback is None:
progress_callback = dummy_callback

if language.lower() not in [l.lower() for l in RAKE_LANGUAGES]:
if language not in RAKE_LANGUAGES:
raise ValueError(f"Language must be one of: {RAKE_LANGUAGES}")

language = ISO2LANG[language]
# some languages (e.g. Slovenian) have a different name in NLTK than their ISO name
language = StopwordsFilter.LANG2NLTK.get(language, language)
stop_words_ = [x.strip() for x in stopwords.words(language.lower())]
rake_object = Rake(stop_words_, max_words_length=max_len)

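With this change both extractors accept ISO language codes directly instead of English display names. A minimal usage sketch, assuming the package and its yake/nltk dependencies are installed (the sample text is made up):

from orangecontrib.text.keywords import rake_keywords, yake_keywords

# hypothetical sample documents
texts = ["Graph-based keyword extraction for short documents."]

# language is now an ISO code ("en"), not a display name ("English")
yake_result = yake_keywords(texts, language="en", max_len=2)
rake_result = rake_keywords(texts, language="en", max_len=2)

# each result is a list of (keyword, score) pairs, one list per input document
print(yake_result[0][:3])
print(rake_result[0][:3])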
3 changes: 2 additions & 1 deletion orangecontrib/text/language.py
@@ -124,7 +124,8 @@ def __init__(
"""
if languages is None:
# if languages not provided take all available languages
languages = sorted(filter(None, ISO2LANG), key=ISO2LANG.get)
languages = filter(None, ISO2LANG)
languages = sorted(languages, key=ISO2LANG.get)
if include_none:
languages = [None] + languages
super().__init__(iterable=languages)
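The constructor change above only splits the sorting into two steps; the behaviour is unchanged and is exercised by the tests below. A small sketch of how the model is typically populated, based on those tests:

from orangecontrib.text.language import LanguageModel

# all known languages, ordered by their display names from ISO2LANG
model = LanguageModel()

# restrict to a subset of ISO codes and prepend the "(no language)" entry
model = LanguageModel(include_none=True, languages=["en", "ar", "it"])
print([model.data(model.index(i)) for i in range(model.rowCount())])
# -> ['(no language)', 'Arabic', 'English', 'Italian']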
29 changes: 28 additions & 1 deletion orangecontrib/text/tests/test_language.py
@@ -5,7 +5,34 @@
from Orange.data import StringVariable, Domain

from orangecontrib.text import Corpus
from orangecontrib.text.language import detect_language, ISO2LANG
from orangecontrib.text.language import detect_language, ISO2LANG, LanguageModel


class TestLanguageModel(TestCase):
def test_model_without_languages(self):
# no None, all languages
lm = LanguageModel()
self.assertEqual(len(ISO2LANG) - 1, lm.rowCount())
all_langs = [lm.data(lm.index(i)) for i in range(lm.rowCount())]
expected = sorted(list(ISO2LANG.values())[:-1])
self.assertEqual(expected, all_langs)

lm = LanguageModel(include_none=True)
self.assertEqual(len(ISO2LANG), lm.rowCount())
all_langs = [lm.data(lm.index(i)) for i in range(lm.rowCount())]
expected = sorted(list(ISO2LANG.values())[:-1])
self.assertEqual(["(no language)"] + expected, all_langs)

def test_model_with_languages(self):
lm = LanguageModel(include_none=True, languages=["en", "ar", "it"])
self.assertEqual(4, lm.rowCount())
all_langs = [lm.data(lm.index(i)) for i in range(lm.rowCount())]
self.assertEqual(["(no language)", "Arabic", "English", "Italian"], all_langs)

lm = LanguageModel(languages=["en", "ar", "it"])
self.assertEqual(3, lm.rowCount())
all_langs = [lm.data(lm.index(i)) for i in range(lm.rowCount())]
self.assertEqual(["Arabic", "English", "Italian"], all_langs)


class TestLanguage(TestCase):
64 changes: 56 additions & 8 deletions orangecontrib/text/widgets/owkeywords.py
@@ -22,12 +22,12 @@

from orangecontrib.text import Corpus
from orangecontrib.text.keywords import ScoringMethods, AggregationMethods, \
YAKE_LANGUAGE_MAPPING, RAKE_LANGUAGES
YAKE_LANGUAGES, RAKE_LANGUAGES
from orangecontrib.text.language import LanguageModel
from orangecontrib.text.preprocess import BaseNormalizer
from orangecontrib.text.widgets.utils.words import create_words_table, \
WORDS_COLUMN_NAME

YAKE_LANGUAGES = list(YAKE_LANGUAGE_MAPPING.keys())
CONNECTION_WARNING = (
f"{ScoringMethods.MBERT} could not extract keywords from some "
"documents due to connection error. Please rerun keyword extraction."
@@ -202,15 +202,17 @@ class OWKeywords(OWWidget, ConcurrentWidgetMixin):
keywords = "extract keywords, characteristic, term"

buttons_area_orientation = Qt.Vertical
settings_version = 2

# Qt.DescendingOrder is IntEnum in PyQt5 and Enum in PyQt6 (both have value attr)
# in setting we want to save integer and not Enum object (in case of PyQt6)
DEFAULT_SORTING = (1, enum2int(Qt.DescendingOrder))
DEFAULT_LANGUAGE = "en"

settingsHandler = DomainContextHandler()
selected_scoring_methods: Set[str] = Setting({ScoringMethods.TF_IDF})
yake_lang_index: int = Setting(YAKE_LANGUAGES.index("English"))
rake_lang_index: int = Setting(RAKE_LANGUAGES.index("English"))
yake_language: Optional[str] = Setting(None, schema_only=True)
rake_language: Optional[str] = Setting(None, schema_only=True)
agg_method: int = Setting(AggregationMethods.MEAN)
sel_method: int = ContextSetting(SelectionMethods.N_BEST)
n_selected: int = ContextSetting(3)
@@ -236,18 +238,33 @@ def __init__(self):
self.words: Optional[List] = None
self.__cached_keywords = {}
self.model = KeywordsTableModel(parent=self)

# languages from the workflow should be retained when data arrives on the input
self.__pending_yake_language = self.yake_language
self.__pending_rake_language = self.rake_language
# language settings are None by default so that the default language is not
# saved as pending; they are set to the default (here) after pending is stored
self.yake_language = self.yake_language or self.DEFAULT_LANGUAGE
self.rake_language = self.rake_language or self.DEFAULT_LANGUAGE

self._setup_gui()

def _setup_gui(self):
grid = QGridLayout()
box = gui.widgetBox(self.controlArea, "Scoring Methods", grid)

yake_cb = gui.comboBox(
self.controlArea, self, "yake_lang_index", items=YAKE_LANGUAGES,
self.controlArea,
self,
"yake_language",
model=LanguageModel(include_none=False, languages=YAKE_LANGUAGES),
callback=self.__on_yake_lang_changed
)
rake_cb = gui.comboBox(
self.controlArea, self, "rake_lang_index", items=RAKE_LANGUAGES,
self.controlArea,
self,
"rake_language",
model=LanguageModel(include_none=False, languages=RAKE_LANGUAGES),
callback=self.__on_rake_lang_changed
)

@@ -371,6 +388,17 @@ def set_corpus(self, corpus: Optional[Corpus]):
self.corpus = corpus
self.openContext(self.corpus)
self.__sel_method_buttons.button(self.sel_method).setChecked(True)
if corpus is not None and corpus.language is not None:
if self.__pending_rake_language is not None:
self.yake_language = self.__pending_yake_language
self.rake_language = self.__pending_rake_language
self.__pending_yake_language = None
self.__pending_rake_language = None
else:
if corpus.language in YAKE_LANGUAGES:
self.yake_language = corpus.language
if corpus.language in RAKE_LANGUAGES:
self.rake_language = corpus.language

def _clear(self):
self.clear_messages()
@@ -397,11 +425,11 @@ def update_scores(self):
self.Warning.extraction_warnings.clear()
kwargs = {
ScoringMethods.YAKE: {
"language": YAKE_LANGUAGES[self.yake_lang_index],
"language": self.yake_language,
"max_len": self.corpus.ngram_range[1] if self.corpus else 1
},
ScoringMethods.RAKE: {
"language": RAKE_LANGUAGES[self.rake_lang_index],
"language": self.rake_language,
"max_len": self.corpus.ngram_range[1] if self.corpus else 1,
},
}
@@ -508,6 +536,26 @@ def send_report(self):
self.report_paragraph("Words", ", ".join(self.words))
self.report_table("Keywords", self.view, num_format="{:.3f}")

@classmethod
def migrate_settings(cls, settings: Dict[str, Any], version: Optional[int]):
if version is None or version < 2:
# before version 2, settings were indexes; now they are strings with the
# language name and the selected aggregator name
if "yake_lang_index" in settings:
settings["yake_language"] = YAKE_LANGUAGES[settings["yake_lang_index"]]
if "rake_lang_index" in settings:
# historic copy of RAKE_LANGUAGES, since the current list (now a set) depends
# on the languages in NLTK; if they changed order or added a language, the
# settings would not be migrated correctly
# fmt: off
previous_order = [
"ar", "az", "eu", "bn", "ca", "zh", "da", "nl", "en", "fi",
"fr", "de", "el", "he", "hi_eng", "hu", "id", "it", "kk",
"ne", "no", "pt", "ro", "ru", "sl", "es", "sv", "tg", "tr"
]
# fmt: on
settings["rake_language"] = previous_order[settings["rake_lang_index"]]


if __name__ == "__main__":
# pylint: disable=ungrouped-imports
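For illustration, a sketch of what the migration does to a pre-version-2 workflow; the index values are hypothetical, and in practice the settings handler calls migrate_settings itself when a saved workflow is loaded:

from orangecontrib.text.widgets.owkeywords import OWKeywords

# hypothetical settings saved by a pre-version-2 workflow
old_settings = {"yake_lang_index": 9, "rake_lang_index": 8}
OWKeywords.migrate_settings(old_settings, version=1)

# YAKE_LANGUAGES[9] and previous_order[8] are both "en"
print(old_settings["yake_language"], old_settings["rake_language"])  # en en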