generate_stop_words implemented, Polish key phrases implemented

many things improved
fipl-hse · Oct 4, 2022 · e15df15 · e15df15
1 parent 104abce
commit e15df15
Show file tree

Hide file tree

Showing 2 changed files with 34 additions and 2 deletions.
diff --git a/lab_2_keywords_cooccurrence/main.py b/lab_2_keywords_cooccurrence/main.py
@@ -215,7 +215,14 @@ def generate_stop_words(text: str, max_length: int) -> Optional[Sequence[str]]:
     :param max_length: maximum length (in characters) of an individual stop word
     :return: a list of stop words
     """
-    pass
+    if not type_check(text, str, True) or not type_check(max_length, int) or max_length <= 0:
+        return None
+    punctuation = r"[–—!¡\"“”#$%&'()⟨⟩«»*+,./:;‹›<=>?¿@\]\[\\_`{|}~…⋯-]+"
+    tokens = re.sub(''.join(
+        (punctuation, r"(?=$|\s)|(?<=\s)", punctuation, r"|^", punctuation)), '', text).lower().split()
+    frequencies = {token: tokens.count(token) for token in set(tokens)}
+    percent_80 = sorted(frequencies.values(), reverse=True)[int(len(frequencies) * 0.2)]
+    return [token for token in sorted(frequencies) if frequencies[token] >= percent_80 and len(token) <= max_length]
 
 
 def load_stop_words(path: Path) -> Optional[Mapping[str, Sequence[str]]]:
@@ -227,5 +234,5 @@ def load_stop_words(path: Path) -> Optional[Mapping[str, Sequence[str]]]:
     if not type_check(path, Path, True):
         return None
     with open(path, 'r', encoding='utf-8') as file:
-        stop_words = json.load(file)
+        stop_words = dict(json.load(file))
     return stop_words
diff --git a/lab_2_keywords_cooccurrence/start.py b/lab_2_keywords_cooccurrence/start.py
@@ -76,6 +76,31 @@ def read_target_text(file_path: Path) -> str:
 
     STOP_WORDS = load_stop_words(ASSETS_PATH / 'stopwords.json')
 
+    # for Polish text
+    PHRASES = extract_phrases(read_target_text(ASSETS_PATH / 'polish.txt'))
+    if PHRASES:
+        CANDIDATE_KEYWORD_PHRASES = extract_candidate_keyword_phrases(PHRASES, STOP_WORDS['pl'])
+    if CANDIDATE_KEYWORD_PHRASES:
+        WORD_FREQUENCIES = calculate_frequencies_for_content_words(CANDIDATE_KEYWORD_PHRASES)
+    if CANDIDATE_KEYWORD_PHRASES and WORD_FREQUENCIES:
+        WORD_DEGREES = calculate_word_degrees(CANDIDATE_KEYWORD_PHRASES, list(WORD_FREQUENCIES.keys()))
+    if WORD_DEGREES and WORD_FREQUENCIES:
+        WORD_SCORES = calculate_word_scores(WORD_DEGREES, WORD_FREQUENCIES)
+    if CANDIDATE_KEYWORD_PHRASES and WORD_SCORES:
+        KEYWORD_PHRASES_WITH_SCORES = calculate_cumulative_score_for_candidates(CANDIDATE_KEYWORD_PHRASES, WORD_SCORES)
+    if KEYWORD_PHRASES_WITH_SCORES:
+        TOP_N = get_top_n(KEYWORD_PHRASES_WITH_SCORES, 10, 10)
+    if CANDIDATE_KEYWORD_PHRASES and PHRASES:
+        CANDIDATES_ADJOINED = \
+            extract_candidate_keyword_phrases_with_adjoining(CANDIDATE_KEYWORD_PHRASES, PHRASES)
+    if CANDIDATES_ADJOINED and WORD_SCORES:
+        CUMULATIVE_SCORE_WITH_STOP_WORDS = calculate_cumulative_score_for_candidates_with_stop_words(
+            CANDIDATES_ADJOINED, WORD_SCORES, STOP_WORDS['pl'])
+    else:
+        CUMULATIVE_SCORE_WITH_STOP_WORDS = {}
+    FINAL_CUMULATIVE_SCORE = KEYWORD_PHRASES_WITH_SCORES | CUMULATIVE_SCORE_WITH_STOP_WORDS
+    print(FINAL_CUMULATIVE_SCORE)
+
     RESULT = True
 
     assert RESULT, 'Keywords are not extracted'