From e15df15fb116c4bc2d5694ea6ca25ed027dd94e3 Mon Sep 17 00:00:00 2001
From: AndreyKarnauhov <91670823+AndreyKarnauhov@users.noreply.github.com>
Date: Wed, 5 Oct 2022 01:31:21 +0300
Subject: [PATCH] generate_stop_words implemented, Polish key phrases
 implemented

Many things improved.
---
 lab_2_keywords_cooccurrence/main.py  | 11 +++++++++--
 lab_2_keywords_cooccurrence/start.py | 25 +++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/lab_2_keywords_cooccurrence/main.py b/lab_2_keywords_cooccurrence/main.py
index 0d9ee0c0..aaafba8d 100644
--- a/lab_2_keywords_cooccurrence/main.py
+++ b/lab_2_keywords_cooccurrence/main.py
@@ -215,7 +215,14 @@ def generate_stop_words(text: str, max_length: int) -> Optional[Sequence[str]]:
     :param max_length: maximum length (in characters) of an individual stop word
     :return: a list of stop words
     """
-    pass
+    if not type_check(text, str, True) or not type_check(max_length, int) or max_length <= 0:
+        return None
+    punctuation = r"[–—!¡\"“”#$%&'()⟨⟩«»*+,./:;‹›<=>?¿@\]\[\\_`{|}~…⋯-]+"
+    tokens = re.sub(''.join(
+        (punctuation, r"(?=$|\s)|(?<=\s)", punctuation, r"|^", punctuation)), '', text).lower().split()
+    frequencies = {token: tokens.count(token) for token in set(tokens)}
+    percent_80 = sorted(frequencies.values(), reverse=True)[int(len(frequencies) * 0.2)]
+    return [token for token in sorted(frequencies) if frequencies[token] >= percent_80 and len(token) <= max_length]
 
 
 def load_stop_words(path: Path) -> Optional[Mapping[str, Sequence[str]]]:
@@ -227,5 +234,5 @@ def load_stop_words(path: Path) -> Optional[Mapping[str, Sequence[str]]]:
     if not type_check(path, Path, True):
         return None
     with open(path, 'r', encoding='utf-8') as file:
-        stop_words = json.load(file)
+        stop_words = dict(json.load(file))
     return stop_words
diff --git a/lab_2_keywords_cooccurrence/start.py b/lab_2_keywords_cooccurrence/start.py
index 69ce0370..d9431fb1 100644
--- a/lab_2_keywords_cooccurrence/start.py
+++ b/lab_2_keywords_cooccurrence/start.py
@@ -76,6 +76,31 @@ def read_target_text(file_path: Path) -> str:
 
     STOP_WORDS = load_stop_words(ASSETS_PATH / 'stopwords.json')
 
+    # for Polish text
+    PHRASES = extract_phrases(read_target_text(ASSETS_PATH / 'polish.txt'))
+    if PHRASES:
+        CANDIDATE_KEYWORD_PHRASES = extract_candidate_keyword_phrases(PHRASES, STOP_WORDS['pl'])
+    if CANDIDATE_KEYWORD_PHRASES:
+        WORD_FREQUENCIES = calculate_frequencies_for_content_words(CANDIDATE_KEYWORD_PHRASES)
+    if CANDIDATE_KEYWORD_PHRASES and WORD_FREQUENCIES:
+        WORD_DEGREES = calculate_word_degrees(CANDIDATE_KEYWORD_PHRASES, list(WORD_FREQUENCIES.keys()))
+    if WORD_DEGREES and WORD_FREQUENCIES:
+        WORD_SCORES = calculate_word_scores(WORD_DEGREES, WORD_FREQUENCIES)
+    if CANDIDATE_KEYWORD_PHRASES and WORD_SCORES:
+        KEYWORD_PHRASES_WITH_SCORES = calculate_cumulative_score_for_candidates(CANDIDATE_KEYWORD_PHRASES, WORD_SCORES)
+    if KEYWORD_PHRASES_WITH_SCORES:
+        TOP_N = get_top_n(KEYWORD_PHRASES_WITH_SCORES, 10, 10)
+    if CANDIDATE_KEYWORD_PHRASES and PHRASES:
+        CANDIDATES_ADJOINED = \
+            extract_candidate_keyword_phrases_with_adjoining(CANDIDATE_KEYWORD_PHRASES, PHRASES)
+    if CANDIDATES_ADJOINED and WORD_SCORES:
+        CUMULATIVE_SCORE_WITH_STOP_WORDS = calculate_cumulative_score_for_candidates_with_stop_words(
+            CANDIDATES_ADJOINED, WORD_SCORES, STOP_WORDS['pl'])
+    else:
+        CUMULATIVE_SCORE_WITH_STOP_WORDS = {}
+    FINAL_CUMULATIVE_SCORE = KEYWORD_PHRASES_WITH_SCORES | CUMULATIVE_SCORE_WITH_STOP_WORDS
+    print(FINAL_CUMULATIVE_SCORE)
+
     RESULT = True
 
     assert RESULT, 'Keywords are not extracted'
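
Note on the generate_stop_words logic: a token becomes a stop word when its frequency reaches the value sitting at the top-20% boundary of the distinct-token frequency distribution. Below is a minimal standalone sketch of that thresholding, with a simplified punctuation pattern instead of the patch's full character class, and plain isinstance in place of the repo's type_check helper; it is an illustration, not the patched function itself.

import re

def sketch_generate_stop_words(text, max_length):
    # guards mirror the patch: non-string text or non-positive length -> None
    if not isinstance(text, str) or not isinstance(max_length, int) or max_length <= 0:
        return None
    # simplified cleanup: drop everything except word characters and whitespace
    tokens = re.sub(r"[^\w\s]", "", text).lower().split()
    if not tokens:
        return []
    frequencies = {token: tokens.count(token) for token in set(tokens)}
    # frequency value at the 80th-percentile cut of distinct tokens
    threshold = sorted(frequencies.values(), reverse=True)[int(len(frequencies) * 0.2)]
    return [t for t in sorted(frequencies)
            if frequencies[t] >= threshold and len(t) <= max_length]

# 'i' and 'w' dominate the toy text, so only they cross the threshold
print(sketch_generate_stop_words("i w i w i las pole rzeka dom kot", 5))  # ['i', 'w']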
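
load_stop_words assumes stopwords.json is a JSON object keyed by language code; 'pl' is the key start.py relies on. The miniature file below is hypothetical sample content (the real asset is larger), used only to show the round trip through the patched dict(json.load(...)) call.

import json
from pathlib import Path

# hypothetical miniature of the stopwords.json layout
sample = {"pl": ["i", "w", "na", "z", "do"], "en": ["the", "a", "of"]}

demo_path = Path("stopwords_demo.json")  # hypothetical demo path, not ASSETS_PATH
demo_path.write_text(json.dumps(sample, ensure_ascii=False), encoding="utf-8")

with open(demo_path, "r", encoding="utf-8") as file:
    stop_words = dict(json.load(file))  # dict(...) as in the patched loader
print(stop_words["pl"])  # ['i', 'w', 'na', 'z', 'do']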
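
The FINAL_CUMULATIVE_SCORE line in start.py uses the dict-union operator, which requires Python 3.9+; on duplicate keys the right-hand operand wins. A sketch with hypothetical phrase keys and scores (the lab's actual key and score types may differ):

# hypothetical phrase -> cumulative-score mappings
plain_scores = {"czerwony lis": 9.0, "stary dom": 4.0}
adjoined_scores = {"stary dom": 5.0, "lis i pies": 12.5}

final = plain_scores | adjoined_scores  # Python 3.9+; right side wins on clashes
print(final)  # {'czerwony lis': 9.0, 'stary dom': 5.0, 'lis i pies': 12.5}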