Skip to content
This repository has been archived by the owner on Sep 1, 2023. It is now read-only.

Commit

Permalink
generate_stop_words implemented, polish key phrases implemented
Browse files Browse the repository at this point in the history
many things improved
  • Loading branch information
AndreyKarnauhov committed Oct 4, 2022
1 parent 104abce commit e15df15
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 2 deletions.
11 changes: 9 additions & 2 deletions lab_2_keywords_cooccurrence/main.py
Original file line number Diff line number Diff line change
def generate_stop_words(text: str, max_length: int) -> Optional[Sequence[str]]:
    """
    Generates a list of stop words from the most frequent tokens of a text

    Punctuation attached to the edges of tokens is removed, the text is lowercased and
    split on whitespace. A token is reported as a stop word when its frequency is not
    lower than the frequency found one fifth of the way down the descending list of
    per-token frequencies, and the token is at most max_length characters long.

    :param text: the text to collect stop words from
    :param max_length: maximum length (in characters) of an individual stop word
    :return: a sorted list of stop words (empty when the text contains no tokens),
        or None when the arguments are rejected by type_check or max_length is not positive
    """
    # Local import so the function does not depend on the module's import block.
    from collections import Counter

    if not type_check(text, str, True) or not type_check(max_length, int) or max_length <= 0:
        return None
    punctuation = r"[–—!¡\"“”#$%&'()⟨⟩«»*+,./:;‹›<=>?¿@\]\[\\_`{|}~…⋯-]+"
    # Matches punctuation runs only at token edges: before whitespace/end of text,
    # after whitespace, or at the very start of the text.
    edge_punctuation = ''.join(
        (punctuation, r"(?=$|\s)|(?<=\s)", punctuation, r"|^", punctuation))
    tokens = re.sub(edge_punctuation, '', text).lower().split()
    if not tokens:
        # Nothing but punctuation/whitespace: there is no frequency list to index into.
        return []
    # Single pass instead of calling tokens.count() once per distinct token.
    frequencies = Counter(tokens)
    threshold = sorted(frequencies.values(), reverse=True)[int(len(frequencies) * 0.2)]
    return [token for token in sorted(frequencies)
            if frequencies[token] >= threshold and len(token) <= max_length]


def load_stop_words(path: Path) -> Optional[Mapping[str, Sequence[str]]]:
    """
    Reads a JSON file with stop words and returns its content as a dictionary

    :param path: path to the JSON file with stop words
    :return: a dictionary built from the parsed JSON content,
        or None when path is rejected by type_check
    """
    if type_check(path, Path, True):
        with open(path, 'r', encoding='utf-8') as stop_words_file:
            return dict(json.load(stop_words_file))
    return None
25 changes: 25 additions & 0 deletions lab_2_keywords_cooccurrence/start.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,31 @@ def read_target_text(file_path: Path) -> str:

STOP_WORDS = load_stop_words(ASSETS_PATH / 'stopwords.json')

# for polish text
# Every intermediate result is reset up front: each step below runs only when the
# previous one succeeded, so without the reset a failed step would leave a name
# unbound (NameError) or let a value bound earlier in the script leak into the result.
CANDIDATE_KEYWORD_PHRASES = None
WORD_FREQUENCIES = None
WORD_DEGREES = None
WORD_SCORES = None
KEYWORD_PHRASES_WITH_SCORES = None
TOP_N = None
CANDIDATES_ADJOINED = None
CUMULATIVE_SCORE_WITH_STOP_WORDS = {}

# load_stop_words may return None, and the loaded mapping may have no 'pl' entry.
POLISH_STOP_WORDS = STOP_WORDS.get('pl') if STOP_WORDS else None

PHRASES = extract_phrases(read_target_text(ASSETS_PATH / 'polish.txt'))
if PHRASES and POLISH_STOP_WORDS:
    CANDIDATE_KEYWORD_PHRASES = extract_candidate_keyword_phrases(PHRASES, POLISH_STOP_WORDS)
if CANDIDATE_KEYWORD_PHRASES:
    WORD_FREQUENCIES = calculate_frequencies_for_content_words(CANDIDATE_KEYWORD_PHRASES)
if CANDIDATE_KEYWORD_PHRASES and WORD_FREQUENCIES:
    WORD_DEGREES = calculate_word_degrees(CANDIDATE_KEYWORD_PHRASES, list(WORD_FREQUENCIES.keys()))
if WORD_DEGREES and WORD_FREQUENCIES:
    WORD_SCORES = calculate_word_scores(WORD_DEGREES, WORD_FREQUENCIES)
if CANDIDATE_KEYWORD_PHRASES and WORD_SCORES:
    KEYWORD_PHRASES_WITH_SCORES = calculate_cumulative_score_for_candidates(
        CANDIDATE_KEYWORD_PHRASES, WORD_SCORES)
if KEYWORD_PHRASES_WITH_SCORES:
    TOP_N = get_top_n(KEYWORD_PHRASES_WITH_SCORES, 10, 10)
if CANDIDATE_KEYWORD_PHRASES and PHRASES:
    CANDIDATES_ADJOINED = \
        extract_candidate_keyword_phrases_with_adjoining(CANDIDATE_KEYWORD_PHRASES, PHRASES)
if CANDIDATES_ADJOINED and WORD_SCORES:
    # "or {}" keeps the merge below valid if the call yields nothing usable.
    CUMULATIVE_SCORE_WITH_STOP_WORDS = calculate_cumulative_score_for_candidates_with_stop_words(
        CANDIDATES_ADJOINED, WORD_SCORES, POLISH_STOP_WORDS) or {}
# Merge plain and stop-word-adjoined scores; a missing left operand counts as empty.
FINAL_CUMULATIVE_SCORE = (KEYWORD_PHRASES_WITH_SCORES or {}) | CUMULATIVE_SCORE_WITH_STOP_WORDS
print(FINAL_CUMULATIVE_SCORE)

RESULT = True

assert RESULT, 'Keywords are not extracted'

0 comments on commit e15df15

Please sign in to comment.