diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b49cfa..9085f99 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -194,6 +194,15 @@ Enabled nightly run of build and test via Github actions [dde3172](https://github.com/neomatrix369/nlp_profiler/commit/dde31723b7cb1c1105b9df828bf7429094113de4) [@neomatrix369](https://github.com/neomatrix369) _Sun Nov 14 09:12:33 2021 +0000_ +--- +### GitHub branch `grammar_check` Grammar_quality_check: language tool replaced with Gingerit + +Implemented functionality via PR [#69](https://github.com/neomatrix369/nlp_profiler/pull/69) - details described in the body of the PR. + +Replaced LanguageTool with Gingerit for faster grammar checks. + +[b5a5dda](https://github.com/neomatrix369/nlp_profiler/pull/69/commits/b5a5ddaad01f07230cf232712671d43dd9db9862) [@bitanb1999](https://github.com/bitanb1999) _Sun Mar 13 00:31:31 2023 +0000_ + --- Return to [README.md](README.md) diff --git a/nlp_profiler/high_level_features/grammar_quality_check.py b/nlp_profiler/high_level_features/grammar_quality_check.py index a0d85a5..ca037c8 100644 --- a/nlp_profiler/high_level_features/grammar_quality_check.py +++ b/nlp_profiler/high_level_features/grammar_quality_check.py @@ -1,36 +1,39 @@ -import language_tool_python +# changing the grammar checker from language tool to gingerit for better results +from gingerit.gingerit import GingerIt -language_tool = language_tool_python.LanguageTool('en-GB') +parser = GingerIt() import pandas as pd import math -from nlp_profiler.constants import NOT_APPLICABLE, NaN, DEFAULT_PARALLEL_METHOD, \ - GRAMMAR_CHECK_SCORE_COL, GRAMMAR_CHECK_COL +from nlp_profiler.constants import ( + NOT_APPLICABLE, + NaN, + DEFAULT_PARALLEL_METHOD, + GRAMMAR_CHECK_SCORE_COL, + GRAMMAR_CHECK_COL, +) from nlp_profiler.generate_features import generate_features -def apply_grammar_check(heading: str, - new_dataframe: pd.DataFrame, - text_column: dict, - parallelisation_method: str = DEFAULT_PARALLEL_METHOD): +def apply_grammar_check( + heading: 
str, new_dataframe: pd.DataFrame, text_column: dict, parallelisation_method: str = DEFAULT_PARALLEL_METHOD +): grammar_checks_steps = [ (GRAMMAR_CHECK_SCORE_COL, text_column, grammar_check_score), (GRAMMAR_CHECK_COL, GRAMMAR_CHECK_SCORE_COL, grammar_quality), ] - generate_features( - heading, grammar_checks_steps, - new_dataframe, parallelisation_method - ) + generate_features(heading, grammar_checks_steps, new_dataframe, parallelisation_method) ### Grammar check: this is a very slow process ### take a lot of time per text it analysis def grammar_check_score(text: str) -> int: - if (not isinstance(text, str)) or (len(text.strip()) == 0): + if not (isinstance(text, str) and text.strip()): return NaN - - matches = language_tool.check(text) - return len(matches) + # run the text through the GingerIt parser to collect its suggested corrections + matches = parser.parse(text) + # "corrections" is a list of dicts, one per change (position and the word that was changed); its length is the score + return len(matches["corrections"]) def grammar_quality(score: int) -> str: @@ -40,6 +43,6 @@ def grammar_quality(score: int) -> str: if score == 1: return "1 issue" elif score > 1: - return f"{int(score)} issues" + return f"{int(score)} issues" return "No issues" diff --git a/requirements.txt b/requirements.txt index e27ff26..c159d76 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,5 @@ spacy >= 2.3.0,<3.0.0 pandas < 1.3.0 # pinned to this version as higher versions conflicts with swifter version 1.0.5 and higher swifter >= 1.0.3 en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.0/en_core_web_sm-2.3.0.tar.gz -textstat >= 0.7.0 \ No newline at end of file +textstat >= 0.7.0 +gingerit == 0.9.0