From 354f1fa78b220920bcd356c5118ff0e6734c3327 Mon Sep 17 00:00:00 2001 From: Bitan Biswas Date: Thu, 9 Mar 2023 23:24:38 +0530 Subject: [PATCH 1/7] changes made to grammar check function --- .../high_level_features/grammar_quality_check.py | 9 ++++----- requirements.txt | 3 ++- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/nlp_profiler/high_level_features/grammar_quality_check.py b/nlp_profiler/high_level_features/grammar_quality_check.py index a0d85a5..03e4f94 100644 --- a/nlp_profiler/high_level_features/grammar_quality_check.py +++ b/nlp_profiler/high_level_features/grammar_quality_check.py @@ -1,6 +1,5 @@ -import language_tool_python - -language_tool = language_tool_python.LanguageTool('en-GB') +from gingerit.gingerit import GingerIt +parser=GingerIt() import pandas as pd import math @@ -29,8 +28,8 @@ def grammar_check_score(text: str) -> int: if (not isinstance(text, str)) or (len(text.strip()) == 0): return NaN - matches = language_tool.check(text) - return len(matches) + matches = parser.parse(text) + return len(matches["corrections"]) def grammar_quality(score: int) -> str: diff --git a/requirements.txt b/requirements.txt index e27ff26..c159d76 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,5 @@ spacy >= 2.3.0,<3.0.0 pandas < 1.3.0 # pinned to this version as higher versions conflicts with swifter version 1.0.5 and higher swifter >= 1.0.3 en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.0/en_core_web_sm-2.3.0.tar.gz -textstat >= 0.7.0 \ No newline at end of file +textstat >= 0.7.0 +gingerit == 0.9.0 From 2efde59bde8c083cc4ac450dc4c50842192aa19f Mon Sep 17 00:00:00 2001 From: Bitan Biswas Date: Fri, 10 Mar 2023 01:09:06 +0530 Subject: [PATCH 2/7] sourcery ai changes incorporated and grammar function updated as mentioned in PR #69 --- nlp_profiler/high_level_features/grammar_quality_check.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/nlp_profiler/high_level_features/grammar_quality_check.py b/nlp_profiler/high_level_features/grammar_quality_check.py index 03e4f94..4176b06 100644 --- a/nlp_profiler/high_level_features/grammar_quality_check.py +++ b/nlp_profiler/high_level_features/grammar_quality_check.py @@ -25,7 +25,7 @@ def apply_grammar_check(heading: str, ### Grammar check: this is a very slow process ### take a lot of time per text it analysis def grammar_check_score(text: str) -> int: - if (not isinstance(text, str)) or (len(text.strip()) == 0): + if not isinstance(text, str) or not text.strip(): return NaN matches = parser.parse(text) @@ -39,6 +39,6 @@ def grammar_quality(score: int) -> str: if score == 1: return "1 issue" elif score > 1: - return f"{int(score)} issues" + return f"{score} issues" return "No issues" From 3e7c92f7fa3fceb38d12513d3286d0ffd5b51a71 Mon Sep 17 00:00:00 2001 From: Bitan Biswas Date: Sat, 11 Mar 2023 13:52:39 +0530 Subject: [PATCH 3/7] comments added for readability --- nlp_profiler/high_level_features/grammar_quality_check.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nlp_profiler/high_level_features/grammar_quality_check.py b/nlp_profiler/high_level_features/grammar_quality_check.py index 4176b06..43bcc81 100644 --- a/nlp_profiler/high_level_features/grammar_quality_check.py +++ b/nlp_profiler/high_level_features/grammar_quality_check.py @@ -1,5 +1,6 @@ +#changing the grammar checker from language tool to gingerit for better results from gingerit.gingerit import GingerIt -parser=GingerIt() +parser = GingerIt() import pandas as pd import math @@ -27,8 +28,9 @@ def apply_grammar_check(heading: str, def grammar_check_score(text: str) -> int: if not isinstance(text, str) or not text.strip(): return NaN - + #calling the parser function to parse through the text for errors matches = parser.parse(text) + #the corrections is an array of dictionaries containing the position and the word that has been changed return 
len(matches["corrections"]) From 5b6d95f05e7b3c1f0d6237314d1d160ae07e5f0b Mon Sep 17 00:00:00 2001 From: Bitan Biswas Date: Sun, 12 Mar 2023 16:55:51 +0530 Subject: [PATCH 4/7] code cleaned --- nlp_profiler/high_level_features/grammar_quality_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nlp_profiler/high_level_features/grammar_quality_check.py b/nlp_profiler/high_level_features/grammar_quality_check.py index 43bcc81..abd0f6e 100644 --- a/nlp_profiler/high_level_features/grammar_quality_check.py +++ b/nlp_profiler/high_level_features/grammar_quality_check.py @@ -26,7 +26,7 @@ def apply_grammar_check(heading: str, ### Grammar check: this is a very slow process ### take a lot of time per text it analysis def grammar_check_score(text: str) -> int: - if not isinstance(text, str) or not text.strip(): + if not (isinstance(text, str) or text.strip()): return NaN #calling the parser function to parse through the text for errors matches = parser.parse(text) From b5a5ddaad01f07230cf232712671d43dd9db9862 Mon Sep 17 00:00:00 2001 From: Bitan Biswas Date: Sun, 12 Mar 2023 23:40:43 +0530 Subject: [PATCH 5/7] code cleaned with black --- .../grammar_quality_check.py | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/nlp_profiler/high_level_features/grammar_quality_check.py b/nlp_profiler/high_level_features/grammar_quality_check.py index abd0f6e..ca037c8 100644 --- a/nlp_profiler/high_level_features/grammar_quality_check.py +++ b/nlp_profiler/high_level_features/grammar_quality_check.py @@ -1,26 +1,28 @@ -#changing the grammar checker from language tool to gingerit for better results +# changing the grammar checker from language tool to gingerit for better results from gingerit.gingerit import GingerIt + parser = GingerIt() import pandas as pd import math -from nlp_profiler.constants import NOT_APPLICABLE, NaN, DEFAULT_PARALLEL_METHOD, \ - GRAMMAR_CHECK_SCORE_COL, GRAMMAR_CHECK_COL +from nlp_profiler.constants 
import ( + NOT_APPLICABLE, + NaN, + DEFAULT_PARALLEL_METHOD, + GRAMMAR_CHECK_SCORE_COL, + GRAMMAR_CHECK_COL, +) from nlp_profiler.generate_features import generate_features -def apply_grammar_check(heading: str, - new_dataframe: pd.DataFrame, - text_column: dict, - parallelisation_method: str = DEFAULT_PARALLEL_METHOD): +def apply_grammar_check( + heading: str, new_dataframe: pd.DataFrame, text_column: dict, parallelisation_method: str = DEFAULT_PARALLEL_METHOD +): grammar_checks_steps = [ (GRAMMAR_CHECK_SCORE_COL, text_column, grammar_check_score), (GRAMMAR_CHECK_COL, GRAMMAR_CHECK_SCORE_COL, grammar_quality), ] - generate_features( - heading, grammar_checks_steps, - new_dataframe, parallelisation_method - ) + generate_features(heading, grammar_checks_steps, new_dataframe, parallelisation_method) ### Grammar check: this is a very slow process @@ -28,9 +30,9 @@ def apply_grammar_check(heading: str, def grammar_check_score(text: str) -> int: if not (isinstance(text, str) or text.strip()): return NaN - #calling the parser function to parse through the text for errors + # calling the parser function to parse through the text for errors matches = parser.parse(text) - #the corrections is an array of dictionaries containing the position and the word that has been changed + # the corrections is an array of dictionaries containing the position and the word that has been changed return len(matches["corrections"]) From 2b950495fe3cd594a051f426a04bb52ff4caac66 Mon Sep 17 00:00:00 2001 From: Bitan Biswas <76226078+bitanb1999@users.noreply.github.com> Date: Mon, 13 Mar 2023 00:26:17 +0530 Subject: [PATCH 6/7] Update CHANGELOG.md --- CHANGELOG.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b49cfa..b3c5bdd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -194,6 +194,15 @@ Enabled nightly run of build and test via Github actions [dde3172](https://github.com/neomatrix369/nlp_profiler/commit/dde31723b7cb1c1105b9df828bf7429094113de4) 
[@neomatrix369](https://github.com/neomatrix369) _Sun Nov 14 09:12:33 2021 +0000_ +--- +### GitHub branch `grammar_check` Grammar_quality_check: language tool replaced with Gingerit + +Implemented functionality via PR [#69](https://github.com/neomatrix369/nlp_profiler/pull/69) - details described in the body of the PR. + +Replaced language tool with Gingerit for faster calculations + +[b5a5dda](https://github.com/neomatrix369/nlp_profiler/pull/69/commits/b5a5ddaad01f07230cf232712671d43dd9db9862) [@bitanb1999](https://github.com/bitanb1999) _Sun Nov 14 09:12:33 2021 +0000_ + --- Return to [README.md](README.md) From c891ba31a64cb6d876fcb32f5d52ec06bcf09ef0 Mon Sep 17 00:00:00 2001 From: Bitan Biswas <76226078+bitanb1999@users.noreply.github.com> Date: Mon, 13 Mar 2023 00:32:00 +0530 Subject: [PATCH 7/7] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b3c5bdd..9085f99 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -201,7 +201,7 @@ Implemented functionality via PR [#69](https://github.com/neomatrix369/nlp_profi Replaced language tool with Gingerit for faster calculations -[b5a5dda](https://github.com/neomatrix369/nlp_profiler/pull/69/commits/b5a5ddaad01f07230cf232712671d43dd9db9862) [@bitanb1999](https://github.com/bitanb1999) _Sun Nov 14 09:12:33 2021 +0000_ +[b5a5dda](https://github.com/neomatrix369/nlp_profiler/pull/69/commits/b5a5ddaad01f07230cf232712671d43dd9db9862) [@bitanb1999](https://github.com/bitanb1999) _Sun Mar 12 23:40:43 2023 +0530_ ---