Skip to content

Commit

Permalink
Grammar check: added test to check for performance of simple grammar …
Browse files Browse the repository at this point in the history
…check, after tweaking the implementation for ~15x speedup
  • Loading branch information
neomatrix369 committed Oct 3, 2020
1 parent 81d055f commit 112cf70
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 2 deletions.
4 changes: 2 additions & 2 deletions nlp_profiler/grammar_quality_check.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import language_tool_python
language_tool = language_tool_python.LanguageTool('en-GB')
import pandas as pd

from nlp_profiler.constants import NOT_APPLICABLE, NaN, DEFAULT_PARALLEL_METHOD, \
Expand Down Expand Up @@ -26,8 +27,7 @@ def grammar_check_score(text: str) -> int:
if (not isinstance(text, str)) or (len(text.strip()) == 0):
return NaN

tool = language_tool_python.LanguageTool('en-GB')
matches = tool.check(text)
matches = language_tool.check(text)
return len(matches)


Expand Down
73 changes: 73 additions & 0 deletions slow-tests/performance_tests/test_perf_grammar_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import os
import sys
from datetime import datetime
from time import time
from contextlib import redirect_stdout
import git

sys.path.insert(0, '../../performance-tests/high_level')
from nlp_profiler.grammar_quality_check import grammar_check_score
from line_profiler import LineProfiler

CURRENT_SOURCE_FILEPATH = os.path.abspath(__file__)
EXPECTED_DATA_PATH = f'{os.path.dirname(CURRENT_SOURCE_FILEPATH)}/data'


def test_given_a_text_column_when_profiler_is_applied_with_high_level_analysis_then_it_finishes_quick():
    """Check that grammar-checking the sample sentences stays within the time budget.

    Runs ``grammar_check_score`` once per item returned by ``generate_data()``
    through a ``LineProfiler`` wrapper, writes the profiler statistics to a
    text report and an ``.lprof`` dump under ``.cprofile/`` (file names carry
    a timestamp and the short SHA of the current git HEAD), and finally
    asserts that the elapsed wall-clock time does not exceed the budget.
    """
    # given
    TARGET_PROFILE_REPORT_FOLDER = '.cprofile/'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(TARGET_PROFILE_REPORT_FOLDER, exist_ok=True)
    profile = LineProfiler()
    source_data = generate_data()
    expected_execution_time = 4  # benchmarked: (first-time) 46.694923639297485, (cached) 5.918392 seconds

    # when: every sentence is scored sequentially through the profiled wrapper
    # (the measured duration therefore includes the line profiler's overhead)
    start_execution_time = time()
    profile_wrapper = profile(grammar_check_score)
    for each in source_data:
        profile_wrapper(each)
    end_execution_time = time()
    actual_execution_time = end_execution_time - start_execution_time

    short_sha = shorten_sha(git_current_head_sha())
    timestamp = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
    # os.path.join keeps a single separator even though the folder name
    # already ends with a slash.
    output_filename = os.path.join(
        TARGET_PROFILE_REPORT_FOLDER,
        f'grammar_check_score-{timestamp}-{short_sha}',
    )
    # print_stats() writes to stdout, so stdout is redirected into the report.
    with open(f'{output_filename}.txt', 'w', encoding='utf-8') as report_file, \
            redirect_stdout(report_file):
        profile.print_stats()

    profile.dump_stats(f'{output_filename}.lprof')

    # then
    assert actual_execution_time <= expected_execution_time, \
        f"Expected duration: {expected_execution_time}, Actual duration: {actual_execution_time}. " \
        f"Slow down by: {abs(actual_execution_time - expected_execution_time)} seconds. " \
        f"We have crossed the benchmark limit after a speed up via commit 51a8952."


def shorten_sha(long_sha):
    """Return the leading seven characters of ``long_sha``.

    Inputs shorter than seven characters are returned unchanged.
    """
    abbreviated_length = 7
    return long_sha[0:abbreviated_length]


def git_current_head_sha():
    """Return the full hex SHA of the commit that HEAD points to.

    The repository is located by searching from the current working
    directory upwards through its parent directories.
    """
    return git.Repo(search_parent_directories=True).head.commit.hexsha


def generate_data() -> list:
    """Return the sample sentences used as input for the grammar check run.

    The seven base sentences cover emojis, numbers, punctuation, dates and
    repeated words. The base set is repeated ``repetitions`` times
    (currently once) and returned as a new list.
    """
    repetitions = 1
    samples = [
        # emojis
        "I love ⚽ very much 😁.",
        # a single number
        '2833047 people live in this area. It is not a good area.',
        # two numbers
        '2833047 and 1111 people live in this area.',
        # punctuation
        "This sentence doesn't seem to too many commas, periods or semi-colons (;).",
        # one date in two formats
        "Todays date is 04/28/2020 for format mm/dd/yyyy, not 28/04/2020.",
        # two dates
        "Todays date is 28/04/2020 and tomorrow's date is 29/04/2020.",
        # duplicated words
        'Everyone here is so hardworking. Hardworking people. '
        'I think hardworking people are a good trait in our company.',
    ]
    return samples * repetitions

0 comments on commit 112cf70

Please sign in to comment.