Grammar check: added test to check for performance of simple grammar check, after tweaking the implementation for ~15x speedup
1 parent 81d055f, commit 112cf70
Showing 2 changed files with 75 additions and 2 deletions.
@@ -0,0 +1,73 @@
import os
import sys
from datetime import datetime
from time import time
from contextlib import redirect_stdout
import git

# make modules under the high_level performance-tests folder importable
sys.path.insert(0, '../../performance-tests/high_level')
from nlp_profiler.grammar_quality_check import grammar_check_score
from line_profiler import LineProfiler

CURRENT_SOURCE_FILEPATH = os.path.abspath(__file__)
EXPECTED_DATA_PATH = f'{os.path.dirname(CURRENT_SOURCE_FILEPATH)}/data'


def test_given_a_text_column_when_profiler_is_applied_with_high_level_analysis_then_it_finishes_quick():
    # given
    TARGET_PROFILE_REPORT_FOLDER = '.cprofile/'
    if not os.path.exists(TARGET_PROFILE_REPORT_FOLDER):
        os.makedirs(TARGET_PROFILE_REPORT_FOLDER)
    profile = LineProfiler()
    source_data = generate_data()
    expected_execution_time = 4  # benchmarked: (first-time) 46.694923639297485, (cached) 5.918392 seconds

    # when: using default method (joblib Parallel) for parallelisation
    start_execution_time = time()
    profile_wrapper = profile(grammar_check_score)
    for each in source_data:
        profile_wrapper(each)
    end_execution_time = time()
    actual_execution_time = end_execution_time - start_execution_time

    short_sha = shorten_sha(git_current_head_sha())
    output_filename = f'{TARGET_PROFILE_REPORT_FOLDER}/grammar_check_score-' \
                      f'{datetime.now().strftime("%d-%m-%Y-%H-%M-%S")}-{short_sha}'
    with open(f'{output_filename}.txt', 'w') as file:
        with redirect_stdout(file):
            profile.print_stats()

    profile.dump_stats(f'{output_filename}.lprof')

    # then
    assert actual_execution_time <= expected_execution_time, \
        f"Expected duration: {expected_execution_time}, Actual duration: {actual_execution_time}. " \
        f"Slow down by: {abs(actual_execution_time - expected_execution_time)} seconds. " \
        f"We have crossed the benchmark limit after a speed up via commit 51a8952."


def shorten_sha(long_sha):
    return long_sha[:7]


def git_current_head_sha():
    repo = git.Repo(search_parent_directories=True)
    return repo.head.commit.hexsha


def generate_data() -> list:
    text_with_emojis = "I love ⚽ very much 😁."
    text_with_a_number = '2833047 people live in this area. It is not a good area.'
    text_with_two_numbers = '2833047 and 1111 people live in this area.'
    text_with_punctuations = "This sentence doesn't seem to too many commas, periods or semi-colons (;)."
    text_with_a_date = "Todays date is 04/28/2020 for format mm/dd/yyyy, not 28/04/2020."
    text_with_dates = "Todays date is 28/04/2020 and tomorrow's date is 29/04/2020."
    text_with_duplicates = 'Everyone here is so hardworking. Hardworking people. ' \
                           'I think hardworking people are a good trait in our company.'
    data = [text_with_emojis, text_with_a_number, text_with_two_numbers,
            text_with_punctuations, text_with_a_date, text_with_dates, text_with_duplicates]

    # duplicate the sample texts once; increase the range to scale the workload
    new_data = []
    for index in range(1):
        new_data.extend(data)
    return new_data