forked from voicegain/transcription-compare
-
Notifications
You must be signed in to change notification settings - Fork 0
/
transcribe-compare
68 lines (55 loc) · 2.67 KB
/
transcribe-compare
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env python
import click
from transcription_compare.levenshtein_distance_calculator import UKKLevenshteinDistanceCalculator
from transcription_compare.tokenizer import CharacterTokenizer, WordTokenizer
from transcription_compare.utils.digit_util import DigitUtil
def _resolve_text(inline_text, file_handle, missing_message):
    """Return the text to compare, preferring the inline string over the file.

    :param inline_text: text passed directly on the command line, or None
    :param file_handle: readable file object (as opened by click.File), or None
    :param missing_message: message for the error raised when both are None
    :raises click.UsageError: if neither an inline string nor a file was given
    """
    if inline_text is not None:
        return inline_text
    if file_handle is not None:
        return file_handle.read()
    # UsageError makes click print the usage line plus this message and exit
    # with a non-zero status, instead of dumping a Python traceback.
    raise click.UsageError(missing_message)


@click.command()
@click.option('--reference', '-r', type=str, help='source string')
@click.option('--output', '-o', type=str, help='target string')
@click.option('--reference_file', '-R', type=click.File('r'), help='source file path')
@click.option('--output_file', '-O', type=click.File('r'), help='target file path')
@click.option('--alignment', '-a', default=False, is_flag=True,
              help='Do you want to see the alignment result? True/False')
@click.option('--error_type', '-e', default='CER', type=click.Choice(['CER', 'WER']))
@click.option('--output_format', '-j', default='TABLE', type=click.Choice(['JSON', 'TABLE']))
@click.option('--to_lower', '-l', default=False, is_flag=True, help='Do you want to lower all the words? True/False')
@click.option('--remove_punctuation', '-p', default=False, is_flag=True,
              help='Do you want to remove all the punctuation? True/False')
def main(reference, output, reference_file, output_file, alignment, error_type, output_format,
         to_lower, remove_punctuation):
    """
    Transcription compare tool provided by VoiceGain
    """
    # NOTE: the docstring above is what click shows for --help, so it is kept
    # as-is; maintainer notes are written as comments instead.
    #
    # Each side of the comparison comes from either an inline string or a
    # file; when both are supplied the inline string wins.
    reference = _resolve_text(
        reference, reference_file,
        "One of --reference and --reference_file must be specified")
    output = _resolve_text(
        output, output_file,
        "One of --output and --output_file must be specified")

    if error_type == "CER":
        # Character-level comparison.
        calculator = UKKLevenshteinDistanceCalculator(
            tokenizer=CharacterTokenizer(),
            get_alignment_result=alignment
        )
    else:
        # Word-level comparison; only this calculator is given a DigitUtil,
        # so it is created here rather than unconditionally.
        calculator = UKKLevenshteinDistanceCalculator(
            tokenizer=WordTokenizer(),
            get_alignment_result=alignment,
            digit_util=DigitUtil()
        )

    # Compute the distance once and pick the rendering afterwards.
    result = calculator.get_distance(reference, output, to_lower=to_lower,
                                     remove_punctuation=remove_punctuation)
    if output_format == 'JSON':
        click.echo(result.to_json())
    elif output_format == 'TABLE':
        click.echo(result)
# Script entry point: invoke the click command only when this file is run
# directly, not when it is imported.
if __name__ == "__main__":
    main()