forked from ekQ/raplysaattori
-
Notifications
You must be signed in to change notification settings - Fork 0
/
phonetics.py
80 lines (67 loc) · 2.33 KB
/
phonetics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# -*- coding: utf-8 -*-
import codecs
import os
import re
import subprocess
'''
This file contains all phonetics related functions. The phonetic
transcription is obtained using eSpeak speech synthesizer
(http://espeak.sourceforge.net/).
For English the list of available phonetic vowels can be found here:
http://espeak.sourceforge.net/phonemes.html
'''
def is_vow(c, language='fi'):
    '''
    Is the given (lowercase) character a vowel or not.

    c         The character to classify.
    language  'fi' for Finnish, or any code starting with 'en'
              (e.g. 'en-us') for English.

    Returns True only if c is exactly one character and belongs to the
    vowel set of the language; the empty string and longer strings are
    never vowels.
    Raises ValueError if the language is not supported.
    '''
    if language == 'fi':  # Finnish
        vowels = u'aeiouyäöå'
    elif language[:2] == 'en':  # English
        # In order to increase recall for the rhyme detection, we
        # ignore the schwa vowel '@' as it can be rhymed with several
        # different vowels. However, in BattleBot we do not ignore it
        # in order to get a higher precision.
        vowels = u'3L5aAeEiI0VuUoO'
    else:
        raise ValueError("Unknown language: %s" % language)
    # A plain substring test would also accept '' and multi-character
    # substrings such as 'ae', so require a single character explicitly.
    return len(c) == 1 and c in vowels
def map_vow(c, language):
    '''
    Replace an English vowel by a similar sounding vowel.

    For languages whose code does not start with 'en', and for
    characters that have no mapping, c is returned unchanged.
    '''
    if language[:2] != 'en':
        return c
    # The pairs below are somewhat arbitrary; a native English speaker
    # with a background in phonetics might be able to improve them.
    similar = {'0': 'o', 'O': 'o', 'I': 'i', 'E': 'e'}
    return similar.get(c, c)
def is_space(c):
    '''
    Tell whether the given character is a plain space or a newline.
    Other whitespace characters are not recognised here (they are
    cleaned in the preprocessing phase).
    '''
    return c in (' ', '\n')
def get_phonetic_transcription(text, language='en-us', output_fname=None):
    '''
    Return the phonetic transcription of the text, produced by eSpeak.

    text          Text to transcribe. It is written as UTF-8 to
                  'temp_lyrics.txt' in the current directory before
                  eSpeak is run.
    language      Voice name handed to eSpeak with the -v option.
    output_fname  File that holds the raw transcription. If it is given
                  and already exists, eSpeak is not run and the existing
                  file is reused. If it is None, the text is always
                  transcribed again into 'temp_transcription.txt'.

    Returns the contents of the transcription file with the
    "_:'Ekskl@m,eIS@n_:" sequence, apostrophes and commas removed.

    Raises OSError if eSpeak cannot be started, and
    subprocess.CalledProcessError if it exits with a non-zero status.
    '''
    if output_fname is None:
        fname2 = u'temp_transcription.txt'
    else:
        fname2 = output_fname

    if output_fname is None or not os.path.exists(fname2):
        print("Transcribing: %s" % fname2)
        fname = u'temp_lyrics.txt'
        with codecs.open(fname, 'w', 'utf8') as f:
            f.write(text)
        # Pass an argument list and redirect stdout ourselves instead of
        # going through a shell, so that file names or voice names with
        # spaces or shell metacharacters cannot alter the command.
        cmd = ['espeak', '-xq', '-v%s' % language, '-f', fname]
        try:
            with open(fname2, 'wb') as out:
                subprocess.check_call(cmd, stdout=out)
        except (OSError, subprocess.CalledProcessError):
            # Do not leave an empty or partial file behind: a later call
            # with the same output_fname would treat it as a valid cached
            # transcription.
            if os.path.exists(fname2):
                os.remove(fname2)
            raise

    with codecs.open(fname2, 'r', 'utf8') as f2:
        new_text = f2.read()

    # Remove some unwanted stuff from the transcription. The order
    # matters: the long sequence itself contains an apostrophe and a
    # comma, so it has to be removed first.
    for unwanted in (u"_:'Ekskl@m,eIS@n_:", u"'", u","):
        new_text = new_text.replace(unwanted, u"")
    return new_text