-
Notifications
You must be signed in to change notification settings - Fork 12
/
librispeech_gen.py
133 lines (107 loc) · 5.75 KB
/
librispeech_gen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import argparse
import h5py
import numpy as np
import os
import subprocess
import wavio
from glob import glob
from tqdm import tqdm
from random import shuffle
#from utils import UTTERANCE, CHAPTER, SPEAKER, SOUND
from constants import UTTERANCE, CHAPTER, SPEAKER, SOUND
# Default base location that the archive name is appended to when building
# the download link; ``--url`` overrides it.
# NOTE(review): the archives themselves appear to be served from
# ".../resources/12/" rather than this landing page -- confirm the default.
LIBRI_SPEECH_URL = "http://www.openslr.org/12/"
# Folder name (relative to ``--path``) where the extracted data is expected.
EXTRACTED_FOLDER = "LibriSpeech"


def _str_to_bool(value):
    """Convert a command-line token such as ``true``/``0``/``no`` to a bool.

    argparse hands option values over as strings, and every non-empty string
    is truthy, so an explicit conversion is required for ``--rm_flac False``
    to actually disable the option.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ("1", "true", "t", "yes", "y", "on"):
        return True
    if token in ("0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got {!r}".format(value))


def parse_args(argv=None):
    """Build the command-line parser and parse the arguments.

    Args:
        argv: optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, exactly as before.

    Returns:
        argparse.Namespace with the attributes ``dataset``, ``url``, ``path``,
        ``force_download``, ``force_extraction``, ``force_convert``,
        ``force_h5py``, ``sr`` (int), ``wav_dir``, ``rm_flac`` (bool) and
        ``take_random`` (int, or ``None`` when no limit was requested).
    """
    parser = argparse.ArgumentParser(description='LibriSpeech')

    # General settings
    parser.add_argument('--dataset',
                        required=True,
                        help="The name of a particular dataset from {}".format(LIBRI_SPEECH_URL))
    parser.add_argument('--url',
                        default=LIBRI_SPEECH_URL,
                        help="Where datasets are stored. Default: {}".format(LIBRI_SPEECH_URL))
    parser.add_argument('--path', required=True, help="Where to store results")

    parser.add_argument('--force_download', action='store_true', help="Force downloading from website.")
    parser.add_argument('--force_extraction', action='store_true', help="Forcing extraction from tar.gz file.")
    parser.add_argument('--force_convert', action='store_true', help="Forcing convertation to wav")
    parser.add_argument('--force_h5py', action='store_true', help="Forcing storing to h5py_torch")

    # type=int: without it a user-supplied rate arrives as a string and the
    # integer arithmetic done on it later fails.
    parser.add_argument('--sr', type=int, default=16000, help="Sample rate for wav. Default is 16kHz")
    parser.add_argument('--wav_dir', default=EXTRACTED_FOLDER+"Wav", help="Where to store wav files")
    # Explicit boolean parsing so that "--rm_flac false" really means False.
    parser.add_argument('--rm_flac', type=_str_to_bool, default=True,
                        help="Remove or not folder with flac files")
    # The option carries a count (the "N" of the help text); None means
    # "no limit", which is what the `is not None` check downstream expects.
    parser.add_argument('--take_random', type=int, default=None, metavar='N',
                        help="Take N random wav files for storing in h5py_torch")

    return parser.parse_args(argv)
def _dataset_stem(archive_name):
    """Return the archive name without its ``.tar.gz`` suffix.

    Names that do not end in ``.tar.gz`` are returned unchanged instead of
    blindly losing their last seven characters.
    """
    suffix = ".tar.gz"
    if archive_name.endswith(suffix):
        return archive_name[:-len(suffix)]
    return archive_name


def _remove_path(path):
    """Delete a file or a directory tree; do nothing if it does not exist."""
    import shutil  # local import: only needed by this helper

    if os.path.isdir(path) and not os.path.islink(path):
        shutil.rmtree(path)
    elif os.path.lexists(path):
        os.remove(path)


def _ffmpeg_command(flac_file, wav_file, sample_rate):
    """Build the ffmpeg argument list converting one flac file to wav.

    ffmpeg applies an option to the next file named on the command line, so
    ``-ar`` has to come *before* the output file; placed after it, the option
    is reported as trailing and ignored. ``-y`` overwrites an existing output
    instead of stopping at an interactive prompt.
    """
    return ["ffmpeg", "-y", "-i", flac_file, "-ar", str(sample_rate), wav_file]


def _limit_files(wav_files, take_random):
    """Return the first ``take_random`` entries of ``wav_files``.

    ``None`` means "no limit". A bool is treated the same way: a bare on/off
    flag carries no count, and slicing with ``False`` would silently produce
    an empty dataset.
    """
    if take_random is None or isinstance(take_random, bool):
        return wav_files
    return wav_files[:take_random]


def _parse_ids(wav_file):
    """Split ``<speaker>-<chapter>-<utterance>.wav`` into three ints.

    Raises:
        ValueError: if the name does not consist of exactly three
            dash-separated integers.
    """
    stem = os.path.splitext(os.path.basename(wav_file))[0]
    speaker_id, chapter_id, utterance_id = map(int, stem.split("-"))
    return speaker_id, chapter_id, utterance_id


def _convert_to_wav(extraction_path, wav_path, sample_rate):
    """Convert every flac file below ``extraction_path`` into ``wav_path``."""
    # exist_ok: the folder may already be there when a re-run is forced.
    os.makedirs(wav_path, exist_ok=True)
    flacs = glob('{}/**/*.flac'.format(extraction_path), recursive=True)
    for flac in flacs:
        wav_name = os.path.splitext(os.path.basename(flac))[0] + '.wav'
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through untouched.
        subprocess.run(_ffmpeg_command(flac, os.path.join(wav_path, wav_name), sample_rate),
                       check=True)


def _pack_dataset(wav_path, dataset_path, take_random):
    """Store a shuffled selection of wav files from ``wav_path`` in HDF5.

    The file is written under a temporary name and renamed only after it has
    been closed successfully, so an interrupted run cannot leave a truncated
    file that a later run would accept as "already prepared".
    """
    wav_files = [name for name in os.listdir(wav_path) if name.endswith(".wav")]
    shuffle(wav_files)
    wav_files = _limit_files(wav_files, take_random)
    data_len = len(wav_files)

    partial_path = dataset_path + ".part"
    # The context manager closes (and thereby flushes) the file on any exit.
    with h5py.File(partial_path, 'w') as f:
        dt = h5py.special_dtype(vlen=np.float32)
        sound = f.create_dataset(SOUND, (data_len, ), dtype=dt)
        # np.int was removed from NumPy; use an explicit integer dtype.
        speaker = f.create_dataset(SPEAKER, (data_len,), dtype=np.int64)
        chapter = f.create_dataset(CHAPTER, (data_len,), dtype=np.int64)
        utterance = f.create_dataset(UTTERANCE, (data_len,), dtype=np.int64)

        for i, wav_file in tqdm(enumerate(wav_files), total=data_len):
            speaker_id, chapter_id, utterance_id = _parse_ids(wav_file)
            # Keep only the first channel of the decoded samples.
            sound[i] = wavio.read(os.path.join(wav_path, wav_file)).data.T[0]
            speaker[i], chapter[i], utterance[i] = speaker_id, chapter_id, utterance_id
    os.replace(partial_path, dataset_path)


def main():
    """Download, extract, convert and pack the dataset chosen on the CLI.

    Each stage is skipped when its output already exists, unless the
    matching ``--force_*`` flag is given.
    """
    opt = parse_args()

    # The value may arrive as a string from the command line.
    sample_rate = int(opt.sr)
    stem = _dataset_stem(opt.dataset)

    # A URL is not a filesystem path, so join it with "/" explicitly.
    data_url = opt.url.rstrip("/") + "/" + opt.dataset
    tar_path = os.path.join(opt.path, opt.dataset)
    extraction_path = os.path.join(opt.path, EXTRACTED_FOLDER, stem)
    wav_path = os.path.join(opt.path, stem + "_wav" + str(sample_rate // 1000))
    dataset_path = os.path.join(opt.path, stem) + '.hdf5'

    if opt.force_h5py:
        print("Force h5py_torch creation. {} file will me replaced.".format(dataset_path))
        _remove_path(dataset_path)

    if os.path.exists(dataset_path) and not (opt.force_download or opt.force_extraction or opt.force_convert):
        print('Dataset is already downloaded and prepared')
        return

    # Remove stale outputs. Each flag is honoured on its own, so that e.g.
    # --force_convert clears the wav folder without --force_download.
    if opt.force_download and os.path.exists(tar_path):
        print("Force download. {} file will me replaced.".format(tar_path))
        os.remove(tar_path)
    if opt.force_extraction and os.path.exists(extraction_path):
        print("Force extraction. {} file will me replaced.".format(extraction_path))
        _remove_path(extraction_path)
    if opt.force_convert and os.path.exists(wav_path):
        print("Force convert. {} file will me replaced.".format(wav_path))
        _remove_path(wav_path)

    # Download tar
    if not os.path.exists(tar_path) or opt.force_download:
        print("Download tar.gz")
        subprocess.run(["wget", data_url, "-P", opt.path], check=True)
    else:
        print("Dataset has already downloaded")

    # Extract tar
    if (not os.path.exists(extraction_path) and not os.path.exists(wav_path)) or opt.force_extraction:
        print("Extraction path:", extraction_path)
        subprocess.run(["tar", "xvzf", tar_path, "-C", opt.path], check=True)
    else:
        print("Dataset has already extracted")

    # Convert to wav
    print("wav_path", wav_path)
    if not os.path.exists(wav_path) or opt.force_convert or opt.force_extraction:
        _convert_to_wav(extraction_path, wav_path, sample_rate)
    else:
        print("Dataset has already converted to wav with sr {}".format(sample_rate))

    if opt.rm_flac and os.path.exists(extraction_path):
        print("Flac folder {} will be removed".format(extraction_path))
        _remove_path(extraction_path)
    print("Convertation to wav is finished")

    # Pack into HDF5
    if not os.path.exists(dataset_path):
        print("Packing into {} file".format(dataset_path))
        _pack_dataset(wav_path, dataset_path, opt.take_random)
    else:
        print("{} file has been already prepared.".format(dataset_path))


if __name__ == "__main__":
    main()