-
Notifications
You must be signed in to change notification settings - Fork 5
/
utils.py
executable file
·95 lines (84 loc) · 3.59 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import tensorflow as tf
# For reading audio files
from tensorflow.contrib.framework.python.ops import audio_ops
# Compute the power spectrogram of a wav audio file (nothing is plotted here)
def graph_spectrogram(
    wav_file,
    window_length=1650,  # Samples in each STFT window
    step_size=65,  # Hop between consecutive windows
    fft_length=200,
    normalize=False,
    train=True
):
    """
    Build the power spectrogram of a wav file as a TensorFlow tensor.

    Parameters
    ----------
    wav_file: str, required
        Path to the wav file.
    window_length: int, optional
        Passed to the STFT as ``frame_length``. Defaults to 1650.
        The HIGHER the window length, the LOWER the no. of features
        produced in axis 1.
    step_size: int, optional
        Passed to the STFT as ``frame_step``. Defaults to 65.
        The HIGHER the step size, the LOWER the no. of features
        produced in axis 1.
    fft_length: int, optional
        FFT size applied to each window. Defaults to 200.
        The HIGHER the FFT length, the HIGHER the no. of features
        produced in axis 2.
    normalize: boolean, optional
        If True, the decoded samples are used as returned by the decoder
        (which already normalizes them). If False, they are multiplied by
        2**15 to undo that normalization. Defaults to False.
    train: boolean, optional
        If True, the file is decoded with tf.contrib.ffmpeg.decode_audio
        (wav, 44100 samples per second, 2 channels). If False, the file is
        read through get_wav_info instead. Defaults to True.

    Returns
    -------
    The real-valued power spectrogram (STFT multiplied by its complex
    conjugate), with a leading axis of size 1 added before the STFT.
    """
    if not train:
        # App usage: the caller is expected to guarantee a proper audio
        # format, so the file is decoded directly. The sample rate that
        # get_wav_info also returns is not needed here.
        _, data = get_wav_info(wav_file, normalize)
    else:
        # Training usage. The Urban 8K dataset is untidy with respect to
        # sampling frequency and audio format (some files are 32 bit PCM,
        # others use a different sampling frequency), so the file is read
        # with tf.read_file and decoded through tf.contrib.ffmpeg with an
        # explicit 44.1 KHz rate, which takes care of these problems.
        raw_bytes = tf.read_file(wav_file)
        data = tf.contrib.ffmpeg.decode_audio(
            raw_bytes,
            file_format='wav',
            samples_per_second=44100,
            channel_count=2
        )
        # decode_audio output is already normalized; scale it back up
        # only when the caller did not ask for normalized samples.
        if not normalize:
            data = data * 2**15

    # Collapse the two channels into one by taking their element-wise max
    # (assumes axis 1 is the channel axis). This works better than just
    # dropping a channel.
    mono = tf.reduce_max(data, axis=1)
    # Prepend a size-1 axis so the output shape complies with the model.
    batched = mono[tf.newaxis, ...]

    # Move to the frequency domain with a Short-time Fourier Transform;
    # the result is a complex-valued tensor.
    stft_out = tf.contrib.signal.stft(
        batched,
        frame_length=window_length,
        frame_step=step_size,
        fft_length=fft_length
    )

    # Of the two usual choices (magnitude vs. power spectrogram) the power
    # spectrogram is used: |z|^2, computed as z * conj(z).
    conjugate = tf.conj(stft_out)
    return tf.real(stft_out * conjugate)
# Load a wav file
def get_wav_info(wav_file, normalize=True):
    """
    Decode a wav file into its sample rate and a two-channel audio tensor.

    Parameters
    ----------
    wav_file: str, required
        Path to the wav file.
    normalize: boolean, optional
        If True, return the audio exactly as decode_wav produces it
        (decode_wav normalizes the samples). If False, multiply the audio
        by 2**15 to undo that normalization. Defaults to True.

    Returns
    -------
    A ``(rate, data)`` pair: the decoder's sample rate and the audio tensor.
    """
    contents = tf.read_file(wav_file)  # The raw audio data
    # Two channels are always requested from the decoder.
    decoded = audio_ops.decode_wav(contents, desired_channels=2)

    samples = decoded.audio
    if not normalize:
        # decode_wav does a normalization step; multiplying by 2^15 undoes it.
        samples = samples * 2**15

    # The sampling frequency is returned as well; useful for debugging.
    return decoded.sample_rate, samples