Add final training configs as well as release 16 kHz model (#19)
* adding final configs for all models

* changes for 16khz

* add latest version for 16khz model

* update package version

---------

Co-authored-by: Ishaan Kumar <[email protected]>
ritheshkumar95 and eeishaan authored Jul 5, 2023
1 parent 06e8049 commit 408235a
Showing 10 changed files with 384 additions and 8 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -32,12 +32,13 @@ pip install git+https://github.com/descriptinc/descript-audio-codec

### Weights
Weights are released as part of this repo under MIT license.
-We release weights for models that can natively support 24kHz and 44.1kHz sampling rates.
+We release weights for models that can natively support 16kHz, 24kHz, and 44.1kHz sampling rates.
Weights are automatically downloaded when you first run the `encode` or `decode` command. You can cache them ahead of time using one of the following commands:
```bash
python3 -m dac download # downloads the default 44kHz variant
python3 -m dac download --model_type 44khz # downloads the 44kHz variant
python3 -m dac download --model_type 24khz # downloads the 24kHz variant
python3 -m dac download --model_type 16khz # downloads the 16kHz variant
```
We provide a Dockerfile that installs all required dependencies for encoding and decoding. The build process caches the default model weights inside the image. This allows the image to be used without an internet connection. [Please refer to instructions below.](#docker-image)
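
For reference, the cached weights can also be fetched and loaded from Python. A minimal sketch, assuming the `dac.utils.download` and `dac.DAC.load` helpers documented elsewhere in the README (adjust if your installed version exposes different entry points):

```python
import dac

# Download (or reuse the cached copy of) the 16 kHz weights, mirroring
# `python3 -m dac download --model_type 16khz`, then load them.
# Assumes the dac.utils.download / dac.DAC.load entry points from the README.
model_path = dac.utils.download(model_type="16khz")
model = dac.DAC.load(model_path)
model.eval()
```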

123 changes: 123 additions & 0 deletions conf/final/16khz.yml
@@ -0,0 +1,123 @@
# Model setup
DAC.sample_rate: 16000
DAC.encoder_dim: 64
DAC.encoder_rates: [2, 4, 5, 8]
DAC.decoder_dim: 1536
DAC.decoder_rates: [8, 5, 4, 2]

# Quantization
DAC.n_codebooks: 12
DAC.codebook_size: 1024
DAC.codebook_dim: 8
DAC.quantizer_dropout: 0.5

# Discriminator
Discriminator.sample_rate: 16000
Discriminator.rates: []
Discriminator.periods: [2, 3, 5, 7, 11]
Discriminator.fft_sizes: [2048, 1024, 512]
Discriminator.bands:
- [0.0, 0.1]
- [0.1, 0.25]
- [0.25, 0.5]
- [0.5, 0.75]
- [0.75, 1.0]

# Optimization
AdamW.betas: [0.8, 0.99]
AdamW.lr: 0.0001
ExponentialLR.gamma: 0.999996

amp: false
val_batch_size: 100
device: cuda
num_iters: 400000
save_iters: [10000, 50000, 100000, 200000]
valid_freq: 1000
sample_freq: 10000
num_workers: 32
val_idx: [0, 1, 2, 3, 4, 5, 6, 7]
seed: 0
lambdas:
  mel/loss: 15.0
  adv/feat_loss: 2.0
  adv/gen_loss: 1.0
  vq/commitment_loss: 0.25
  vq/codebook_loss: 1.0

VolumeNorm.db: [const, -16]

# Transforms
build_transform.preprocess:
- Identity
build_transform.augment_prob: 0.0
build_transform.augment:
- Identity
build_transform.postprocess:
- VolumeNorm
- RescaleAudio
- ShiftPhase

# Loss setup
MultiScaleSTFTLoss.window_lengths: [2048, 512]
MelSpectrogramLoss.n_mels: [5, 10, 20, 40, 80, 160, 320]
MelSpectrogramLoss.window_lengths: [32, 64, 128, 256, 512, 1024, 2048]
MelSpectrogramLoss.mel_fmin: [0, 0, 0, 0, 0, 0, 0]
MelSpectrogramLoss.mel_fmax: [null, null, null, null, null, null, null]
MelSpectrogramLoss.pow: 1.0
MelSpectrogramLoss.clamp_eps: 1.0e-5
MelSpectrogramLoss.mag_weight: 0.0

# Data
batch_size: 72
train/AudioDataset.duration: 0.38
train/AudioDataset.n_examples: 10000000

val/AudioDataset.duration: 5.0
val/build_transform.augment_prob: 1.0
val/AudioDataset.n_examples: 250

test/AudioDataset.duration: 10.0
test/build_transform.augment_prob: 1.0
test/AudioDataset.n_examples: 1000

AudioLoader.shuffle: true
AudioDataset.without_replacement: true

train/build_dataset.folders:
  speech_fb:
    - /data/daps/train
  speech_hq:
    - /data/vctk
    - /data/vocalset
    - /data/read_speech
    - /data/french_speech
  speech_uq:
    - /data/emotional_speech/
    - /data/common_voice/
    - /data/german_speech/
    - /data/russian_speech/
    - /data/spanish_speech/
  music_hq:
    - /data/musdb/train
  music_uq:
    - /data/jamendo
  general:
    - /data/audioset/data/unbalanced_train_segments/
    - /data/audioset/data/balanced_train_segments/

val/build_dataset.folders:
  speech_hq:
    - /data/daps/val
  music_hq:
    - /data/musdb/test
  general:
    - /data/audioset/data/eval_segments/

test/build_dataset.folders:
  speech_hq:
    - /data/daps/test
  music_hq:
    - /data/musdb/test
  general:
    - /data/audioset/data/eval_segments/
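
For orientation, the encoder strides and quantizer settings in `conf/final/16khz.yml` imply a 50 Hz latent frame rate and a maximum bitrate of roughly 6 kbps when all 12 codebooks are used (`quantizer_dropout: 0.5` also trains the model to operate with fewer codebooks). A minimal sketch of that arithmetic, not code from the repo:

```python
import math

# Values copied from conf/final/16khz.yml above.
sample_rate = 16_000
encoder_rates = [2, 4, 5, 8]              # DAC.encoder_rates
n_codebooks = 12                          # DAC.n_codebooks
codebook_size = 1024                      # DAC.codebook_size

hop_length = math.prod(encoder_rates)     # 320 samples per latent frame
frame_rate = sample_rate / hop_length     # 50 frames per second
bits_per_frame = n_codebooks * math.log2(codebook_size)  # 12 * 10 = 120 bits
bitrate_kbps = frame_rate * bits_per_frame / 1000         # 6.0 kbps at full depth

print(f"{hop_length=} {frame_rate=:.0f} {bitrate_kbps=:.1f}")
```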
123 changes: 123 additions & 0 deletions conf/final/24khz.yml
@@ -0,0 +1,123 @@
# Model setup
DAC.sample_rate: 24000
DAC.encoder_dim: 64
DAC.encoder_rates: [2, 4, 5, 8]
DAC.decoder_dim: 1536
DAC.decoder_rates: [8, 5, 4, 2]

# Quantization
DAC.n_codebooks: 32
DAC.codebook_size: 1024
DAC.codebook_dim: 8
DAC.quantizer_dropout: 0.5

# Discriminator
Discriminator.sample_rate: 24000
Discriminator.rates: []
Discriminator.periods: [2, 3, 5, 7, 11]
Discriminator.fft_sizes: [2048, 1024, 512]
Discriminator.bands:
- [0.0, 0.1]
- [0.1, 0.25]
- [0.25, 0.5]
- [0.5, 0.75]
- [0.75, 1.0]

# Optimization
AdamW.betas: [0.8, 0.99]
AdamW.lr: 0.0001
ExponentialLR.gamma: 0.999996

amp: false
val_batch_size: 100
device: cuda
num_iters: 400000
save_iters: [10000, 50000, 100000, 200000]
valid_freq: 1000
sample_freq: 10000
num_workers: 32
val_idx: [0, 1, 2, 3, 4, 5, 6, 7]
seed: 0
lambdas:
  mel/loss: 15.0
  adv/feat_loss: 2.0
  adv/gen_loss: 1.0
  vq/commitment_loss: 0.25
  vq/codebook_loss: 1.0

VolumeNorm.db: [const, -16]

# Transforms
build_transform.preprocess:
- Identity
build_transform.augment_prob: 0.0
build_transform.augment:
- Identity
build_transform.postprocess:
- VolumeNorm
- RescaleAudio
- ShiftPhase

# Loss setup
MultiScaleSTFTLoss.window_lengths: [2048, 512]
MelSpectrogramLoss.n_mels: [5, 10, 20, 40, 80, 160, 320]
MelSpectrogramLoss.window_lengths: [32, 64, 128, 256, 512, 1024, 2048]
MelSpectrogramLoss.mel_fmin: [0, 0, 0, 0, 0, 0, 0]
MelSpectrogramLoss.mel_fmax: [null, null, null, null, null, null, null]
MelSpectrogramLoss.pow: 1.0
MelSpectrogramLoss.clamp_eps: 1.0e-5
MelSpectrogramLoss.mag_weight: 0.0

# Data
batch_size: 72
train/AudioDataset.duration: 0.38
train/AudioDataset.n_examples: 10000000

val/AudioDataset.duration: 5.0
val/build_transform.augment_prob: 1.0
val/AudioDataset.n_examples: 250

test/AudioDataset.duration: 10.0
test/build_transform.augment_prob: 1.0
test/AudioDataset.n_examples: 1000

AudioLoader.shuffle: true
AudioDataset.without_replacement: true

train/build_dataset.folders:
  speech_fb:
    - /data/daps/train
  speech_hq:
    - /data/vctk
    - /data/vocalset
    - /data/read_speech
    - /data/french_speech
  speech_uq:
    - /data/emotional_speech/
    - /data/common_voice/
    - /data/german_speech/
    - /data/russian_speech/
    - /data/spanish_speech/
  music_hq:
    - /data/musdb/train
  music_uq:
    - /data/jamendo
  general:
    - /data/audioset/data/unbalanced_train_segments/
    - /data/audioset/data/balanced_train_segments/

val/build_dataset.folders:
  speech_hq:
    - /data/daps/val
  music_hq:
    - /data/musdb/test
  general:
    - /data/audioset/data/eval_segments/

test/build_dataset.folders:
  speech_hq:
    - /data/daps/test
  music_hq:
    - /data/musdb/test
  general:
    - /data/audioset/data/eval_segments/
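
The `lambdas` block in each of these configs weights the individual generator loss terms into a single training objective. A minimal sketch of how such weights are typically combined (the actual training loop lives in the repo's training script; the per-term loss values below are placeholders):

```python
# Weights taken from the `lambdas` block above; the loss values are
# made-up placeholders standing in for quantities computed during training.
lambdas = {
    "mel/loss": 15.0,
    "adv/feat_loss": 2.0,
    "adv/gen_loss": 1.0,
    "vq/commitment_loss": 0.25,
    "vq/codebook_loss": 1.0,
}

losses = {
    "mel/loss": 0.9,
    "adv/feat_loss": 3.1,
    "adv/gen_loss": 1.4,
    "vq/commitment_loss": 0.2,
    "vq/codebook_loss": 0.4,
}

# Weighted sum of all generator loss terms.
total_loss = sum(lambdas[k] * losses[k] for k in lambdas)
print(f"{total_loss=:.2f}")
```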
123 changes: 123 additions & 0 deletions conf/final/44khz.yml
@@ -0,0 +1,123 @@
# Model setup
DAC.sample_rate: 44100
DAC.encoder_dim: 64
DAC.encoder_rates: [2, 4, 8, 8]
DAC.decoder_dim: 1536
DAC.decoder_rates: [8, 8, 4, 2]

# Quantization
DAC.n_codebooks: 9
DAC.codebook_size: 1024
DAC.codebook_dim: 8
DAC.quantizer_dropout: 0.5

# Discriminator
Discriminator.sample_rate: 44100
Discriminator.rates: []
Discriminator.periods: [2, 3, 5, 7, 11]
Discriminator.fft_sizes: [2048, 1024, 512]
Discriminator.bands:
- [0.0, 0.1]
- [0.1, 0.25]
- [0.25, 0.5]
- [0.5, 0.75]
- [0.75, 1.0]

# Optimization
AdamW.betas: [0.8, 0.99]
AdamW.lr: 0.0001
ExponentialLR.gamma: 0.999996

amp: false
val_batch_size: 100
device: cuda
num_iters: 400000
save_iters: [10000, 50000, 100000, 200000]
valid_freq: 1000
sample_freq: 10000
num_workers: 32
val_idx: [0, 1, 2, 3, 4, 5, 6, 7]
seed: 0
lambdas:
  mel/loss: 15.0
  adv/feat_loss: 2.0
  adv/gen_loss: 1.0
  vq/commitment_loss: 0.25
  vq/codebook_loss: 1.0

VolumeNorm.db: [const, -16]

# Transforms
build_transform.preprocess:
- Identity
build_transform.augment_prob: 0.0
build_transform.augment:
- Identity
build_transform.postprocess:
- VolumeNorm
- RescaleAudio
- ShiftPhase

# Loss setup
MultiScaleSTFTLoss.window_lengths: [2048, 512]
MelSpectrogramLoss.n_mels: [5, 10, 20, 40, 80, 160, 320]
MelSpectrogramLoss.window_lengths: [32, 64, 128, 256, 512, 1024, 2048]
MelSpectrogramLoss.mel_fmin: [0, 0, 0, 0, 0, 0, 0]
MelSpectrogramLoss.mel_fmax: [null, null, null, null, null, null, null]
MelSpectrogramLoss.pow: 1.0
MelSpectrogramLoss.clamp_eps: 1.0e-5
MelSpectrogramLoss.mag_weight: 0.0

# Data
batch_size: 72
train/AudioDataset.duration: 0.38
train/AudioDataset.n_examples: 10000000

val/AudioDataset.duration: 5.0
val/build_transform.augment_prob: 1.0
val/AudioDataset.n_examples: 250

test/AudioDataset.duration: 10.0
test/build_transform.augment_prob: 1.0
test/AudioDataset.n_examples: 1000

AudioLoader.shuffle: true
AudioDataset.without_replacement: true

train/build_dataset.folders:
  speech_fb:
    - /data/daps/train
  speech_hq:
    - /data/vctk
    - /data/vocalset
    - /data/read_speech
    - /data/french_speech
  speech_uq:
    - /data/emotional_speech/
    - /data/common_voice/
    - /data/german_speech/
    - /data/russian_speech/
    - /data/spanish_speech/
  music_hq:
    - /data/musdb/train
  music_uq:
    - /data/jamendo
  general:
    - /data/audioset/data/unbalanced_train_segments/
    - /data/audioset/data/balanced_train_segments/

val/build_dataset.folders:
  speech_hq:
    - /data/daps/val
  music_hq:
    - /data/musdb/test
  general:
    - /data/audioset/data/eval_segments/

test/build_dataset.folders:
  speech_hq:
    - /data/daps/test
  music_hq:
    - /data/musdb/test
  general:
    - /data/audioset/data/eval_segments/
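
Across all three configs, keys prefixed with `train/`, `val/`, or `test/` scope a value to that split (for example `val/AudioDataset.duration: 5.0` versus the training value of `0.38`), while unprefixed keys such as `AudioDataset.without_replacement` apply everywhere. A toy illustration of that convention; the resolver is a hypothetical stand-in and assumes nothing about the repo's actual config machinery:

```python
from typing import Any, Dict

# Values copied from the configs above; the resolver itself is hypothetical.
config: Dict[str, Any] = {
    "AudioDataset.without_replacement": True,   # unprefixed: shared by all splits
    "train/AudioDataset.duration": 0.38,
    "val/AudioDataset.duration": 5.0,
    "test/AudioDataset.duration": 10.0,
}

def resolve(key: str, split: str) -> Any:
    """Prefer the split-scoped value, fall back to the unprefixed default."""
    return config.get(f"{split}/{key}", config.get(key))

assert resolve("AudioDataset.duration", "val") == 5.0
assert resolve("AudioDataset.without_replacement", "test") is True
```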
2 changes: 1 addition & 1 deletion dac/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.0.4"
+__version__ = "0.0.5"

# preserved here for legacy reasons
__model_version__ = "latest"