diff --git a/docs/audiocraft/data/audio.html b/docs/audiocraft/data/audio.html new file mode 100644 index 00000000..617b6e7b --- /dev/null +++ b/docs/audiocraft/data/audio.html @@ -0,0 +1,522 @@ + + + + + + +audiocraft.data.audio API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.data.audio

+
+
+

Audio IO methods are defined in this module (info, read, write). We rely on the av library for faster reads when possible, otherwise on torchaudio.

+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Audio IO methods are defined in this module (info, read, write).
+We rely on the av library for faster reads when possible, otherwise on torchaudio.
+"""
+
+from dataclasses import dataclass
+from pathlib import Path
+import logging
+import typing as tp
+
+import numpy as np
+import soundfile
+import torch
+from torch.nn import functional as F
+import torchaudio as ta
+
+import av
+
+from .audio_utils import f32_pcm, i16_pcm, normalize_audio
+
+
+_av_initialized = False
+
+
+def _init_av():
+    global _av_initialized
+    if _av_initialized:
+        return
+    logger = logging.getLogger('libav.mp3')
+    logger.setLevel(logging.ERROR)
+    _av_initialized = True
+
+
+@dataclass(frozen=True)
+class AudioFileInfo:
+    sample_rate: int
+    duration: float
+    channels: int
+
+
+def _av_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
+    _init_av()
+    with av.open(str(filepath)) as af:
+        stream = af.streams.audio[0]
+        sample_rate = stream.codec_context.sample_rate
+        duration = float(stream.duration * stream.time_base)
+        channels = stream.channels
+        return AudioFileInfo(sample_rate, duration, channels)
+
+
+def _soundfile_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
+    info = soundfile.info(filepath)
+    return AudioFileInfo(info.samplerate, info.duration, info.channels)
+
+
+def audio_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
+    # torchaudio no longer returns useful duration information for some formats like mp3s.
+    filepath = Path(filepath)
+    if filepath.suffix in ['.flac', '.ogg']:  # TODO: Validate .ogg can be safely read with av_info
+        # ffmpeg has some weird issue with flac.
+        return _soundfile_info(filepath)
+    else:
+        return _av_info(filepath)
+
+
+def _av_read(filepath: tp.Union[str, Path], seek_time: float = 0, duration: float = -1.) -> tp.Tuple[torch.Tensor, int]:
+    """FFMPEG-based audio file reading using PyAV bindings.
+    Soundfile cannot read mp3 and av_read is more efficient than torchaudio.
+
+    Args:
+        filepath (str or Path): Path to audio file to read.
+        seek_time (float): Time at which to start reading in the file.
+        duration (float): Duration to read from the file. If set to -1, the whole file is read.
+    Returns:
+        Tuple[torch.Tensor, int]: Tuple containing audio data and sample rate
+    """
+    _init_av()
+    with av.open(str(filepath)) as af:
+        stream = af.streams.audio[0]
+        sr = stream.codec_context.sample_rate
+        num_frames = int(sr * duration) if duration >= 0 else -1
+        frame_offset = int(sr * seek_time)
+        # we need a small negative offset otherwise we get some edge artifact
+        # from the mp3 decoder.
+        af.seek(int(max(0, (seek_time - 0.1)) / stream.time_base), stream=stream)
+        frames = []
+        length = 0
+        for frame in af.decode(streams=stream.index):
+            current_offset = int(frame.rate * frame.pts * frame.time_base)
+            strip = max(0, frame_offset - current_offset)
+            buf = torch.from_numpy(frame.to_ndarray())
+            if buf.shape[0] != stream.channels:
+                buf = buf.view(-1, stream.channels).t()
+            buf = buf[:, strip:]
+            frames.append(buf)
+            length += buf.shape[1]
+            if num_frames > 0 and length >= num_frames:
+                break
+        assert frames
+        # If the above assert fails, it is likely because we seeked past the end of file point,
+        # in which case ffmpeg returns a single frame with only zeros, and a weird timestamp.
+        # This will need proper debugging, in due time.
+        wav = torch.cat(frames, dim=1)
+        assert wav.shape[0] == stream.channels
+        if num_frames > 0:
+            wav = wav[:, :num_frames]
+        return f32_pcm(wav), sr
+
+
+def audio_read(filepath: tp.Union[str, Path], seek_time: float = 0.,
+               duration: float = -1., pad: bool = False) -> tp.Tuple[torch.Tensor, int]:
+    """Read audio by picking the most appropriate backend tool based on the audio format.
+
+    Args:
+        filepath (str or Path): Path to audio file to read.
+        seek_time (float): Time at which to start reading in the file.
+        duration (float): Duration to read from the file. If set to -1, the whole file is read.
+        pad (bool): Pad output audio if not reaching expected duration.
+    Returns:
+        Tuple[torch.Tensor, int]: Tuple containing audio data and sample rate.
+    """
+    fp = Path(filepath)
+    if fp.suffix in ['.flac', '.ogg']:  # TODO: check if we can safely use av_read for .ogg
+        # There is some bug with ffmpeg and reading flac
+        info = _soundfile_info(filepath)
+        frames = -1 if duration <= 0 else int(duration * info.sample_rate)
+        frame_offset = int(seek_time * info.sample_rate)
+        wav, sr = soundfile.read(filepath, start=frame_offset, frames=frames, dtype=np.float32)
+        assert info.sample_rate == sr, f"Mismatch of sample rates {info.sample_rate} {sr}"
+        wav = torch.from_numpy(wav).t().contiguous()
+        if len(wav.shape) == 1:
+            wav = torch.unsqueeze(wav, 0)
+    elif (
+        fp.suffix in ['.wav', '.mp3'] and fp.suffix[1:] in ta.utils.sox_utils.list_read_formats()
+        and duration <= 0 and seek_time == 0
+    ):
+        # Torchaudio is faster if we load an entire file at once.
+        wav, sr = ta.load(fp)
+    else:
+        wav, sr = _av_read(filepath, seek_time, duration)
+    if pad and duration > 0:
+        expected_frames = int(duration * sr)
+        wav = F.pad(wav, (0, expected_frames - wav.shape[-1]))
+    return wav, sr
+
+
+def audio_write(stem_name: tp.Union[str, Path],
+                wav: torch.Tensor, sample_rate: int,
+                format: str = 'wav', mp3_rate: int = 320, normalize: bool = True,
+                strategy: str = 'peak', peak_clip_headroom_db: float = 1,
+                rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
+                loudness_compressor: bool = False,
+                log_clipping: bool = True, make_parent_dir: bool = True,
+                add_suffix: bool = True) -> Path:
+    """Convenience function for saving audio to disk. Returns the filename the audio was written to.
+
+    Args:
+        stem_name (str or Path): Filename without extension which will be added automatically.
+        format (str): Either "wav" or "mp3".
+        mp3_rate (int): kbps when using mp3s.
+        normalize (bool): if `True` (default), normalizes according to the prescribed
+            strategy (see after). If `False`, the strategy is only used in case clipping
+            would happen.
+        strategy (str): Can be either 'clip', 'peak', or 'rms'. Default is 'peak',
+            i.e. audio is normalized by its largest value. RMS normalizes by root-mean-square
+            with extra headroom to avoid clipping. 'clip' just clips.
+        peak_clip_headroom_db (float): Headroom in dB when doing 'peak' or 'clip' strategy.
+        rms_headroom_db (float): Headroom in dB when doing 'rms' strategy. This must be much larger
+            than the `peak_clip` one to avoid further clipping.
+        loudness_headroom_db (float): Target loudness for loudness normalization.
+        loudness_compressor (bool): Uses tanh for soft clipping when strategy is 'loudness'.
+        log_clipping (bool): If True, basic logging on stderr when clipping still
+            occurs despite strategy (only for 'rms').
+        make_parent_dir (bool): Make parent directory if it doesn't exist.
+    Returns:
+        Path: Path of the saved audio.
+    """
+    assert wav.dtype.is_floating_point, "wav is not floating point"
+    if wav.dim() == 1:
+        wav = wav[None]
+    elif wav.dim() > 2:
+        raise ValueError("Input wav should be at most 2 dimension.")
+    assert wav.isfinite().all()
+    wav = normalize_audio(wav, normalize, strategy, peak_clip_headroom_db,
+                          rms_headroom_db, loudness_headroom_db, log_clipping=log_clipping,
+                          sample_rate=sample_rate, stem_name=str(stem_name))
+    kwargs: dict = {}
+    if format == 'mp3':
+        suffix = '.mp3'
+        kwargs.update({"compression": mp3_rate})
+    elif format == 'wav':
+        wav = i16_pcm(wav)
+        suffix = '.wav'
+        kwargs.update({"encoding": "PCM_S", "bits_per_sample": 16})
+    else:
+        raise RuntimeError(f"Invalid format {format}. Only wav or mp3 are supported.")
+    if not add_suffix:
+        suffix = ''
+    path = Path(str(stem_name) + suffix)
+    if make_parent_dir:
+        path.parent.mkdir(exist_ok=True, parents=True)
+    try:
+        ta.save(path, wav, sample_rate, **kwargs)
+    except Exception:
+        if path.exists():
+            # we do not want to leave half written files around.
+            path.unlink()
+        raise
+    return path
+
+
+
+
+
+
+
+

Functions

+
+
+def audio_info(filepath: Union[str, pathlib.Path]) ‑> AudioFileInfo +
+
+
+
+ +Expand source code + +
def audio_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
+    # torchaudio no longer returns useful duration information for some formats like mp3s.
+    filepath = Path(filepath)
+    if filepath.suffix in ['.flac', '.ogg']:  # TODO: Validate .ogg can be safely read with av_info
+        # ffmpeg has some weird issue with flac.
+        return _soundfile_info(filepath)
+    else:
+        return _av_info(filepath)
+
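Example (a minimal usage sketch; the file path below is hypothetical):

from audiocraft.data.audio import audio_info

info = audio_info('samples/track.mp3')  # hypothetical file
print(info.sample_rate, info.duration, info.channels)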
+
+
+def audio_read(filepath: Union[str, pathlib.Path], seek_time: float = 0.0, duration: float = -1.0, pad: bool = False) ‑> Tuple[torch.Tensor, int] +
+
+

Read audio by picking the most appropriate backend tool based on the audio format.

+

Args

+
+
filepath : str or Path
+
Path to audio file to read.
+
seek_time : float
+
Time at which to start reading in the file.
+
duration : float
+
Duration to read from the file. If set to -1, the whole file is read.
+
pad : bool
+
Pad output audio if not reaching expected duration.
+
+

Returns

+
+
Tuple[torch.Tensor, int]
+
Tuple containing audio data and sample rate.
+
+
+ +Expand source code + +
def audio_read(filepath: tp.Union[str, Path], seek_time: float = 0.,
+               duration: float = -1., pad: bool = False) -> tp.Tuple[torch.Tensor, int]:
+    """Read audio by picking the most appropriate backend tool based on the audio format.
+
+    Args:
+        filepath (str or Path): Path to audio file to read.
+        seek_time (float): Time at which to start reading in the file.
+        duration (float): Duration to read from the file. If set to -1, the whole file is read.
+        pad (bool): Pad output audio if not reaching expected duration.
+    Returns:
+        Tuple[torch.Tensor, int]: Tuple containing audio data and sample rate.
+    """
+    fp = Path(filepath)
+    if fp.suffix in ['.flac', '.ogg']:  # TODO: check if we can safely use av_read for .ogg
+        # There is some bug with ffmpeg and reading flac
+        info = _soundfile_info(filepath)
+        frames = -1 if duration <= 0 else int(duration * info.sample_rate)
+        frame_offset = int(seek_time * info.sample_rate)
+        wav, sr = soundfile.read(filepath, start=frame_offset, frames=frames, dtype=np.float32)
+        assert info.sample_rate == sr, f"Mismatch of sample rates {info.sample_rate} {sr}"
+        wav = torch.from_numpy(wav).t().contiguous()
+        if len(wav.shape) == 1:
+            wav = torch.unsqueeze(wav, 0)
+    elif (
+        fp.suffix in ['.wav', '.mp3'] and fp.suffix[1:] in ta.utils.sox_utils.list_read_formats()
+        and duration <= 0 and seek_time == 0
+    ):
+        # Torchaudio is faster if we load an entire file at once.
+        wav, sr = ta.load(fp)
+    else:
+        wav, sr = _av_read(filepath, seek_time, duration)
+    if pad and duration > 0:
+        expected_frames = int(duration * sr)
+        wav = F.pad(wav, (0, expected_frames - wav.shape[-1]))
+    return wav, sr
+
+
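Example (a minimal sketch; the path, seek time and duration are assumptions): read a 5-second excerpt starting at 10 s, padded to the full 5 s if the file ends earlier.

from audiocraft.data.audio import audio_read

wav, sr = audio_read('samples/track.mp3', seek_time=10.0, duration=5.0, pad=True)
# wav has shape [channels, int(5.0 * sr)] because pad=True pads up to the requested duration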
+
+def audio_write(stem_name: Union[str, pathlib.Path], wav: torch.Tensor, sample_rate: int, format: str = 'wav', mp3_rate: int = 320, normalize: bool = True, strategy: str = 'peak', peak_clip_headroom_db: float = 1, rms_headroom_db: float = 18, loudness_headroom_db: float = 14, loudness_compressor: bool = False, log_clipping: bool = True, make_parent_dir: bool = True, add_suffix: bool = True) ‑> pathlib.Path +
+
+

Convenience function for saving audio to disk. Returns the filename the audio was written to.

+

Args

+
+
stem_name : str or Path
+
Filename without extension which will be added automatically.
+
format : str
+
Either "wav" or "mp3".
+
mp3_rate : int
+
kbps when using mp3s.
+
normalize : bool
+
if True (default), normalizes according to the prescribed +strategy (see after). If False, the strategy is only used in case clipping +would happen.
+
strategy : str
+
Can be either 'clip', 'peak', or 'rms'. Default is 'peak', +i.e. audio is normalized by its largest value. RMS normalizes by root-mean-square +with extra headroom to avoid clipping. 'clip' just clips.
+
peak_clip_headroom_db : float
+
Headroom in dB when doing 'peak' or 'clip' strategy.
+
rms_headroom_db : float
+
Headroom in dB when doing 'rms' strategy. This must be much larger +than the peak_clip one to avoid further clipping.
+
loudness_headroom_db : float
+
Target loudness for loudness normalization.
+
loudness_compressor : bool
+
Uses tanh for soft clipping when strategy is 'loudness'.
+
log_clipping : bool
+
If True, basic logging on stderr when clipping still occurs despite strategy (only for 'rms').
+
make_parent_dir : bool
+
Make parent directory if it doesn't exist.
+
+

Returns

+
+
Path
+
Path of the saved audio.
+
+
+ +Expand source code + +
def audio_write(stem_name: tp.Union[str, Path],
+                wav: torch.Tensor, sample_rate: int,
+                format: str = 'wav', mp3_rate: int = 320, normalize: bool = True,
+                strategy: str = 'peak', peak_clip_headroom_db: float = 1,
+                rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
+                loudness_compressor: bool = False,
+                log_clipping: bool = True, make_parent_dir: bool = True,
+                add_suffix: bool = True) -> Path:
+    """Convenience function for saving audio to disk. Returns the filename the audio was written to.
+
+    Args:
+        stem_name (str or Path): Filename without extension which will be added automatically.
+        format (str): Either "wav" or "mp3".
+        mp3_rate (int): kbps when using mp3s.
+        normalize (bool): if `True` (default), normalizes according to the prescribed
+            strategy (see after). If `False`, the strategy is only used in case clipping
+            would happen.
+        strategy (str): Can be either 'clip', 'peak', or 'rms'. Default is 'peak',
+            i.e. audio is normalized by its largest value. RMS normalizes by root-mean-square
+            with extra headroom to avoid clipping. 'clip' just clips.
+        peak_clip_headroom_db (float): Headroom in dB when doing 'peak' or 'clip' strategy.
+        rms_headroom_db (float): Headroom in dB when doing 'rms' strategy. This must be much larger
+            than the `peak_clip` one to avoid further clipping.
+        loudness_headroom_db (float): Target loudness for loudness normalization.
+        loudness_compressor (bool): Uses tanh for soft clipping when strategy is 'loudness'.
+        log_clipping (bool): If True, basic logging on stderr when clipping still
+            occurs despite strategy (only for 'rms').
+        make_parent_dir (bool): Make parent directory if it doesn't exist.
+    Returns:
+        Path: Path of the saved audio.
+    """
+    assert wav.dtype.is_floating_point, "wav is not floating point"
+    if wav.dim() == 1:
+        wav = wav[None]
+    elif wav.dim() > 2:
+        raise ValueError("Input wav should be at most 2 dimension.")
+    assert wav.isfinite().all()
+    wav = normalize_audio(wav, normalize, strategy, peak_clip_headroom_db,
+                          rms_headroom_db, loudness_headroom_db, log_clipping=log_clipping,
+                          sample_rate=sample_rate, stem_name=str(stem_name))
+    kwargs: dict = {}
+    if format == 'mp3':
+        suffix = '.mp3'
+        kwargs.update({"compression": mp3_rate})
+    elif format == 'wav':
+        wav = i16_pcm(wav)
+        suffix = '.wav'
+        kwargs.update({"encoding": "PCM_S", "bits_per_sample": 16})
+    else:
+        raise RuntimeError(f"Invalid format {format}. Only wav or mp3 are supported.")
+    if not add_suffix:
+        suffix = ''
+    path = Path(str(stem_name) + suffix)
+    if make_parent_dir:
+        path.parent.mkdir(exist_ok=True, parents=True)
+    try:
+        ta.save(path, wav, sample_rate, **kwargs)
+    except Exception:
+        if path.exists():
+            # we do not want to leave half written files around.
+            path.unlink()
+        raise
+    return path
+
+
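Example (a minimal sketch; the output stem and tone parameters are arbitrary): write a mono test tone as a 16-bit PCM wav with the default peak normalization.

import math
import torch
from audiocraft.data.audio import audio_write

sr = 32000
t = torch.arange(sr) / sr                                   # 1 second of time steps
wav = 0.1 * torch.sin(2 * math.pi * 440 * t).unsqueeze(0)   # [1, sr] mono 440 Hz tone
out_path = audio_write('outputs/demo', wav, sr, format='wav', strategy='peak')
# out_path == Path('outputs/demo.wav'); the parent directory is created if needed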
+
+
+
+

Classes

+
+
+class AudioFileInfo +(sample_rate: int, duration: float, channels: int) +
+
+

AudioFileInfo(sample_rate: int, duration: float, channels: int)

+
+ +Expand source code + +
class AudioFileInfo:
+    sample_rate: int
+    duration: float
+    channels: int
+
+

Class variables

+
+
var channels : int
+
+
+
+
var duration : float
+
+
+
+
var sample_rate : int
+
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/data/audio_dataset.html b/docs/audiocraft/data/audio_dataset.html new file mode 100644 index 00000000..c3b10ba1 --- /dev/null +++ b/docs/audiocraft/data/audio_dataset.html @@ -0,0 +1,1539 @@ + + + + + + +audiocraft.data.audio_dataset API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.data.audio_dataset

+
+
+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import copy
+from concurrent.futures import ThreadPoolExecutor, Future
+from dataclasses import dataclass, fields
+from contextlib import ExitStack
+import gzip
+import json
+import logging
+import os
+from pathlib import Path
+import random
+import sys
+import typing as tp
+
+import torch
+import torch.nn.functional as F
+
+from .audio import audio_read, audio_info
+from .audio_utils import convert_audio
+from .zip import PathInZip
+
+try:
+    import dora
+except ImportError:
+    dora = None  # type: ignore
+
+
+@dataclass(order=True)
+class BaseInfo:
+
+    @classmethod
+    def _dict2fields(cls, dictionary: dict):
+        return {
+            field.name: dictionary[field.name]
+            for field in fields(cls) if field.name in dictionary
+        }
+
+    @classmethod
+    def from_dict(cls, dictionary: dict):
+        _dictionary = cls._dict2fields(dictionary)
+        return cls(**_dictionary)
+
+    def to_dict(self):
+        return {
+            field.name: self.__getattribute__(field.name)
+            for field in fields(self)
+            }
+
+
+@dataclass(order=True)
+class AudioMeta(BaseInfo):
+    path: str
+    duration: float
+    sample_rate: int
+    amplitude: tp.Optional[float] = None
+    weight: tp.Optional[float] = None
+    # info_path is used to load additional information about the audio file that is stored in zip files.
+    info_path: tp.Optional[PathInZip] = None
+
+    @classmethod
+    def from_dict(cls, dictionary: dict):
+        base = cls._dict2fields(dictionary)
+        if 'info_path' in base and base['info_path'] is not None:
+            base['info_path'] = PathInZip(base['info_path'])
+        return cls(**base)
+
+    def to_dict(self):
+        d = super().to_dict()
+        if d['info_path'] is not None:
+            d['info_path'] = str(d['info_path'])
+        return d
+
+
+@dataclass(order=True)
+class SegmentInfo(BaseInfo):
+    meta: AudioMeta
+    seek_time: float
+    n_frames: int  # actual number of frames without padding
+    total_frames: int  # total number of frames, padding included
+    sample_rate: int  # actual sample rate
+
+
+DEFAULT_EXTS = ['.wav', '.mp3', '.flac', '.ogg', '.m4a']
+
+logger = logging.getLogger(__name__)
+
+
+def _get_audio_meta(file_path: str, minimal: bool = True) -> AudioMeta:
+    """AudioMeta from a path to an audio file.
+
+    Args:
+        file_path (str): Resolved path of valid audio file.
+        minimal (bool): Whether to only load the minimal set of metadata (takes longer if not).
+    Returns:
+        AudioMeta: Audio file path and its metadata.
+    """
+    info = audio_info(file_path)
+    amplitude: tp.Optional[float] = None
+    if not minimal:
+        wav, sr = audio_read(file_path)
+        amplitude = wav.abs().max().item()
+    return AudioMeta(file_path, info.duration, info.sample_rate, amplitude)
+
+
+def _resolve_audio_meta(m: AudioMeta, fast: bool = True) -> AudioMeta:
+    """If Dora is available as a dependency, try to resolve potential relative paths
+    in list of AudioMeta. This method is expected to be used when loading meta from file.
+
+    Args:
+        m (AudioMeta): Audio meta to resolve.
+        fast (bool): If True, uses a really fast check for determining if a file is already absolute or not.
+            Only valid on Linux/Mac.
+    Returns:
+        AudioMeta: Audio meta with resolved path.
+    """
+    def is_abs(m):
+        if fast:
+            return str(m)[0] == '/'
+        else:
+            return os.path.isabs(str(m))
+
+    if not dora:
+        return m
+
+    if not is_abs(m.path):
+        m.path = dora.git_save.to_absolute_path(m.path)
+    if m.info_path is not None and not is_abs(m.info_path.zip_path):
+        m.info_path.zip_path = dora.git_save.to_absolute_path(m.info_path.zip_path)
+    return m
+
+
+def find_audio_files(path: tp.Union[Path, str],
+                     exts: tp.List[str] = DEFAULT_EXTS,
+                     resolve: bool = True,
+                     minimal: bool = True,
+                     progress: bool = False,
+                     workers: int = 0) -> tp.List[AudioMeta]:
+    """Build a list of AudioMeta from a given path,
+    collecting relevant audio files and fetching meta info.
+
+    Args:
+        path (str or Path): Path to folder containing audio files.
+        exts (list of str): List of file extensions to consider for audio files.
+        minimal (bool): Whether to only load the minimal set of metadata (takes longer if not).
+        progress (bool): Whether to log progress on audio files collection.
+        workers (int): number of parallel workers, if 0, use only the current thread.
+    Returns:
+        List[AudioMeta]: List of audio file paths and their metadata.
+    """
+    audio_files = []
+    futures: tp.List[Future] = []
+    pool: tp.Optional[ThreadPoolExecutor] = None
+    with ExitStack() as stack:
+        if workers > 0:
+            pool = ThreadPoolExecutor(workers)
+            stack.enter_context(pool)
+
+        if progress:
+            print("Finding audio files...")
+        for root, folders, files in os.walk(path, followlinks=True):
+            for file in files:
+                full_path = Path(root) / file
+                if full_path.suffix.lower() in exts:
+                    audio_files.append(full_path)
+                    if pool is not None:
+                        futures.append(pool.submit(_get_audio_meta, str(audio_files[-1]), minimal))
+                    if progress:
+                        print(format(len(audio_files), " 8d"), end='\r', file=sys.stderr)
+
+        if progress:
+            print("Getting audio metadata...")
+        meta: tp.List[AudioMeta] = []
+        for idx, file_path in enumerate(audio_files):
+            try:
+                if pool is None:
+                    m = _get_audio_meta(str(file_path), minimal)
+                else:
+                    m = futures[idx].result()
+                if resolve:
+                    m = _resolve_audio_meta(m)
+            except Exception as err:
+                print("Error with", str(file_path), err, file=sys.stderr)
+                continue
+            meta.append(m)
+            if progress:
+                print(format((1 + idx) / len(audio_files), " 3.1%"), end='\r', file=sys.stderr)
+    meta.sort()
+    return meta
+
+
+def load_audio_meta(path: tp.Union[str, Path],
+                    resolve: bool = True, fast: bool = True) -> tp.List[AudioMeta]:
+    """Load list of AudioMeta from an optionally compressed json file.
+
+    Args:
+        path (str or Path): Path to JSON file.
+        resolve (bool): Whether to resolve the path from AudioMeta (default=True).
+        fast (bool): activates some tricks to make things faster.
+    Returns:
+        List[AudioMeta]: List of audio file paths and their metadata.
+    """
+    open_fn = gzip.open if str(path).lower().endswith('.gz') else open
+    with open_fn(path, 'rb') as fp:  # type: ignore
+        lines = fp.readlines()
+    meta = []
+    for line in lines:
+        d = json.loads(line)
+        m = AudioMeta.from_dict(d)
+        if resolve:
+            m = _resolve_audio_meta(m, fast=fast)
+        meta.append(m)
+    return meta
+
+
+def save_audio_meta(path: tp.Union[str, Path], meta: tp.List[AudioMeta]):
+    """Save the audio metadata to the file pointer as json.
+
+    Args:
+        path (str or Path): Path to JSON file.
+        meta (list of AudioMeta): List of audio meta to save.
+    """
+    Path(path).parent.mkdir(exist_ok=True, parents=True)
+    open_fn = gzip.open if str(path).lower().endswith('.gz') else open
+    with open_fn(path, 'wb') as fp:  # type: ignore
+        for m in meta:
+            json_str = json.dumps(m.to_dict()) + '\n'
+            json_bytes = json_str.encode('utf-8')
+            fp.write(json_bytes)
+
+
+class AudioDataset:
+    """Base audio dataset.
+
+    The dataset takes a list of AudioMeta and creates a dataset composed of segments of audio
+    and potentially additional information, by creating random segments from the list of audio
+    files referenced in the metadata and applying minimal data pre-processing such as resampling,
+    mixing of channels, padding, etc.
+
+    If no segment_duration value is provided, the AudioDataset will return the full wav for each
+    audio file. Otherwise, it will randomly sample audio files and create a segment of the specified
+    duration, applying padding if required.
+
+    By default, only the torch Tensor corresponding to the waveform is returned. Setting return_info=True
+    allows to return a tuple containing the torch Tensor and additional metadata on the segment and the
+    original audio meta.
+
+    Args:
+        meta (tp.List[AudioMeta]): List of audio files metadata.
+        segment_duration (float): Optional segment duration of audio to load.
+            If not specified, the dataset will load the full audio segment from the file.
+        shuffle (bool): Set to `True` to have the data reshuffled at every epoch.
+        sample_rate (int): Target sample rate of the loaded audio samples.
+        channels (int): Target number of channels of the loaded audio samples.
+        sample_on_duration (bool): Set to `True` to sample segments with probability
+            dependent on audio file duration. This is only used if `segment_duration` is provided.
+        sample_on_weight (bool): Set to `True` to sample segments using the `weight` entry of
+            `AudioMeta`. If `sample_on_duration` is also True, the actual weight will be the product
+            of the file duration and file weight. This is only used if `segment_duration` is provided.
+        min_segment_ratio (float): Minimum segment ratio to use when the audio file
+            is shorter than the desired segment.
+        max_read_retry (int): Maximum number of retries to sample an audio segment from the dataset.
+        return_info (bool): Whether to return the wav only or return wav along with segment info and metadata.
+        min_audio_duration (tp.Optional[float], optional): Minimum audio file duration, in seconds, if provided
+            audio shorter than this will be filtered out.
+        max_audio_duration (tp.Optional[float], optional): Maximal audio file duration in seconds, if provided
+            audio longer than this will be filtered out.
+    """
+    def __init__(self,
+                 meta: tp.List[AudioMeta],
+                 segment_duration: tp.Optional[float] = None,
+                 shuffle: bool = True,
+                 num_samples: int = 10_000,
+                 sample_rate: int = 48_000,
+                 channels: int = 2,
+                 pad: bool = True,
+                 sample_on_duration: bool = True,
+                 sample_on_weight: bool = True,
+                 min_segment_ratio: float = 0.5,
+                 max_read_retry: int = 10,
+                 return_info: bool = False,
+                 min_audio_duration: tp.Optional[float] = None,
+                 max_audio_duration: tp.Optional[float] = None
+                 ):
+        assert len(meta) > 0, 'No audio meta provided to AudioDataset. Please check loading of audio meta.'
+        assert segment_duration is None or segment_duration > 0
+        assert segment_duration is None or min_segment_ratio >= 0
+        logging.debug(f'sample_on_duration: {sample_on_duration}')
+        logging.debug(f'sample_on_weight: {sample_on_weight}')
+        logging.debug(f'pad: {pad}')
+        logging.debug(f'min_segment_ratio: {min_segment_ratio}')
+
+        self.segment_duration = segment_duration
+        self.min_segment_ratio = min_segment_ratio
+        self.max_audio_duration = max_audio_duration
+        self.min_audio_duration = min_audio_duration
+        if self.min_audio_duration is not None and self.max_audio_duration is not None:
+            assert self.min_audio_duration <= self.max_audio_duration
+        self.meta: tp.List[AudioMeta] = self._filter_duration(meta)
+        assert len(self.meta)  # Fail fast if all data has been filtered.
+        self.total_duration = sum(d.duration for d in self.meta)
+
+        if segment_duration is None:
+            num_samples = len(self.meta)
+        self.num_samples = num_samples
+        self.shuffle = shuffle
+        self.sample_rate = sample_rate
+        self.channels = channels
+        self.pad = pad
+        self.sample_on_weight = sample_on_weight
+        self.sample_on_duration = sample_on_duration
+        self.sampling_probabilities = self._get_sampling_probabilities()
+        self.max_read_retry = max_read_retry
+        self.return_info = return_info
+
+    def __len__(self):
+        return self.num_samples
+
+    def _get_sampling_probabilities(self, normalized: bool = True):
+        """Return the sampling probabilities for each file inside `self.meta`.
+        """
+        scores: tp.List[float] = []
+        for file_meta in self.meta:
+            score = 1.
+            if self.sample_on_weight and file_meta.weight is not None:
+                score *= file_meta.weight
+            if self.sample_on_duration:
+                score *= file_meta.duration
+            scores.append(score)
+        probabilities = torch.tensor(scores)
+        if normalized:
+            probabilities /= probabilities.sum()
+        return probabilities
+
+    def sample_file(self, rng: torch.Generator) -> AudioMeta:
+        """Sample a given file from `self.meta`. Can be overriden in subclasses.
+        This is only called if `segment_duration` is not None.
+
+        You must use the provided random number generator `rng` for reproducibility.
+        """
+        if not self.sample_on_weight and not self.sample_on_duration:
+            file_index = int(torch.randint(len(self.sampling_probabilities), (1,), generator=rng).item())
+        else:
+            file_index = int(torch.multinomial(self.sampling_probabilities, 1, generator=rng).item())
+
+        return self.meta[file_index]
+
+    def __getitem__(self, index: int) -> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, SegmentInfo]]:
+        if self.segment_duration is None:
+            file_meta = self.meta[index]
+            out, sr = audio_read(file_meta.path)
+            out = convert_audio(out, sr, self.sample_rate, self.channels)
+            n_frames = out.shape[-1]
+            segment_info = SegmentInfo(file_meta, seek_time=0., n_frames=n_frames, total_frames=n_frames,
+                                       sample_rate=self.sample_rate)
+        else:
+            rng = torch.Generator()
+            if self.shuffle:
+                # We use index, plus extra randomness
+                rng.manual_seed(index + self.num_samples * random.randint(0, 2**24))
+            else:
+                # We only use index
+                rng.manual_seed(index)
+
+            for retry in range(self.max_read_retry):
+                file_meta = self.sample_file(rng)
+                # We add some variance in the file position even if audio file is smaller than segment
+                # without ending up with empty segments
+                max_seek = max(0, file_meta.duration - self.segment_duration * self.min_segment_ratio)
+                seek_time = torch.rand(1, generator=rng).item() * max_seek
+                try:
+                    out, sr = audio_read(file_meta.path, seek_time, self.segment_duration, pad=False)
+                    out = convert_audio(out, sr, self.sample_rate, self.channels)
+                    n_frames = out.shape[-1]
+                    target_frames = int(self.segment_duration * self.sample_rate)
+                    if self.pad:
+                        out = F.pad(out, (0, target_frames - n_frames))
+                    segment_info = SegmentInfo(file_meta, seek_time, n_frames=n_frames, total_frames=target_frames,
+                                               sample_rate=self.sample_rate)
+                except Exception as exc:
+                    logger.warning("Error opening file %s: %r", file_meta.path, exc)
+                    if retry == self.max_read_retry - 1:
+                        raise
+                else:
+                    break
+
+        if self.return_info:
+            # Returns the wav and additional information on the wave segment
+            return out, segment_info
+        else:
+            return out
+
+    def collater(self, samples):
+        """The collater function has to be provided to the dataloader
+        if AudioDataset has return_info=True in order to properly collate
+        the samples of a batch.
+        """
+        if self.segment_duration is None and len(samples) > 1:
+            assert self.pad, "Must allow padding when batching examples of different durations."
+
+        # In this case the audio reaching the collater is of variable length as segment_duration=None.
+        to_pad = self.segment_duration is None and self.pad
+        if to_pad:
+            max_len = max([wav.shape[-1] for wav, _ in samples])
+
+            def _pad_wav(wav):
+                return F.pad(wav, (0, max_len - wav.shape[-1]))
+
+        if self.return_info:
+            if len(samples) > 0:
+                assert len(samples[0]) == 2
+                assert isinstance(samples[0][0], torch.Tensor)
+                assert isinstance(samples[0][1], SegmentInfo)
+
+            wavs = [wav for wav, _ in samples]
+            segment_infos = [copy.deepcopy(info) for _, info in samples]
+
+            if to_pad:
+                # Each wav could be of a different duration as they are not segmented.
+                for i in range(len(samples)):
+                    # Determines the total length of the signal with padding, so we update here as we pad.
+                    segment_infos[i].total_frames = max_len
+                    wavs[i] = _pad_wav(wavs[i])
+
+            wav = torch.stack(wavs)
+            return wav, segment_infos
+        else:
+            assert isinstance(samples[0], torch.Tensor)
+            if to_pad:
+                samples = [_pad_wav(s) for s in samples]
+            return torch.stack(samples)
+
+    def _filter_duration(self, meta: tp.List[AudioMeta]) -> tp.List[AudioMeta]:
+        """Filters out audio files with short durations.
+        Removes from meta files that have durations that will not allow to samples examples from them.
+        """
+        orig_len = len(meta)
+
+        # Filter data that is too short.
+        if self.min_audio_duration is not None:
+            meta = [m for m in meta if m.duration >= self.min_audio_duration]
+
+        # Filter data that is too long.
+        if self.max_audio_duration is not None:
+            meta = [m for m in meta if m.duration <= self.max_audio_duration]
+
+        filtered_len = len(meta)
+        removed_percentage = 100*(1-float(filtered_len)/orig_len)
+        msg = 'Removed %.2f percent of the data because it was too short or too long.' % removed_percentage
+        if removed_percentage < 10:
+            logging.debug(msg)
+        else:
+            logging.warning(msg)
+        return meta
+
+    @classmethod
+    def from_meta(cls, root: tp.Union[str, Path], **kwargs):
+        """Instantiate AudioDataset from a path to a directory containing a manifest as a jsonl file.
+
+        Args:
+            root (str or Path): Path to root folder containing audio files.
+            kwargs: Additional keyword arguments for the AudioDataset.
+        """
+        root = Path(root)
+        if root.is_dir():
+            if (root / 'data.jsonl').exists():
+                root = root / 'data.jsonl'
+            elif (root / 'data.jsonl.gz').exists():
+                root = root / 'data.jsonl.gz'
+            else:
+                raise ValueError("Don't know where to read metadata from in the dir. "
+                                 "Expecting either a data.jsonl or data.jsonl.gz file but none found.")
+        meta = load_audio_meta(root)
+        return cls(meta, **kwargs)
+
+    @classmethod
+    def from_path(cls, root: tp.Union[str, Path], minimal_meta: bool = True,
+                  exts: tp.List[str] = DEFAULT_EXTS, **kwargs):
+        """Instantiate AudioDataset from a path containing (possibly nested) audio files.
+
+        Args:
+            root (str or Path): Path to root folder containing audio files.
+            minimal_meta (bool): Whether to only load minimal metadata or not.
+            exts (list of str): Extensions for audio files.
+            kwargs: Additional keyword arguments for the AudioDataset.
+        """
+        root = Path(root)
+        if root.is_file():
+            meta = load_audio_meta(root, resolve=True)
+        else:
+            meta = find_audio_files(root, exts, minimal=minimal_meta, resolve=True)
+        return cls(meta, **kwargs)
+
+
+def main():
+    logging.basicConfig(stream=sys.stderr, level=logging.INFO)
+    parser = argparse.ArgumentParser(
+        prog='audio_dataset',
+        description='Generate .jsonl files by scanning a folder.')
+    parser.add_argument('root', help='Root folder with all the audio files')
+    parser.add_argument('output_meta_file',
+                        help='Output file to store the metadata.')
+    parser.add_argument('--complete',
+                        action='store_false', dest='minimal', default=True,
+                        help='Retrieve all metadata, even the ones that are expensive '
+                             'to compute (e.g. normalization).')
+    parser.add_argument('--resolve',
+                        action='store_true', default=False,
+                        help='Resolve the paths to be absolute and with no symlinks.')
+    parser.add_argument('--workers',
+                        default=10, type=int,
+                        help='Number of workers.')
+    args = parser.parse_args()
+    meta = find_audio_files(args.root, DEFAULT_EXTS, progress=True,
+                            resolve=args.resolve, minimal=args.minimal, workers=args.workers)
+    save_audio_meta(args.output_meta_file, meta)
+
+
+if __name__ == '__main__':
+    main()
+
+
+
+
+
+
+
+

Functions

+
+
+def find_audio_files(path: Union[str, pathlib.Path], exts: List[str] = ['.wav', '.mp3', '.flac', '.ogg', '.m4a'], resolve: bool = True, minimal: bool = True, progress: bool = False, workers: int = 0) ‑> List[AudioMeta] +
+
+

Build a list of AudioMeta from a given path, +collecting relevant audio files and fetching meta info.

+

Args

+
+
path : str or Path
+
Path to folder containing audio files.
+
exts : list of str
+
List of file extensions to consider for audio files.
+
minimal : bool
+
Whether to only load the minimal set of metadata (takes longer if not).
+
progress : bool
+
Whether to log progress on audio files collection.
+
workers : int
+
number of parallel workers, if 0, use only the current thread.
+
+

Returns

+
+
List[AudioMeta]
+
List of audio file paths and their metadata.
+
+
+ +Expand source code + +
def find_audio_files(path: tp.Union[Path, str],
+                     exts: tp.List[str] = DEFAULT_EXTS,
+                     resolve: bool = True,
+                     minimal: bool = True,
+                     progress: bool = False,
+                     workers: int = 0) -> tp.List[AudioMeta]:
+    """Build a list of AudioMeta from a given path,
+    collecting relevant audio files and fetching meta info.
+
+    Args:
+        path (str or Path): Path to folder containing audio files.
+        exts (list of str): List of file extensions to consider for audio files.
+        minimal (bool): Whether to only load the minimal set of metadata (takes longer if not).
+        progress (bool): Whether to log progress on audio files collection.
+        workers (int): number of parallel workers, if 0, use only the current thread.
+    Returns:
+        List[AudioMeta]: List of audio file paths and their metadata.
+    """
+    audio_files = []
+    futures: tp.List[Future] = []
+    pool: tp.Optional[ThreadPoolExecutor] = None
+    with ExitStack() as stack:
+        if workers > 0:
+            pool = ThreadPoolExecutor(workers)
+            stack.enter_context(pool)
+
+        if progress:
+            print("Finding audio files...")
+        for root, folders, files in os.walk(path, followlinks=True):
+            for file in files:
+                full_path = Path(root) / file
+                if full_path.suffix.lower() in exts:
+                    audio_files.append(full_path)
+                    if pool is not None:
+                        futures.append(pool.submit(_get_audio_meta, str(audio_files[-1]), minimal))
+                    if progress:
+                        print(format(len(audio_files), " 8d"), end='\r', file=sys.stderr)
+
+        if progress:
+            print("Getting audio metadata...")
+        meta: tp.List[AudioMeta] = []
+        for idx, file_path in enumerate(audio_files):
+            try:
+                if pool is None:
+                    m = _get_audio_meta(str(file_path), minimal)
+                else:
+                    m = futures[idx].result()
+                if resolve:
+                    m = _resolve_audio_meta(m)
+            except Exception as err:
+                print("Error with", str(file_path), err, file=sys.stderr)
+                continue
+            meta.append(m)
+            if progress:
+                print(format((1 + idx) / len(audio_files), " 3.1%"), end='\r', file=sys.stderr)
+    meta.sort()
+    return meta
+
+
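Example (a minimal sketch; the folder is hypothetical): scan a directory tree with a thread pool and inspect the collected metadata.

from audiocraft.data.audio_dataset import find_audio_files

meta = find_audio_files('/data/music', minimal=True, progress=True, workers=8)
print(len(meta), meta[0].path, meta[0].duration, meta[0].sample_rate)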
+
+def load_audio_meta(path: Union[str, pathlib.Path], resolve: bool = True, fast: bool = True) ‑> List[AudioMeta] +
+
+

Load list of AudioMeta from an optionally compressed json file.

+

Args

+
+
path : str or Path
+
Path to JSON file.
+
resolve : bool
+
Whether to resolve the path from AudioMeta (default=True).
+
fast : bool
+
activates some tricks to make things faster.
+
+

Returns

+
+
List[AudioMeta]
+
List of audio file paths and their metadata.
+
+
+ +Expand source code + +
def load_audio_meta(path: tp.Union[str, Path],
+                    resolve: bool = True, fast: bool = True) -> tp.List[AudioMeta]:
+    """Load list of AudioMeta from an optionally compressed json file.
+
+    Args:
+        path (str or Path): Path to JSON file.
+        resolve (bool): Whether to resolve the path from AudioMeta (default=True).
+        fast (bool): activates some tricks to make things faster.
+    Returns:
+        List[AudioMeta]: List of audio file paths and their metadata.
+    """
+    open_fn = gzip.open if str(path).lower().endswith('.gz') else open
+    with open_fn(path, 'rb') as fp:  # type: ignore
+        lines = fp.readlines()
+    meta = []
+    for line in lines:
+        d = json.loads(line)
+        m = AudioMeta.from_dict(d)
+        if resolve:
+            m = _resolve_audio_meta(m, fast=fast)
+        meta.append(m)
+    return meta
+
+
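Example (a minimal sketch; the manifest path is hypothetical): load a gzipped manifest and compute the total duration.

from audiocraft.data.audio_dataset import load_audio_meta

meta = load_audio_meta('manifests/train/data.jsonl.gz')  # gzip is inferred from the .gz suffix
total_hours = sum(m.duration for m in meta) / 3600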
+
+def main() +
+
+
+
+ +Expand source code + +
def main():
+    logging.basicConfig(stream=sys.stderr, level=logging.INFO)
+    parser = argparse.ArgumentParser(
+        prog='audio_dataset',
+        description='Generate .jsonl files by scanning a folder.')
+    parser.add_argument('root', help='Root folder with all the audio files')
+    parser.add_argument('output_meta_file',
+                        help='Output file to store the metadata.')
+    parser.add_argument('--complete',
+                        action='store_false', dest='minimal', default=True,
+                        help='Retrieve all metadata, even the ones that are expensive '
+                             'to compute (e.g. normalization).')
+    parser.add_argument('--resolve',
+                        action='store_true', default=False,
+                        help='Resolve the paths to be absolute and with no symlinks.')
+    parser.add_argument('--workers',
+                        default=10, type=int,
+                        help='Number of workers.')
+    args = parser.parse_args()
+    meta = find_audio_files(args.root, DEFAULT_EXTS, progress=True,
+                            resolve=args.resolve, minimal=args.minimal, workers=args.workers)
+    save_audio_meta(args.output_meta_file, meta)
+
+
+
+def save_audio_meta(path: Union[str, pathlib.Path], meta: List[AudioMeta]) +
+
+

Save the audio metadata to the given file as json.

+

Args

+
+
path : str or Path
+
Path to JSON file.
+
meta : list of AudioMeta
+
List of audio meta to save.
+
+
+ +Expand source code + +
def save_audio_meta(path: tp.Union[str, Path], meta: tp.List[AudioMeta]):
+    """Save the audio metadata to the file pointer as json.
+
+    Args:
+        path (str or Path): Path to JSON file.
+        meta (list of AudioMeta): List of audio meta to save.
+    """
+    Path(path).parent.mkdir(exist_ok=True, parents=True)
+    open_fn = gzip.open if str(path).lower().endswith('.gz') else open
+    with open_fn(path, 'wb') as fp:  # type: ignore
+        for m in meta:
+            json_str = json.dumps(m.to_dict()) + '\n'
+            json_bytes = json_str.encode('utf-8')
+            fp.write(json_bytes)
+
+
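Example (a minimal sketch; paths are hypothetical): persist a scanned manifest so it can later be consumed by load_audio_meta() or AudioDataset.from_meta().

from audiocraft.data.audio_dataset import find_audio_files, save_audio_meta

meta = find_audio_files('/data/music', minimal=True)
save_audio_meta('manifests/train/data.jsonl.gz', meta)  # one JSON object per line, gzip-compressed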
+
+
+
+

Classes

+
+
+class AudioDataset +(meta: List[AudioMeta], segment_duration: Optional[float] = None, shuffle: bool = True, num_samples: int = 10000, sample_rate: int = 48000, channels: int = 2, pad: bool = True, sample_on_duration: bool = True, sample_on_weight: bool = True, min_segment_ratio: float = 0.5, max_read_retry: int = 10, return_info: bool = False, min_audio_duration: Optional[float] = None, max_audio_duration: Optional[float] = None) +
+
+

Base audio dataset.

+

The dataset takes a list of AudioMeta and creates a dataset composed of segments of audio +and potentially additional information, by creating random segments from the list of audio +files referenced in the metadata and applying minimal data pre-processing such as resampling, +mixing of channels, padding, etc.

+

If no segment_duration value is provided, the AudioDataset will return the full wav for each +audio file. Otherwise, it will randomly sample audio files and create a segment of the specified +duration, applying padding if required.

+

By default, only the torch Tensor corresponding to the waveform is returned. Setting return_info=True +allows to return a tuple containing the torch Tensor and additional metadata on the segment and the +original audio meta.

+

Args

+
+
meta : tp.List[AudioMeta]
+
List of audio files metadata.
+
segment_duration : float
+
Optional segment duration of audio to load. +If not specified, the dataset will load the full audio segment from the file.
+
shuffle : bool
+
Set to True to have the data reshuffled at every epoch.
+
sample_rate : int
+
Target sample rate of the loaded audio samples.
+
channels : int
+
Target number of channels of the loaded audio samples.
+
sample_on_duration : bool
+
Set to True to sample segments with probability +dependent on audio file duration. This is only used if segment_duration is provided.
+
sample_on_weight : bool
+
Set to True to sample segments using the weight entry of +AudioMeta. If sample_on_duration is also True, the actual weight will be the product +of the file duration and file weight. This is only used if segment_duration is provided.
+
min_segment_ratio : float
+
Minimum segment ratio to use when the audio file +is shorter than the desired segment.
+
max_read_retry : int
+
Maximum number of retries to sample an audio segment from the dataset.
+
return_info : bool
+
Whether to return the wav only or return wav along with segment info and metadata.
+
min_audio_duration : tp.Optional[float], optional
+
Minimum audio file duration, in seconds, if provided +audio shorter than this will be filtered out.
+
max_audio_duration : tp.Optional[float], optional
+
Maximal audio file duration in seconds, if provided +audio longer than this will be filtered out.
+
+
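Example (a minimal sketch; the manifest directory and hyper-parameters are assumptions): build a dataset of 10-second mono segments at 32 kHz and batch it with the provided collater.

from torch.utils.data import DataLoader
from audiocraft.data.audio_dataset import AudioDataset

dataset = AudioDataset.from_meta('manifests/train', segment_duration=10.0,
                                 sample_rate=32000, channels=1,
                                 num_samples=1000, return_info=True)
loader = DataLoader(dataset, batch_size=4, collate_fn=dataset.collater)
wav, infos = next(iter(loader))  # wav: [4, 1, 320000], infos: list of SegmentInfo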
+ +Expand source code + +
class AudioDataset:
+    """Base audio dataset.
+
+    The dataset takes a list of AudioMeta and creates a dataset composed of segments of audio
+    and potentially additional information, by creating random segments from the list of audio
+    files referenced in the metadata and applying minimal data pre-processing such as resampling,
+    mixing of channels, padding, etc.
+
+    If no segment_duration value is provided, the AudioDataset will return the full wav for each
+    audio file. Otherwise, it will randomly sample audio files and create a segment of the specified
+    duration, applying padding if required.
+
+    By default, only the torch Tensor corresponding to the waveform is returned. Setting return_info=True
+    allows to return a tuple containing the torch Tensor and additional metadata on the segment and the
+    original audio meta.
+
+    Args:
+        meta (tp.List[AudioMeta]): List of audio files metadata.
+        segment_duration (float): Optional segment duration of audio to load.
+            If not specified, the dataset will load the full audio segment from the file.
+        shuffle (bool): Set to `True` to have the data reshuffled at every epoch.
+        sample_rate (int): Target sample rate of the loaded audio samples.
+        channels (int): Target number of channels of the loaded audio samples.
+        sample_on_duration (bool): Set to `True` to sample segments with probability
+            dependent on audio file duration. This is only used if `segment_duration` is provided.
+        sample_on_weight (bool): Set to `True` to sample segments using the `weight` entry of
+            `AudioMeta`. If `sample_on_duration` is also True, the actual weight will be the product
+            of the file duration and file weight. This is only used if `segment_duration` is provided.
+        min_segment_ratio (float): Minimum segment ratio to use when the audio file
+            is shorter than the desired segment.
+        max_read_retry (int): Maximum number of retries to sample an audio segment from the dataset.
+        return_info (bool): Whether to return the wav only or return wav along with segment info and metadata.
+        min_audio_duration (tp.Optional[float], optional): Minimum audio file duration, in seconds, if provided
+            audio shorter than this will be filtered out.
+        max_audio_duration (tp.Optional[float], optional): Maximal audio file duration in seconds, if provided
+            audio longer than this will be filtered out.
+    """
+    def __init__(self,
+                 meta: tp.List[AudioMeta],
+                 segment_duration: tp.Optional[float] = None,
+                 shuffle: bool = True,
+                 num_samples: int = 10_000,
+                 sample_rate: int = 48_000,
+                 channels: int = 2,
+                 pad: bool = True,
+                 sample_on_duration: bool = True,
+                 sample_on_weight: bool = True,
+                 min_segment_ratio: float = 0.5,
+                 max_read_retry: int = 10,
+                 return_info: bool = False,
+                 min_audio_duration: tp.Optional[float] = None,
+                 max_audio_duration: tp.Optional[float] = None
+                 ):
+        assert len(meta) > 0, 'No audio meta provided to AudioDataset. Please check loading of audio meta.'
+        assert segment_duration is None or segment_duration > 0
+        assert segment_duration is None or min_segment_ratio >= 0
+        logging.debug(f'sample_on_duration: {sample_on_duration}')
+        logging.debug(f'sample_on_weight: {sample_on_weight}')
+        logging.debug(f'pad: {pad}')
+        logging.debug(f'min_segment_ratio: {min_segment_ratio}')
+
+        self.segment_duration = segment_duration
+        self.min_segment_ratio = min_segment_ratio
+        self.max_audio_duration = max_audio_duration
+        self.min_audio_duration = min_audio_duration
+        if self.min_audio_duration is not None and self.max_audio_duration is not None:
+            assert self.min_audio_duration <= self.max_audio_duration
+        self.meta: tp.List[AudioMeta] = self._filter_duration(meta)
+        assert len(self.meta)  # Fail fast if all data has been filtered.
+        self.total_duration = sum(d.duration for d in self.meta)
+
+        if segment_duration is None:
+            num_samples = len(self.meta)
+        self.num_samples = num_samples
+        self.shuffle = shuffle
+        self.sample_rate = sample_rate
+        self.channels = channels
+        self.pad = pad
+        self.sample_on_weight = sample_on_weight
+        self.sample_on_duration = sample_on_duration
+        self.sampling_probabilities = self._get_sampling_probabilities()
+        self.max_read_retry = max_read_retry
+        self.return_info = return_info
+
+    def __len__(self):
+        return self.num_samples
+
+    def _get_sampling_probabilities(self, normalized: bool = True):
+        """Return the sampling probabilities for each file inside `self.meta`.
+        """
+        scores: tp.List[float] = []
+        for file_meta in self.meta:
+            score = 1.
+            if self.sample_on_weight and file_meta.weight is not None:
+                score *= file_meta.weight
+            if self.sample_on_duration:
+                score *= file_meta.duration
+            scores.append(score)
+        probabilities = torch.tensor(scores)
+        if normalized:
+            probabilities /= probabilities.sum()
+        return probabilities
+
+    def sample_file(self, rng: torch.Generator) -> AudioMeta:
+        """Sample a given file from `self.meta`. Can be overriden in subclasses.
+        This is only called if `segment_duration` is not None.
+
+        You must use the provided random number generator `rng` for reproducibility.
+        """
+        if not self.sample_on_weight and not self.sample_on_duration:
+            file_index = int(torch.randint(len(self.sampling_probabilities), (1,), generator=rng).item())
+        else:
+            file_index = int(torch.multinomial(self.sampling_probabilities, 1, generator=rng).item())
+
+        return self.meta[file_index]
+
+    def __getitem__(self, index: int) -> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, SegmentInfo]]:
+        if self.segment_duration is None:
+            file_meta = self.meta[index]
+            out, sr = audio_read(file_meta.path)
+            out = convert_audio(out, sr, self.sample_rate, self.channels)
+            n_frames = out.shape[-1]
+            segment_info = SegmentInfo(file_meta, seek_time=0., n_frames=n_frames, total_frames=n_frames,
+                                       sample_rate=self.sample_rate)
+        else:
+            rng = torch.Generator()
+            if self.shuffle:
+                # We use index, plus extra randomness
+                rng.manual_seed(index + self.num_samples * random.randint(0, 2**24))
+            else:
+                # We only use index
+                rng.manual_seed(index)
+
+            for retry in range(self.max_read_retry):
+                file_meta = self.sample_file(rng)
+                # We add some variance in the file position even if the audio file is smaller than the segment,
+                # without ending up with empty segments.
+                max_seek = max(0, file_meta.duration - self.segment_duration * self.min_segment_ratio)
+                seek_time = torch.rand(1, generator=rng).item() * max_seek
+                try:
+                    out, sr = audio_read(file_meta.path, seek_time, self.segment_duration, pad=False)
+                    out = convert_audio(out, sr, self.sample_rate, self.channels)
+                    n_frames = out.shape[-1]
+                    target_frames = int(self.segment_duration * self.sample_rate)
+                    if self.pad:
+                        out = F.pad(out, (0, target_frames - n_frames))
+                    segment_info = SegmentInfo(file_meta, seek_time, n_frames=n_frames, total_frames=target_frames,
+                                               sample_rate=self.sample_rate)
+                except Exception as exc:
+                    logger.warning("Error opening file %s: %r", file_meta.path, exc)
+                    if retry == self.max_read_retry - 1:
+                        raise
+                else:
+                    break
+
+        if self.return_info:
+            # Returns the wav and additional information on the wave segment
+            return out, segment_info
+        else:
+            return out
+
+    def collater(self, samples):
+        """The collater function has to be provided to the dataloader
+        if AudioDataset has return_info=True in order to properly collate
+        the samples of a batch.
+        """
+        if self.segment_duration is None and len(samples) > 1:
+            assert self.pad, "Must allow padding when batching examples of different durations."
+
+        # In this case the audio reaching the collater is of variable length as segment_duration=None.
+        to_pad = self.segment_duration is None and self.pad
+        if to_pad:
+            max_len = max([wav.shape[-1] for wav, _ in samples])
+
+            def _pad_wav(wav):
+                return F.pad(wav, (0, max_len - wav.shape[-1]))
+
+        if self.return_info:
+            if len(samples) > 0:
+                assert len(samples[0]) == 2
+                assert isinstance(samples[0][0], torch.Tensor)
+                assert isinstance(samples[0][1], SegmentInfo)
+
+            wavs = [wav for wav, _ in samples]
+            segment_infos = [copy.deepcopy(info) for _, info in samples]
+
+            if to_pad:
+                # Each wav could be of a different duration as they are not segmented.
+                for i in range(len(samples)):
+                    # Determines the total length of the signal with padding, so we update here as we pad.
+                    segment_infos[i].total_frames = max_len
+                    wavs[i] = _pad_wav(wavs[i])
+
+            wav = torch.stack(wavs)
+            return wav, segment_infos
+        else:
+            assert isinstance(samples[0], torch.Tensor)
+            if to_pad:
+                samples = [_pad_wav(s) for s in samples]
+            return torch.stack(samples)
+
+    def _filter_duration(self, meta: tp.List[AudioMeta]) -> tp.List[AudioMeta]:
+        """Filters out audio files with durations that are too short or too long.
+        Removes from meta the files whose durations will not allow sampling examples from them.
+        """
+        orig_len = len(meta)
+
+        # Filter data that is too short.
+        if self.min_audio_duration is not None:
+            meta = [m for m in meta if m.duration >= self.min_audio_duration]
+
+        # Filter data that is too long.
+        if self.max_audio_duration is not None:
+            meta = [m for m in meta if m.duration <= self.max_audio_duration]
+
+        filtered_len = len(meta)
+        removed_percentage = 100*(1-float(filtered_len)/orig_len)
+        msg = 'Removed %.2f percent of the data because it was too short or too long.' % removed_percentage
+        if removed_percentage < 10:
+            logging.debug(msg)
+        else:
+            logging.warning(msg)
+        return meta
+
+    @classmethod
+    def from_meta(cls, root: tp.Union[str, Path], **kwargs):
+        """Instantiate AudioDataset from a path to a directory containing a manifest as a jsonl file.
+
+        Args:
+            root (str or Path): Path to root folder containing audio files.
+            kwargs: Additional keyword arguments for the AudioDataset.
+        """
+        root = Path(root)
+        if root.is_dir():
+            if (root / 'data.jsonl').exists():
+                root = root / 'data.jsonl'
+            elif (root / 'data.jsonl.gz').exists():
+                root = root / 'data.jsonl.gz'
+            else:
+                raise ValueError("Don't know where to read metadata from in the dir. "
+                                 "Expecting either a data.jsonl or data.jsonl.gz file but none found.")
+        meta = load_audio_meta(root)
+        return cls(meta, **kwargs)
+
+    @classmethod
+    def from_path(cls, root: tp.Union[str, Path], minimal_meta: bool = True,
+                  exts: tp.List[str] = DEFAULT_EXTS, **kwargs):
+        """Instantiate AudioDataset from a path containing (possibly nested) audio files.
+
+        Args:
+            root (str or Path): Path to root folder containing audio files.
+            minimal_meta (bool): Whether to only load minimal metadata or not.
+            exts (list of str): Extensions for audio files.
+            kwargs: Additional keyword arguments for the AudioDataset.
+        """
+        root = Path(root)
+        if root.is_file():
+            meta = load_audio_meta(root, resolve=True)
+        else:
+            meta = find_audio_files(root, exts, minimal=minimal_meta, resolve=True)
+        return cls(meta, **kwargs)
+
+

Static methods

+
+
+def from_meta(root: Union[str, pathlib.Path], **kwargs) +
+
+

Instantiate AudioDataset from a path to a directory containing a manifest as a jsonl file.

+

Args

+
+
root : str or Path
+
Path to root folder containing audio files.
+
kwargs
+
Additional keyword arguments for the AudioDataset.
+
+
+ +Expand source code + +
@classmethod
+def from_meta(cls, root: tp.Union[str, Path], **kwargs):
+    """Instantiate AudioDataset from a path to a directory containing a manifest as a jsonl file.
+
+    Args:
+        root (str or Path): Path to root folder containing audio files.
+        kwargs: Additional keyword arguments for the AudioDataset.
+    """
+    root = Path(root)
+    if root.is_dir():
+        if (root / 'data.jsonl').exists():
+            root = root / 'data.jsonl'
+        elif (root / 'data.jsonl.gz').exists():
+            root = root / 'data.jsonl.gz'
+        else:
+            raise ValueError("Don't know where to read metadata from in the dir. "
+                             "Expecting either a data.jsonl or data.jsonl.gz file but none found.")
+    meta = load_audio_meta(root)
+    return cls(meta, **kwargs)
+
+
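+
+A minimal usage sketch, not taken from the library docs: it assumes the hypothetical manifest
+directory 'egs/my_dataset' contains a data.jsonl file, and that AudioDataset is imported from
+audiocraft.data.audio_dataset. Keyword arguments are forwarded to the AudioDataset constructor.
+
+from audiocraft.data.audio_dataset import AudioDataset
+
+# Hypothetical manifest directory holding data.jsonl (or data.jsonl.gz).
+dataset = AudioDataset.from_meta(
+    'egs/my_dataset',
+    segment_duration=30.0,   # sample 30 s segments
+    sample_rate=32_000,
+    channels=1,
+    num_samples=10_000,      # virtual epoch length
+)
+wav = dataset[0]             # tensor of shape [1, 30 * 32_000]
+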
+
+def from_path(root: Union[str, pathlib.Path], minimal_meta: bool = True, exts: List[str] = ['.wav', '.mp3', '.flac', '.ogg', '.m4a'], **kwargs) +
+
+

Instantiate AudioDataset from a path containing (possibly nested) audio files.

+

Args

+
+
root : str or Path
+
Path to root folder containing audio files.
+
minimal_meta : bool
+
Whether to only load minimal metadata or not.
+
exts : list of str
+
Extensions for audio files.
+
kwargs
+
Additional keyword arguments for the AudioDataset.
+
+
+ +Expand source code + +
@classmethod
+def from_path(cls, root: tp.Union[str, Path], minimal_meta: bool = True,
+              exts: tp.List[str] = DEFAULT_EXTS, **kwargs):
+    """Instantiate AudioDataset from a path containing (possibly nested) audio files.
+
+    Args:
+        root (str or Path): Path to root folder containing audio files.
+        minimal_meta (bool): Whether to only load minimal metadata or not.
+        exts (list of str): Extensions for audio files.
+        kwargs: Additional keyword arguments for the AudioDataset.
+    """
+    root = Path(root)
+    if root.is_file():
+        meta = load_audio_meta(root, resolve=True)
+    else:
+        meta = find_audio_files(root, exts, minimal=minimal_meta, resolve=True)
+    return cls(meta, **kwargs)
+
+
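+
+A complementary sketch under the same assumptions, with a hypothetical folder of audio files:
+from_path scans the folder and builds the metadata on the fly instead of reading a manifest.
+
+from audiocraft.data.audio_dataset import AudioDataset
+
+dataset = AudioDataset.from_path(
+    '/data/audio',           # hypothetical folder, scanned recursively for DEFAULT_EXTS
+    minimal_meta=True,       # only load minimal metadata (see docstring above)
+    segment_duration=10.0,
+    sample_rate=48_000,
+    channels=2,
+)
+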
+
+

Methods

+
+
+def collater(self, samples) +
+
+

The collater function has to be provided to the dataloader +if AudioDataset has return_info=True in order to properly collate +the samples of a batch.

+
+ +Expand source code + +
def collater(self, samples):
+    """The collater function has to be provided to the dataloader
+    if AudioDataset has return_info=True in order to properly collate
+    the samples of a batch.
+    """
+    if self.segment_duration is None and len(samples) > 1:
+        assert self.pad, "Must allow padding when batching examples of different durations."
+
+    # In this case the audio reaching the collater is of variable length as segment_duration=None.
+    to_pad = self.segment_duration is None and self.pad
+    if to_pad:
+        max_len = max([wav.shape[-1] for wav, _ in samples])
+
+        def _pad_wav(wav):
+            return F.pad(wav, (0, max_len - wav.shape[-1]))
+
+    if self.return_info:
+        if len(samples) > 0:
+            assert len(samples[0]) == 2
+            assert isinstance(samples[0][0], torch.Tensor)
+            assert isinstance(samples[0][1], SegmentInfo)
+
+        wavs = [wav for wav, _ in samples]
+        segment_infos = [copy.deepcopy(info) for _, info in samples]
+
+        if to_pad:
+            # Each wav could be of a different duration as they are not segmented.
+            for i in range(len(samples)):
+                # Determines the total length of the signal with padding, so we update here as we pad.
+                segment_infos[i].total_frames = max_len
+                wavs[i] = _pad_wav(wavs[i])
+
+        wav = torch.stack(wavs)
+        return wav, segment_infos
+    else:
+        assert isinstance(samples[0], torch.Tensor)
+        if to_pad:
+            samples = [_pad_wav(s) for s in samples]
+        return torch.stack(samples)
+
+
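+
+A minimal sketch of wiring the collater into a PyTorch DataLoader when return_info=True;
+the manifest path is hypothetical and the dataset is assumed to be built as in the examples above.
+
+from torch.utils.data import DataLoader
+from audiocraft.data.audio_dataset import AudioDataset
+
+dataset = AudioDataset.from_meta('egs/my_dataset', segment_duration=5.0,
+                                 sample_rate=32_000, channels=1, return_info=True)
+loader = DataLoader(dataset, batch_size=8, collate_fn=dataset.collater)
+wav, infos = next(iter(loader))   # wav: [8, 1, 5 * 32_000], infos: list of SegmentInfo
+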
+
+def sample_file(self, rng: torch._C.Generator) ‑> AudioMeta +
+
+

Sample a given file from self.meta. Can be overridden in subclasses. +This is only called if segment_duration is not None.

+

You must use the provided random number generator rng for reproducibility.

+
+ +Expand source code + +
def sample_file(self, rng: torch.Generator) -> AudioMeta:
+    """Sample a given file from `self.meta`. Can be overridden in subclasses.
+    This is only called if `segment_duration` is not None.
+
+    You must use the provided random number generator `rng` for reproducibility.
+    """
+    if not self.sample_on_weight and not self.sample_on_duration:
+        file_index = int(torch.randint(len(self.sampling_probabilities), (1,), generator=rng).item())
+    else:
+        file_index = int(torch.multinomial(self.sampling_probabilities, 1, generator=rng).item())
+
+    return self.meta[file_index]
+
+
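+
+A sketch of overriding sample_file in a subclass, as the docstring allows. This uniform sampler
+ignores weights and durations and is illustrative only; the class name is hypothetical.
+
+import torch
+from audiocraft.data.audio_dataset import AudioDataset, AudioMeta
+
+class UniformAudioDataset(AudioDataset):
+    """Hypothetical subclass sampling files uniformly, regardless of weight or duration."""
+
+    def sample_file(self, rng: torch.Generator) -> AudioMeta:
+        # Use the provided generator so sampling stays reproducible.
+        file_index = int(torch.randint(len(self.meta), (1,), generator=rng).item())
+        return self.meta[file_index]
+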
+
+
+
+class AudioMeta +(path: str, duration: float, sample_rate: int, amplitude: Optional[float] = None, weight: Optional[float] = None, info_path: Optional[PathInZip] = None) +
+
+

AudioMeta(path: str, duration: float, sample_rate: int, amplitude: Union[float, NoneType] = None, weight: Union[float, NoneType] = None, info_path: Union[audiocraft.data.zip.PathInZip, NoneType] = None)

+
+ +Expand source code + +
class AudioMeta(BaseInfo):
+    path: str
+    duration: float
+    sample_rate: int
+    amplitude: tp.Optional[float] = None
+    weight: tp.Optional[float] = None
+    # info_path is used to load additional information about the audio file that is stored in zip files.
+    info_path: tp.Optional[PathInZip] = None
+
+    @classmethod
+    def from_dict(cls, dictionary: dict):
+        base = cls._dict2fields(dictionary)
+        if 'info_path' in base and base['info_path'] is not None:
+            base['info_path'] = PathInZip(base['info_path'])
+        return cls(**base)
+
+    def to_dict(self):
+        d = super().to_dict()
+        if d['info_path'] is not None:
+            d['info_path'] = str(d['info_path'])
+        return d
+
+

Ancestors

+ +

Class variables

+
+
var amplitude : Optional[float]
+
+
+
+
var duration : float
+
+
+
+
var info_path : Optional[PathInZip]
+
+
+
+
var path : str
+
+
+
+
var sample_rate : int
+
+
+
+
var weight : Optional[float]
+
+
+
+
+

Static methods

+
+
+def from_dict(dictionary: dict) +
+
+
+
+ +Expand source code + +
@classmethod
+def from_dict(cls, dictionary: dict):
+    base = cls._dict2fields(dictionary)
+    if 'info_path' in base and base['info_path'] is not None:
+        base['info_path'] = PathInZip(base['info_path'])
+    return cls(**base)
+
+
+
+

Methods

+
+
+def to_dict(self) +
+
+
+
+ +Expand source code + +
def to_dict(self):
+    d = super().to_dict()
+    if d['info_path'] is not None:
+        d['info_path'] = str(d['info_path'])
+    return d
+
+
+
+
+
+class BaseInfo +
+
+

BaseInfo()

+
+ +Expand source code + +
class BaseInfo:
+
+    @classmethod
+    def _dict2fields(cls, dictionary: dict):
+        return {
+            field.name: dictionary[field.name]
+            for field in fields(cls) if field.name in dictionary
+        }
+
+    @classmethod
+    def from_dict(cls, dictionary: dict):
+        _dictionary = cls._dict2fields(dictionary)
+        return cls(**_dictionary)
+
+    def to_dict(self):
+        return {
+            field.name: self.__getattribute__(field.name)
+            for field in fields(self)
+            }
+
+

Subclasses

+ +

Static methods

+
+
+def from_dict(dictionary: dict) +
+
+
+
+ +Expand source code + +
@classmethod
+def from_dict(cls, dictionary: dict):
+    _dictionary = cls._dict2fields(dictionary)
+    return cls(**_dictionary)
+
+
+
+

Methods

+
+
+def to_dict(self) +
+
+
+
+ +Expand source code + +
def to_dict(self):
+    return {
+        field.name: self.__getattribute__(field.name)
+        for field in fields(self)
+        }
+
+
+
+
+
+class SegmentInfo +(meta: AudioMeta, seek_time: float, n_frames: int, total_frames: int, sample_rate: int) +
+
+

SegmentInfo(meta: audiocraft.data.audio_dataset.AudioMeta, seek_time: float, n_frames: int, total_frames: int, sample_rate: int)

+
+ +Expand source code + +
class SegmentInfo(BaseInfo):
+    meta: AudioMeta
+    seek_time: float
+    n_frames: int  # actual number of frames without padding
+    total_frames: int  # total number of frames, padding included
+    sample_rate: int  # actual sample rate
+
+

Ancestors

+ +

Subclasses

+ +

Class variables

+
+
+var meta : AudioMeta
+
+
+
+
var n_frames : int
+
+
+
+
var sample_rate : int
+
+
+
+
var seek_time : float
+
+
+
+
var total_frames : int
+
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/data/audio_utils.html b/docs/audiocraft/data/audio_utils.html new file mode 100644 index 00000000..20744e25 --- /dev/null +++ b/docs/audiocraft/data/audio_utils.html @@ -0,0 +1,519 @@ + + + + + + +audiocraft.data.audio_utils API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.data.audio_utils

+
+
+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import sys
+import typing as tp
+
+import julius
+import torch
+import torchaudio
+
+
+def convert_audio_channels(wav: torch.Tensor, channels: int = 2) -> torch.Tensor:
+    """Convert audio to the given number of channels.
+
+    Args:
+        wav (torch.Tensor): Audio wave of shape [B, C, T].
+        channels (int): Expected number of channels as output.
+    Returns:
+        torch.Tensor: Downmixed or unchanged audio wave [B, C, T].
+    """
+    *shape, src_channels, length = wav.shape
+    if src_channels == channels:
+        pass
+    elif channels == 1:
+        # Case 1:
+        # The caller asked for 1-channel audio, and the stream has multiple
+        # channels, downmix all channels.
+        wav = wav.mean(dim=-2, keepdim=True)
+    elif src_channels == 1:
+        # Case 2:
+        # The caller asked for multiple channels, but the input file has
+        # a single channel, replicate the audio over all channels.
+        wav = wav.expand(*shape, channels, length)
+    elif src_channels >= channels:
+        # Case 3:
+        # The caller asked for multiple channels, and the input file has
+        # more channels than requested. In that case return the first channels.
+        wav = wav[..., :channels, :]
+    else:
+        # Case 4: What is a reasonable choice here?
+        raise ValueError('The audio file has fewer channels than requested but is not mono.')
+    return wav
+
+
+def convert_audio(wav: torch.Tensor, from_rate: float,
+                  to_rate: float, to_channels: int) -> torch.Tensor:
+    """Convert audio to new sample rate and number of audio channels.
+    """
+    wav = julius.resample_frac(wav, int(from_rate), int(to_rate))
+    wav = convert_audio_channels(wav, to_channels)
+    return wav
+
+
+def normalize_loudness(wav: torch.Tensor, sample_rate: int, loudness_headroom_db: float = 14,
+                       loudness_compressor: bool = False, energy_floor: float = 2e-3):
+    """Normalize an input signal to a target loudness in dB LKFS.
+    Audio loudness is defined according to the ITU-R BS.1770-4 recommendation.
+
+    Args:
+        wav (torch.Tensor): Input multichannel audio data.
+        sample_rate (int): Sample rate.
+        loudness_headroom_db (float): Target loudness of the output in dB LUFS.
+        loudness_compressor (bool): Uses tanh for soft clipping.
+        energy_floor (float): anything below that RMS level will not be rescaled.
+    Returns:
+        output (torch.Tensor): Loudness normalized output data.
+    """
+    energy = wav.pow(2).mean().sqrt().item()
+    if energy < energy_floor:
+        return wav
+    transform = torchaudio.transforms.Loudness(sample_rate)
+    input_loudness_db = transform(wav).item()
+    # calculate the gain needed to scale to the desired loudness level
+    delta_loudness = -loudness_headroom_db - input_loudness_db
+    gain = 10.0 ** (delta_loudness / 20.0)
+    output = gain * wav
+    if loudness_compressor:
+        output = torch.tanh(output)
+    assert output.isfinite().all(), (input_loudness_db, wav.pow(2).mean().sqrt())
+    return output
+
+
+def _clip_wav(wav: torch.Tensor, log_clipping: bool = False, stem_name: tp.Optional[str] = None) -> None:
+    """Utility function to clip the audio with logging if specified."""
+    max_scale = wav.abs().max()
+    if log_clipping and max_scale > 1:
+        clamp_prob = (wav.abs() > 1).float().mean().item()
+        print(f"CLIPPING {stem_name or ''} happening with proba (a bit of clipping is okay):",
+              clamp_prob, "maximum scale: ", max_scale.item(), file=sys.stderr)
+    wav.clamp_(-1, 1)
+
+
+def normalize_audio(wav: torch.Tensor, normalize: bool = True,
+                    strategy: str = 'peak', peak_clip_headroom_db: float = 1,
+                    rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
+                    loudness_compressor: bool = False, log_clipping: bool = False,
+                    sample_rate: tp.Optional[int] = None,
+                    stem_name: tp.Optional[str] = None) -> torch.Tensor:
+    """Normalize the audio according to the prescribed strategy (see after).
+
+    Args:
+        wav (torch.Tensor): Audio data.
+        normalize (bool): if `True` (default), normalizes according to the prescribed
+            strategy (see after). If `False`, the strategy is only used in case clipping
+            would happen.
+        strategy (str): Can be either 'clip', 'peak', or 'rms'. Default is 'peak',
+            i.e. audio is normalized by its largest value. RMS normalizes by root-mean-square
+            with extra headroom to avoid clipping. 'clip' just clips.
+        peak_clip_headroom_db (float): Headroom in dB when doing 'peak' or 'clip' strategy.
+        rms_headroom_db (float): Headroom in dB when doing 'rms' strategy. This must be much larger
+            than the `peak_clip` one to avoid further clipping.
+        loudness_headroom_db (float): Target loudness for loudness normalization.
+        loudness_compressor (bool): If True, uses tanh based soft clipping.
+        log_clipping (bool): If True, basic logging on stderr when clipping still
+            occurs despite strategy (only for 'rms').
+        sample_rate (int): Sample rate for the audio data (required for loudness).
+        stem_name (Optional[str]): Stem name for clipping logging.
+    Returns:
+        torch.Tensor: Normalized audio.
+    """
+    scale_peak = 10 ** (-peak_clip_headroom_db / 20)
+    scale_rms = 10 ** (-rms_headroom_db / 20)
+    if strategy == 'peak':
+        rescaling = (scale_peak / wav.abs().max())
+        if normalize or rescaling < 1:
+            wav = wav * rescaling
+    elif strategy == 'clip':
+        wav = wav.clamp(-scale_peak, scale_peak)
+    elif strategy == 'rms':
+        mono = wav.mean(dim=0)
+        rescaling = scale_rms / mono.pow(2).mean().sqrt()
+        if normalize or rescaling < 1:
+            wav = wav * rescaling
+        _clip_wav(wav, log_clipping=log_clipping, stem_name=stem_name)
+    elif strategy == 'loudness':
+        assert sample_rate is not None, "Loudness normalization requires sample rate."
+        wav = normalize_loudness(wav, sample_rate, loudness_headroom_db, loudness_compressor)
+        _clip_wav(wav, log_clipping=log_clipping, stem_name=stem_name)
+    else:
+        assert wav.abs().max() < 1
+        assert strategy == '' or strategy == 'none', f"Unexpected strategy: '{strategy}'"
+    return wav
+
+
+def f32_pcm(wav: torch.Tensor) -> torch.Tensor:
+    """Convert audio to float 32 bits PCM format.
+    """
+    if wav.dtype.is_floating_point:
+        return wav
+    else:
+        assert wav.dtype == torch.int16
+        return wav.float() / 2**15
+
+
+def i16_pcm(wav: torch.Tensor) -> torch.Tensor:
+    """Convert audio to int 16 bits PCM format.
+
+    ..Warning:: There exist many formulas for doing this conversion. None are perfect
+    due to the asymmetry of the int16 range. One either has possible clipping, DC offset,
+    or inconsistencies with f32_pcm. If the given wav doesn't have enough headroom,
+    it is possible that converting with `i16_pcm` and back with `f32_pcm` does not
+    give back the exact same signal.
+    """
+    if wav.dtype.is_floating_point:
+        assert wav.abs().max() <= 1
+        candidate = (wav * 2 ** 15).round()
+        if candidate.max() >= 2 ** 15:  # clipping would occur
+            candidate = (wav * (2 ** 15 - 1)).round()
+        return candidate.short()
+    else:
+        assert wav.dtype == torch.int16
+        return wav
+
+
+
+
+
+
+
+

Functions

+
+
+def convert_audio(wav: torch.Tensor, from_rate: float, to_rate: float, to_channels: int) ‑> torch.Tensor +
+
+

Convert audio to new sample rate and number of audio channels.

+
+ +Expand source code + +
def convert_audio(wav: torch.Tensor, from_rate: float,
+                  to_rate: float, to_channels: int) -> torch.Tensor:
+    """Convert audio to new sample rate and number of audio channels.
+    """
+    wav = julius.resample_frac(wav, int(from_rate), int(to_rate))
+    wav = convert_audio_channels(wav, to_channels)
+    return wav
+
+
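+
+A small sketch of convert_audio on a random tensor, assuming the function is imported from
+audiocraft.data.audio_utils: one second of 44.1 kHz stereo is resampled to 32 kHz and downmixed to mono.
+
+import torch
+from audiocraft.data.audio_utils import convert_audio
+
+wav = torch.randn(2, 44_100)                                   # [C, T], 1 s of stereo noise
+wav_32k_mono = convert_audio(wav, from_rate=44_100, to_rate=32_000, to_channels=1)
+print(wav_32k_mono.shape)                                      # torch.Size([1, 32000])
+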
+
+def convert_audio_channels(wav: torch.Tensor, channels: int = 2) ‑> torch.Tensor +
+
+

Convert audio to the given number of channels.

+

Args

+
+
wav : torch.Tensor
+
Audio wave of shape [B, C, T].
+
channels : int
+
Expected number of channels as output.
+
+

Returns

+
+
torch.Tensor
+
Downmixed or unchanged audio wave [B, C, T].
+
+
+ +Expand source code + +
def convert_audio_channels(wav: torch.Tensor, channels: int = 2) -> torch.Tensor:
+    """Convert audio to the given number of channels.
+
+    Args:
+        wav (torch.Tensor): Audio wave of shape [B, C, T].
+        channels (int): Expected number of channels as output.
+    Returns:
+        torch.Tensor: Downmixed or unchanged audio wave [B, C, T].
+    """
+    *shape, src_channels, length = wav.shape
+    if src_channels == channels:
+        pass
+    elif channels == 1:
+        # Case 1:
+        # The caller asked for 1-channel audio, and the stream has multiple
+        # channels, downmix all channels.
+        wav = wav.mean(dim=-2, keepdim=True)
+    elif src_channels == 1:
+        # Case 2:
+        # The caller asked for multiple channels, but the input file has
+        # a single channel, replicate the audio over all channels.
+        wav = wav.expand(*shape, channels, length)
+    elif src_channels >= channels:
+        # Case 3:
+        # The caller asked for multiple channels, and the input file has
+        # more channels than requested. In that case return the first channels.
+        wav = wav[..., :channels, :]
+    else:
+        # Case 4: What is a reasonable choice here?
+        raise ValueError('The audio file has fewer channels than requested but is not mono.')
+    return wav
+
+
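+
+A short sketch of the downmix and replicate cases described in the comments above
+(imports assumed as in the previous example).
+
+import torch
+from audiocraft.data.audio_utils import convert_audio_channels
+
+stereo = torch.randn(1, 2, 16_000)                     # [B, C, T]
+mono = convert_audio_channels(stereo, channels=1)      # Case 1: downmix -> [1, 1, 16000]
+back = convert_audio_channels(mono, channels=2)        # Case 2: replicate -> [1, 2, 16000]
+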
+
+def f32_pcm(wav: torch.Tensor) ‑> torch.Tensor +
+
+

Convert audio to float 32 bits PCM format.

+
+ +Expand source code + +
def f32_pcm(wav: torch.Tensor) -> torch.Tensor:
+    """Convert audio to float 32 bits PCM format.
+    """
+    if wav.dtype.is_floating_point:
+        return wav
+    else:
+        assert wav.dtype == torch.int16
+        return wav.float() / 2**15
+
+
+
+def i16_pcm(wav: torch.Tensor) ‑> torch.Tensor +
+
+

Convert audio to int 16 bits PCM format.

+
+

Warning: There exist many formulas for doing this conversion. None are perfect

+
+

due to the asymmetry of the int16 range. One either has possible clipping, DC offset, +or inconsistencies with f32_pcm. If the given wav doesn't have enough headroom, +it is possible that converting with i16_pcm and back with f32_pcm does not give back the exact same signal.

+
+ +Expand source code + +
def i16_pcm(wav: torch.Tensor) -> torch.Tensor:
+    """Convert audio to int 16 bits PCM format.
+
+    ..Warning:: There exist many formulas for doing this conversion. None are perfect
+    due to the asymmetry of the int16 range. One either has possible clipping, DC offset,
+    or inconsistencies with f32_pcm. If the given wav doesn't have enough headroom,
+    it is possible that converting with `i16_pcm` and back with `f32_pcm` does not
+    give back the exact same signal.
+    """
+    if wav.dtype.is_floating_point:
+        assert wav.abs().max() <= 1
+        candidate = (wav * 2 ** 15).round()
+        if candidate.max() >= 2 ** 15:  # clipping would occur
+            candidate = (wav * (2 ** 15 - 1)).round()
+        return candidate.short()
+    else:
+        assert wav.dtype == torch.int16
+        return wav
+
+
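+
+A sketch illustrating the warning above: a float signal quantized with i16_pcm and converted back
+with f32_pcm is close to, but not exactly, the original (imports assumed as before).
+
+import torch
+from audiocraft.data.audio_utils import f32_pcm, i16_pcm
+
+wav = torch.randn(1, 16_000).clamp(-1, 1) * 0.5        # float waveform within [-1, 1]
+wav_i16 = i16_pcm(wav)                                 # dtype torch.int16
+wav_back = f32_pcm(wav_i16)                            # float32 again
+print((wav_back - wav).abs().max())                    # small quantization error, not exactly 0
+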
+
+def normalize_audio(wav: torch.Tensor, normalize: bool = True, strategy: str = 'peak', peak_clip_headroom_db: float = 1, rms_headroom_db: float = 18, loudness_headroom_db: float = 14, loudness_compressor: bool = False, log_clipping: bool = False, sample_rate: Optional[int] = None, stem_name: Optional[str] = None) ‑> torch.Tensor +
+
+

Normalize the audio according to the prescribed strategy (see after).

+

Args

+
+
wav : torch.Tensor
+
Audio data.
+
normalize : bool
+
if True (default), normalizes according to the prescribed +strategy (see after). If False, the strategy is only used in case clipping +would happen.
+
strategy : str
+
Can be either 'clip', 'peak', or 'rms'. Default is 'peak', +i.e. audio is normalized by its largest value. RMS normalizes by root-mean-square +with extra headroom to avoid clipping. 'clip' just clips.
+
peak_clip_headroom_db : float
+
Headroom in dB when doing 'peak' or 'clip' strategy.
+
rms_headroom_db : float
+
Headroom in dB when doing 'rms' strategy. This must be much larger +than the peak_clip one to avoid further clipping.
+
loudness_headroom_db : float
+
Target loudness for loudness normalization.
+
loudness_compressor : bool
+
If True, uses tanh based soft clipping.
+
log_clipping : bool
+
If True, basic logging on stderr when clipping still +occurs despite strategy (only for 'rms').
+
sample_rate : int
+
Sample rate for the audio data (required for loudness).
+
stem_name : Optional[str]
+
Stem name for clipping logging.
+
+

Returns

+
+
torch.Tensor
+
Normalized audio.
+
+
+ +Expand source code + +
def normalize_audio(wav: torch.Tensor, normalize: bool = True,
+                    strategy: str = 'peak', peak_clip_headroom_db: float = 1,
+                    rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
+                    loudness_compressor: bool = False, log_clipping: bool = False,
+                    sample_rate: tp.Optional[int] = None,
+                    stem_name: tp.Optional[str] = None) -> torch.Tensor:
+    """Normalize the audio according to the prescribed strategy (see after).
+
+    Args:
+        wav (torch.Tensor): Audio data.
+        normalize (bool): if `True` (default), normalizes according to the prescribed
+            strategy (see after). If `False`, the strategy is only used in case clipping
+            would happen.
+        strategy (str): Can be either 'clip', 'peak', or 'rms'. Default is 'peak',
+            i.e. audio is normalized by its largest value. RMS normalizes by root-mean-square
+            with extra headroom to avoid clipping. 'clip' just clips.
+        peak_clip_headroom_db (float): Headroom in dB when doing 'peak' or 'clip' strategy.
+        rms_headroom_db (float): Headroom in dB when doing 'rms' strategy. This must be much larger
+            than the `peak_clip` one to avoid further clipping.
+        loudness_headroom_db (float): Target loudness for loudness normalization.
+        loudness_compressor (bool): If True, uses tanh based soft clipping.
+        log_clipping (bool): If True, basic logging on stderr when clipping still
+            occurs despite strategy (only for 'rms').
+        sample_rate (int): Sample rate for the audio data (required for loudness).
+        stem_name (Optional[str]): Stem name for clipping logging.
+    Returns:
+        torch.Tensor: Normalized audio.
+    """
+    scale_peak = 10 ** (-peak_clip_headroom_db / 20)
+    scale_rms = 10 ** (-rms_headroom_db / 20)
+    if strategy == 'peak':
+        rescaling = (scale_peak / wav.abs().max())
+        if normalize or rescaling < 1:
+            wav = wav * rescaling
+    elif strategy == 'clip':
+        wav = wav.clamp(-scale_peak, scale_peak)
+    elif strategy == 'rms':
+        mono = wav.mean(dim=0)
+        rescaling = scale_rms / mono.pow(2).mean().sqrt()
+        if normalize or rescaling < 1:
+            wav = wav * rescaling
+        _clip_wav(wav, log_clipping=log_clipping, stem_name=stem_name)
+    elif strategy == 'loudness':
+        assert sample_rate is not None, "Loudness normalization requires sample rate."
+        wav = normalize_loudness(wav, sample_rate, loudness_headroom_db, loudness_compressor)
+        _clip_wav(wav, log_clipping=log_clipping, stem_name=stem_name)
+    else:
+        assert wav.abs().max() < 1
+        assert strategy == '' or strategy == 'none', f"Unexpected strategy: '{strategy}'"
+    return wav
+
+
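+
+A sketch comparing two strategies on a deliberately hot random signal; note that the
+'loudness' strategy requires sample_rate (imports assumed as before).
+
+import torch
+from audiocraft.data.audio_utils import normalize_audio
+
+wav = 2.0 * torch.randn(2, 32_000)                     # [C, T], peaks well above 1
+peak_norm = normalize_audio(wav, strategy='peak', peak_clip_headroom_db=1)
+loud_norm = normalize_audio(wav, strategy='loudness', sample_rate=32_000,
+                            loudness_headroom_db=14, log_clipping=True)
+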
+
+def normalize_loudness(wav: torch.Tensor, sample_rate: int, loudness_headroom_db: float = 14, loudness_compressor: bool = False, energy_floor: float = 0.002) +
+
+

Normalize an input signal to a target loudness in dB LKFS. +Audio loudness is defined according to the ITU-R BS.1770-4 recommendation.

+

Args

+
+
wav : torch.Tensor
+
Input multichannel audio data.
+
sample_rate : int
+
Sample rate.
+
loudness_headroom_db : float
+
Target loudness of the output in dB LUFS.
+
loudness_compressor : bool
+
Uses tanh for soft clipping.
+
energy_floor : float
+
anything below that RMS level will not be rescaled.
+
+

Returns

+

output (torch.Tensor): Loudness normalized output data.

+
+ +Expand source code + +
def normalize_loudness(wav: torch.Tensor, sample_rate: int, loudness_headroom_db: float = 14,
+                       loudness_compressor: bool = False, energy_floor: float = 2e-3):
+    """Normalize an input signal to a target loudness in dB LKFS.
+    Audio loudness is defined according to the ITU-R BS.1770-4 recommendation.
+
+    Args:
+        wav (torch.Tensor): Input multichannel audio data.
+        sample_rate (int): Sample rate.
+        loudness_headroom_db (float): Target loudness of the output in dB LUFS.
+        loudness_compressor (bool): Uses tanh for soft clipping.
+        energy_floor (float): anything below that RMS level will not be rescaled.
+    Returns:
+        output (torch.Tensor): Loudness normalized output data.
+    """
+    energy = wav.pow(2).mean().sqrt().item()
+    if energy < energy_floor:
+        return wav
+    transform = torchaudio.transforms.Loudness(sample_rate)
+    input_loudness_db = transform(wav).item()
+    # calculate the gain needed to scale to the desired loudness level
+    delta_loudness = -loudness_headroom_db - input_loudness_db
+    gain = 10.0 ** (delta_loudness / 20.0)
+    output = gain * wav
+    if loudness_compressor:
+        output = torch.tanh(output)
+    assert output.isfinite().all(), (input_loudness_db, wav.pow(2).mean().sqrt())
+    return output
+
+
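+
+A sketch checking the resulting loudness with torchaudio's Loudness meter (the same transform the
+function uses internally); the signal is random noise and the measured value is approximate.
+
+import torch
+import torchaudio
+from audiocraft.data.audio_utils import normalize_loudness
+
+sample_rate = 16_000
+wav = 0.1 * torch.randn(2, 2 * sample_rate)            # 2 s of quiet stereo noise
+out = normalize_loudness(wav, sample_rate, loudness_headroom_db=14)
+print(torchaudio.transforms.Loudness(sample_rate)(out).item())   # close to -14 LUFS
+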
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/data/index.html b/docs/audiocraft/data/index.html new file mode 100644 index 00000000..84b4c7b4 --- /dev/null +++ b/docs/audiocraft/data/index.html @@ -0,0 +1,94 @@ + + + + + + +audiocraft.data API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.data

+
+
+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# flake8: noqa
+from . import audio, audio_dataset
+
+
+
+

Sub-modules

+
+
audiocraft.data.audio
+
+

Audio IO methods are defined in this module (info, read, write), +We rely on av library for faster read when possible, otherwise on torchaudio.

+
+
audiocraft.data.audio_dataset
+
+
+
+
audiocraft.data.audio_utils
+
+
+
+
audiocraft.data.zip
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/data/zip.html b/docs/audiocraft/data/zip.html new file mode 100644 index 00000000..d8bcfcef --- /dev/null +++ b/docs/audiocraft/data/zip.html @@ -0,0 +1,289 @@ + + + + + + +audiocraft.data.zip API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.data.zip

+
+
+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import typing
+import zipfile
+
+from dataclasses import dataclass
+from functools import lru_cache
+from typing_extensions import Literal
+
+
+DEFAULT_SIZE = 32
+MODE = Literal['r', 'w', 'x', 'a']
+
+
+@dataclass(order=True)
+class PathInZip:
+    """Class for holding the path of a file within a zip file.
+
+    Args:
+        path: The convention is <path_to_zip>:<relative_path_inside_zip>
+            Let's assume there is a zip file /some/location/foo.zip
+            and inside of it is a json file located at /data/file1.json,
+            Then we expect path = "/some/location/foo.zip:/data/file1.json"
+    """
+
+    INFO_PATH_SEP = ':'
+    zip_path: str
+    file_path: str
+
+    def __init__(self, path: str) -> None:
+        split_path = path.split(self.INFO_PATH_SEP)
+        assert len(split_path) == 2
+        self.zip_path, self.file_path = split_path
+
+    @classmethod
+    def from_paths(cls, zip_path: str, file_path: str):
+        return cls(zip_path + cls.INFO_PATH_SEP + file_path)
+
+    def __str__(self) -> str:
+        return self.zip_path + self.INFO_PATH_SEP + self.file_path
+
+
+def _open_zip(path: str, mode: MODE = 'r'):
+    return zipfile.ZipFile(path, mode)
+
+
+_cached_open_zip = lru_cache(DEFAULT_SIZE)(_open_zip)
+
+
+def set_zip_cache_size(max_size: int):
+    """Sets the maximal LRU caching for zip file opening.
+
+    Args:
+        max_size: the maximal number of zip files kept open by the LRU cache.
+    """
+    global _cached_open_zip
+    _cached_open_zip = lru_cache(max_size)(_open_zip)
+
+
+def open_file_in_zip(path_in_zip: PathInZip, mode: str = 'r') -> typing.IO:
+    """Opens a file stored inside a zip and returns a file-like object.
+
+    Args:
+        path_in_zip: A PathInZip object representing the file to return a file-like object of.
+        mode: The mode in which to open the file.
+    Returns:
+        A file-like object for PathInZip.
+    """
+    zf = _cached_open_zip(path_in_zip.zip_path)
+    return zf.open(path_in_zip.file_path)
+
+
+
+
+
+
+
+

Functions

+
+
+def open_file_in_zip(path_in_zip: PathInZip, mode: str = 'r') ‑> typing.IO +
+
+

Opens a file stored inside a zip and returns a file-like object.

+

Args

+
+
path_in_zip
+
A PathInZip object representing the file to return a file-like object of.
+
mode
+
The mode in which to open the file.
+
+

Returns

+

A file-like object for PathInZip.

+
+ +Expand source code + +
def open_file_in_zip(path_in_zip: PathInZip, mode: str = 'r') -> typing.IO:
+    """Opens a file stored inside a zip and returns a file-like object.
+
+    Args:
+        path_in_zip: A PathInZip object representing the file to return a file-like object of.
+        mode: The mode in which to open the file.
+    Returns:
+        A file-like object for PathInZip.
+    """
+    zf = _cached_open_zip(path_in_zip.zip_path)
+    return zf.open(path_in_zip.file_path)
+
+
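+
+A minimal sketch reading a JSON member from an archive, reusing the hypothetical paths from the
+PathInZip docstring; open_file_in_zip returns a binary file-like object.
+
+import json
+from audiocraft.data.zip import PathInZip, open_file_in_zip
+
+path = PathInZip.from_paths('/some/location/foo.zip', 'data/file1.json')   # hypothetical archive
+with open_file_in_zip(path) as fh:
+    payload = json.load(fh)
+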
+
+def set_zip_cache_size(max_size: int) +
+
+

Sets the maximal LRU caching for zip file opening.

+

Args

+
+
max_size
+
the maximal number of zip files kept open by the LRU cache.
+
+
+ +Expand source code + +
def set_zip_cache_size(max_size: int):
+    """Sets the maximal LRU caching for zip file opening.
+
+    Args:
+        max_size: the maximal number of zip files kept open by the LRU cache.
+    """
+    global _cached_open_zip
+    _cached_open_zip = lru_cache(max_size)(_open_zip)
+
+
+
+
+
+

Classes

+
+
+class PathInZip +(path: str) +
+
+

Class for holding the path of a file within a zip file.

+

Args

+
+
path
+
The convention is <path_to_zip>:<relative_path_inside_zip>. +Let's assume there is a zip file /some/location/foo.zip +and inside of it is a json file located at /data/file1.json, +Then we expect path = "/some/location/foo.zip:/data/file1.json"
+
+
+ +Expand source code + +
class PathInZip:
+    """Class for holding the path of a file within a zip file.
+
+    Args:
+        path: The convention is <path_to_zip>:<relative_path_inside_zip>
+            Let's assume there is a zip file /some/location/foo.zip
+            and inside of it is a json file located at /data/file1.json,
+            Then we expect path = "/some/location/foo.zip:/data/file1.json"
+    """
+
+    INFO_PATH_SEP = ':'
+    zip_path: str
+    file_path: str
+
+    def __init__(self, path: str) -> None:
+        split_path = path.split(self.INFO_PATH_SEP)
+        assert len(split_path) == 2
+        self.zip_path, self.file_path = split_path
+
+    @classmethod
+    def from_paths(cls, zip_path: str, file_path: str):
+        return cls(zip_path + cls.INFO_PATH_SEP + file_path)
+
+    def __str__(self) -> str:
+        return self.zip_path + self.INFO_PATH_SEP + self.file_path
+
+

Class variables

+
+
var INFO_PATH_SEP
+
+
+
+
var file_path : str
+
+
+
+
var zip_path : str
+
+
+
+
+

Static methods

+
+
+def from_paths(zip_path: str, file_path: str) +
+
+
+
+ +Expand source code + +
@classmethod
+def from_paths(cls, zip_path: str, file_path: str):
+    return cls(zip_path + cls.INFO_PATH_SEP + file_path)
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/index.html b/docs/audiocraft/index.html new file mode 100644 index 00000000..2a77ad7d --- /dev/null +++ b/docs/audiocraft/index.html @@ -0,0 +1,95 @@ + + + + + + +audiocraft API documentation + + + + + + + + + + + +
+
+
+

Package audiocraft

+
+
+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# flake8: noqa
+from . import data, modules, models
+
+__version__ = '0.0.2a2'
+
+
+
+

Sub-modules

+
+
audiocraft.data
+
+
+
+
audiocraft.models
+
+
+
+
audiocraft.modules
+
+
+
+
audiocraft.quantization
+
+
+
+
audiocraft.utils
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/models/builders.html b/docs/audiocraft/models/builders.html new file mode 100644 index 00000000..867a760b --- /dev/null +++ b/docs/audiocraft/models/builders.html @@ -0,0 +1,556 @@ + + + + + + +audiocraft.models.builders API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.models.builders

+
+
+

All the functions to build the relevant models and modules +from the Hydra config.

+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+All the functions to build the relevant models and modules
+from the Hydra config.
+"""
+
+import typing as tp
+import warnings
+
+import audiocraft
+import omegaconf
+import torch
+
+from .encodec import CompressionModel, EncodecModel, FlattenedCompressionModel  # noqa
+from .lm import LMModel
+from ..modules.codebooks_patterns import (
+    CodebooksPatternProvider,
+    DelayedPatternProvider,
+    ParallelPatternProvider,
+    UnrolledPatternProvider,
+    VALLEPattern,
+    MusicLMPattern,
+)
+from ..modules.conditioners import (
+    BaseConditioner,
+    ConditioningProvider,
+    LUTConditioner,
+    T5Conditioner,
+    ConditionFuser,
+    ChromaStemConditioner,
+)
+from .. import quantization as qt
+from ..utils.utils import dict_from_config
+
+
+def get_quantizer(quantizer: str, cfg: omegaconf.DictConfig, dimension: int) -> qt.BaseQuantizer:
+    klass = {
+        'no_quant': qt.DummyQuantizer,
+        'rvq': qt.ResidualVectorQuantizer
+    }[quantizer]
+    kwargs = dict_from_config(getattr(cfg, quantizer))
+    if quantizer != 'no_quant':
+        kwargs['dimension'] = dimension
+    return klass(**kwargs)
+
+
+def get_encodec_autoencoder(encoder_name: str, cfg: omegaconf.DictConfig):
+    if encoder_name == 'seanet':
+        kwargs = dict_from_config(getattr(cfg, 'seanet'))
+        encoder_override_kwargs = kwargs.pop('encoder')
+        decoder_override_kwargs = kwargs.pop('decoder')
+        encoder_kwargs = {**kwargs, **encoder_override_kwargs}
+        decoder_kwargs = {**kwargs, **decoder_override_kwargs}
+        encoder = audiocraft.modules.SEANetEncoder(**encoder_kwargs)
+        decoder = audiocraft.modules.SEANetDecoder(**decoder_kwargs)
+        return encoder, decoder
+    else:
+        raise KeyError(f'Unexpected compression model {cfg.compression_model}')
+
+
+def get_compression_model(cfg: omegaconf.DictConfig) -> CompressionModel:
+    """Instantiate a compression model.
+    """
+    if cfg.compression_model == 'encodec':
+        kwargs = dict_from_config(getattr(cfg, 'encodec'))
+        encoder_name = kwargs.pop('autoencoder')
+        quantizer_name = kwargs.pop('quantizer')
+        encoder, decoder = get_encodec_autoencoder(encoder_name, cfg)
+        quantizer = get_quantizer(quantizer_name, cfg, encoder.dimension)
+        frame_rate = kwargs['sample_rate'] // encoder.hop_length
+        renormalize = kwargs.pop('renormalize', None)
+        renorm = kwargs.pop('renorm')
+        if renormalize is None:
+            renormalize = renorm is not None
+            warnings.warn("You are using a deprecated EnCodec model. Please migrate to new renormalization.")
+        return EncodecModel(encoder, decoder, quantizer,
+                            frame_rate=frame_rate, renormalize=renormalize, **kwargs).to(cfg.device)
+    else:
+        raise KeyError(f'Unexpected compression model {cfg.compression_model}')
+
+
+def get_lm_model(cfg: omegaconf.DictConfig) -> LMModel:
+    """Instantiate a transformer LM.
+    """
+    if cfg.lm_model == 'transformer_lm':
+        kwargs = dict_from_config(getattr(cfg, 'transformer_lm'))
+        n_q = kwargs['n_q']
+        q_modeling = kwargs.pop('q_modeling', None)
+        codebooks_pattern_cfg = getattr(cfg, 'codebooks_pattern')
+        attribute_dropout = dict_from_config(getattr(cfg, 'attribute_dropout'))
+        cls_free_guidance = dict_from_config(getattr(cfg, 'classifier_free_guidance'))
+        cfg_prob, cfg_coef = cls_free_guidance["training_dropout"], cls_free_guidance["inference_coef"]
+        fuser = get_condition_fuser(cfg)
+        condition_provider = get_conditioner_provider(kwargs["dim"], cfg).to(cfg.device)
+        if len(fuser.fuse2cond['cross']) > 0:  # enforce cross-att programmatically
+            kwargs['cross_attention'] = True
+        if codebooks_pattern_cfg.modeling is None:
+            assert q_modeling is not None, \
+                'LM model should either have a codebook pattern defined or transformer_lm.q_modeling'
+            codebooks_pattern_cfg = omegaconf.OmegaConf.create(
+                {'modeling': q_modeling, 'delay': {'delays': list(range(n_q))}}
+            )
+        pattern_provider = get_codebooks_pattern_provider(n_q, codebooks_pattern_cfg)
+        return LMModel(
+            pattern_provider=pattern_provider,
+            condition_provider=condition_provider,
+            fuser=fuser,
+            cfg_dropout=cfg_prob,
+            cfg_coef=cfg_coef,
+            attribute_dropout=attribute_dropout,
+            dtype=getattr(torch, cfg.dtype),
+            device=cfg.device,
+            **kwargs
+        ).to(cfg.device)
+    else:
+        raise KeyError(f'Unexpected LM model {cfg.lm_model}')
+
+
+def get_conditioner_provider(output_dim: int, cfg: omegaconf.DictConfig) -> ConditioningProvider:
+    """Instantiate a conditioning model.
+    """
+    device = cfg.device
+    duration = cfg.dataset.segment_duration
+    cfg = getattr(cfg, "conditioners")
+    cfg = omegaconf.OmegaConf.create({}) if cfg is None else cfg
+    conditioners: tp.Dict[str, BaseConditioner] = {}
+    with omegaconf.open_dict(cfg):
+        condition_provider_args = cfg.pop('args', {})
+    for cond, cond_cfg in cfg.items():
+        model_type = cond_cfg["model"]
+        model_args = cond_cfg[model_type]
+        if model_type == "t5":
+            conditioners[str(cond)] = T5Conditioner(output_dim=output_dim, device=device, **model_args)
+        elif model_type == "lut":
+            conditioners[str(cond)] = LUTConditioner(output_dim=output_dim, **model_args)
+        elif model_type == "chroma_stem":
+            model_args.pop('cache_path', None)
+            conditioners[str(cond)] = ChromaStemConditioner(
+                output_dim=output_dim,
+                duration=duration,
+                device=device,
+                **model_args
+            )
+        else:
+            raise ValueError(f"unrecognized conditioning model: {model_type}")
+    conditioner = ConditioningProvider(conditioners, device=device, **condition_provider_args)
+    return conditioner
+
+
+def get_condition_fuser(cfg: omegaconf.DictConfig) -> ConditionFuser:
+    """Instantiate a condition fuser object.
+    """
+    fuser_cfg = getattr(cfg, "fuser")
+    fuser_methods = ["sum", "cross", "prepend", "input_interpolate"]
+    fuse2cond = {k: fuser_cfg[k] for k in fuser_methods}
+    kwargs = {k: v for k, v in fuser_cfg.items() if k not in fuser_methods}
+    fuser = ConditionFuser(fuse2cond=fuse2cond, **kwargs)
+    return fuser
+
+
+def get_codebooks_pattern_provider(n_q: int, cfg: omegaconf.DictConfig) -> CodebooksPatternProvider:
+    """Instantiate a codebooks pattern provider object.
+    """
+    pattern_providers = {
+        'parallel': ParallelPatternProvider,
+        'delay': DelayedPatternProvider,
+        'unroll': UnrolledPatternProvider,
+        'valle': VALLEPattern,
+        'musiclm': MusicLMPattern,
+    }
+    name = cfg.modeling
+    kwargs = dict_from_config(cfg.get(name)) if hasattr(cfg, name) else {}
+    klass = pattern_providers[name]
+    return klass(n_q, **kwargs)
+
+
+def get_debug_compression_model(device='cpu'):
+    """Instantiate a debug compression model to be used for unit tests.
+    """
+    seanet_kwargs = {
+        'n_filters': 4,
+        'n_residual_layers': 1,
+        'dimension': 32,
+        'ratios': [10, 8, 16]  # 25 Hz at 32kHz
+    }
+    encoder = audiocraft.modules.SEANetEncoder(**seanet_kwargs)
+    decoder = audiocraft.modules.SEANetDecoder(**seanet_kwargs)
+    quantizer = qt.ResidualVectorQuantizer(dimension=32, bins=400, n_q=4)
+    init_x = torch.randn(8, 32, 128)
+    quantizer(init_x, 1)  # initialize kmeans etc.
+    compression_model = EncodecModel(
+        encoder, decoder, quantizer,
+        frame_rate=25, sample_rate=32000, channels=1).to(device)
+    return compression_model.eval()
+
+
+def get_debug_lm_model(device='cpu'):
+    """Instantiate a debug LM to be used for unit tests.
+    """
+    pattern = DelayedPatternProvider(n_q=4)
+    dim = 16
+    providers = {
+        'description': LUTConditioner(n_bins=128, dim=dim, output_dim=dim, tokenizer="whitespace"),
+    }
+    condition_provider = ConditioningProvider(providers)
+    fuser = ConditionFuser(
+        {'cross': ['description'], 'prepend': [],
+         'sum': [], 'input_interpolate': []})
+    lm = LMModel(
+        pattern, condition_provider, fuser,
+        n_q=4, card=400, dim=dim, num_heads=4, custom=True, num_layers=2,
+        cross_attention=True, causal=True)
+    return lm.to(device).eval()
+
+
+
+
+
+
+
+

Functions

+
+
+def get_codebooks_pattern_provider(n_q: int, cfg: omegaconf.dictconfig.DictConfig) ‑> CodebooksPatternProvider +
+
+

Instantiate a codebooks pattern provider object.

+
+ +Expand source code + +
def get_codebooks_pattern_provider(n_q: int, cfg: omegaconf.DictConfig) -> CodebooksPatternProvider:
+    """Instantiate a codebooks pattern provider object.
+    """
+    pattern_providers = {
+        'parallel': ParallelPatternProvider,
+        'delay': DelayedPatternProvider,
+        'unroll': UnrolledPatternProvider,
+        'valle': VALLEPattern,
+        'musiclm': MusicLMPattern,
+    }
+    name = cfg.modeling
+    kwargs = dict_from_config(cfg.get(name)) if hasattr(cfg, name) else {}
+    klass = pattern_providers[name]
+    return klass(n_q, **kwargs)
+
+
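+
+A minimal sketch of the expected config, mirroring the fallback config built in get_lm_model
+(a 'delay' pattern with one delay per codebook); the values are illustrative only.
+
+import omegaconf
+from audiocraft.models.builders import get_codebooks_pattern_provider
+
+n_q = 4
+cfg = omegaconf.OmegaConf.create({'modeling': 'delay', 'delay': {'delays': list(range(n_q))}})
+pattern_provider = get_codebooks_pattern_provider(n_q, cfg)   # DelayedPatternProvider
+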
+
+def get_compression_model(cfg: omegaconf.dictconfig.DictConfig) ‑> CompressionModel +
+
+

Instantiate a compression model.

+
+ +Expand source code + +
def get_compression_model(cfg: omegaconf.DictConfig) -> CompressionModel:
+    """Instantiate a compression model.
+    """
+    if cfg.compression_model == 'encodec':
+        kwargs = dict_from_config(getattr(cfg, 'encodec'))
+        encoder_name = kwargs.pop('autoencoder')
+        quantizer_name = kwargs.pop('quantizer')
+        encoder, decoder = get_encodec_autoencoder(encoder_name, cfg)
+        quantizer = get_quantizer(quantizer_name, cfg, encoder.dimension)
+        frame_rate = kwargs['sample_rate'] // encoder.hop_length
+        renormalize = kwargs.pop('renormalize', None)
+        renorm = kwargs.pop('renorm')
+        if renormalize is None:
+            renormalize = renorm is not None
+            warnings.warn("You are using a deprecated EnCodec model. Please migrate to new renormalization.")
+        return EncodecModel(encoder, decoder, quantizer,
+                            frame_rate=frame_rate, renormalize=renormalize, **kwargs).to(cfg.device)
+    else:
+        raise KeyError(f'Unexpected compression model {cfg.compression_model}')
+
+
+
+def get_condition_fuser(cfg: omegaconf.dictconfig.DictConfig) ‑> ConditionFuser +
+
+

Instantiate a condition fuser object.

+
+ +Expand source code + +
def get_condition_fuser(cfg: omegaconf.DictConfig) -> ConditionFuser:
+    """Instantiate a condition fuser object.
+    """
+    fuser_cfg = getattr(cfg, "fuser")
+    fuser_methods = ["sum", "cross", "prepend", "input_interpolate"]
+    fuse2cond = {k: fuser_cfg[k] for k in fuser_methods}
+    kwargs = {k: v for k, v in fuser_cfg.items() if k not in fuser_methods}
+    fuser = ConditionFuser(fuse2cond=fuse2cond, **kwargs)
+    return fuser
+
+
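+
+A minimal sketch of the 'fuser' config this function expects, mirroring the fuser built in
+get_debug_lm_model (cross-attention over a 'description' condition only); illustrative values.
+
+import omegaconf
+from audiocraft.models.builders import get_condition_fuser
+
+cfg = omegaconf.OmegaConf.create({'fuser': {
+    'sum': [], 'cross': ['description'], 'prepend': [], 'input_interpolate': []}})
+fuser = get_condition_fuser(cfg)
+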
+
+def get_conditioner_provider(output_dim: int, cfg: omegaconf.dictconfig.DictConfig) ‑> ConditioningProvider +
+
+

Instantiate a conditioning model.

+
+ +Expand source code + +
def get_conditioner_provider(output_dim: int, cfg: omegaconf.DictConfig) -> ConditioningProvider:
+    """Instantiate a conditioning model.
+    """
+    device = cfg.device
+    duration = cfg.dataset.segment_duration
+    cfg = getattr(cfg, "conditioners")
+    cfg = omegaconf.OmegaConf.create({}) if cfg is None else cfg
+    conditioners: tp.Dict[str, BaseConditioner] = {}
+    with omegaconf.open_dict(cfg):
+        condition_provider_args = cfg.pop('args', {})
+    for cond, cond_cfg in cfg.items():
+        model_type = cond_cfg["model"]
+        model_args = cond_cfg[model_type]
+        if model_type == "t5":
+            conditioners[str(cond)] = T5Conditioner(output_dim=output_dim, device=device, **model_args)
+        elif model_type == "lut":
+            conditioners[str(cond)] = LUTConditioner(output_dim=output_dim, **model_args)
+        elif model_type == "chroma_stem":
+            model_args.pop('cache_path', None)
+            conditioners[str(cond)] = ChromaStemConditioner(
+                output_dim=output_dim,
+                duration=duration,
+                device=device,
+                **model_args
+            )
+        else:
+            raise ValueError(f"unrecognized conditioning model: {model_type}")
+    conditioner = ConditioningProvider(conditioners, device=device, **condition_provider_args)
+    return conditioner
+
+
+
+def get_debug_compression_model(device='cpu') +
+
+

Instantiate a debug compression model to be used for unit tests.

+
+ +Expand source code + +
def get_debug_compression_model(device='cpu'):
+    """Instantiate a debug compression model to be used for unit tests.
+    """
+    seanet_kwargs = {
+        'n_filters': 4,
+        'n_residual_layers': 1,
+        'dimension': 32,
+        'ratios': [10, 8, 16]  # 25 Hz at 32kHz
+    }
+    encoder = audiocraft.modules.SEANetEncoder(**seanet_kwargs)
+    decoder = audiocraft.modules.SEANetDecoder(**seanet_kwargs)
+    quantizer = qt.ResidualVectorQuantizer(dimension=32, bins=400, n_q=4)
+    init_x = torch.randn(8, 32, 128)
+    quantizer(init_x, 1)  # initialize kmeans etc.
+    compression_model = EncodecModel(
+        encoder, decoder, quantizer,
+        frame_rate=25, sample_rate=32000, channels=1).to(device)
+    return compression_model.eval()
+
+
+
+def get_debug_lm_model(device='cpu') +
+
+

Instantiate a debug LM to be used for unit tests.

+
+ +Expand source code + +
def get_debug_lm_model(device='cpu'):
+    """Instantiate a debug LM to be used for unit tests.
+    """
+    pattern = DelayedPatternProvider(n_q=4)
+    dim = 16
+    providers = {
+        'description': LUTConditioner(n_bins=128, dim=dim, output_dim=dim, tokenizer="whitespace"),
+    }
+    condition_provider = ConditioningProvider(providers)
+    fuser = ConditionFuser(
+        {'cross': ['description'], 'prepend': [],
+         'sum': [], 'input_interpolate': []})
+    lm = LMModel(
+        pattern, condition_provider, fuser,
+        n_q=4, card=400, dim=dim, num_heads=4, custom=True, num_layers=2,
+        cross_attention=True, causal=True)
+    return lm.to(device).eval()
+
+
+
+def get_encodec_autoencoder(encoder_name: str, cfg: omegaconf.dictconfig.DictConfig) +
+
+
+
+ +Expand source code + +
def get_encodec_autoencoder(encoder_name: str, cfg: omegaconf.DictConfig):
+    if encoder_name == 'seanet':
+        kwargs = dict_from_config(getattr(cfg, 'seanet'))
+        encoder_override_kwargs = kwargs.pop('encoder')
+        decoder_override_kwargs = kwargs.pop('decoder')
+        encoder_kwargs = {**kwargs, **encoder_override_kwargs}
+        decoder_kwargs = {**kwargs, **decoder_override_kwargs}
+        encoder = audiocraft.modules.SEANetEncoder(**encoder_kwargs)
+        decoder = audiocraft.modules.SEANetDecoder(**decoder_kwargs)
+        return encoder, decoder
+    else:
+        raise KeyError(f'Unexpected compression model {cfg.compression_model}')
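+
+
+# The override merge above, in miniature: shared SEANet kwargs are overridden
+# by the 'encoder'/'decoder' sub-dicts (purely illustrative values).
+_shared = {'n_filters': 32, 'dimension': 128, 'encoder': {'n_filters': 16}, 'decoder': {}}
+_enc_over = _shared.pop('encoder')
+_dec_over = _shared.pop('decoder')
+_encoder_kwargs = {**_shared, **_enc_over}   # {'n_filters': 16, 'dimension': 128}
+_decoder_kwargs = {**_shared, **_dec_over}   # {'n_filters': 32, 'dimension': 128}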
+
+
+
+def get_lm_model(cfg: omegaconf.dictconfig.DictConfig) ‑> LMModel +
+
+

Instantiate a transformer LM.

+
+ +Expand source code + +
def get_lm_model(cfg: omegaconf.DictConfig) -> LMModel:
+    """Instantiate a transformer LM.
+    """
+    if cfg.lm_model == 'transformer_lm':
+        kwargs = dict_from_config(getattr(cfg, 'transformer_lm'))
+        n_q = kwargs['n_q']
+        q_modeling = kwargs.pop('q_modeling', None)
+        codebooks_pattern_cfg = getattr(cfg, 'codebooks_pattern')
+        attribute_dropout = dict_from_config(getattr(cfg, 'attribute_dropout'))
+        cls_free_guidance = dict_from_config(getattr(cfg, 'classifier_free_guidance'))
+        cfg_prob, cfg_coef = cls_free_guidance["training_dropout"], cls_free_guidance["inference_coef"]
+        fuser = get_condition_fuser(cfg)
+        condition_provider = get_conditioner_provider(kwargs["dim"], cfg).to(cfg.device)
+        if len(fuser.fuse2cond['cross']) > 0:  # enforce cross-attention programmatically
+            kwargs['cross_attention'] = True
+        if codebooks_pattern_cfg.modeling is None:
+            assert q_modeling is not None, \
+                'LM model should either have a codebook pattern defined or transformer_lm.q_modeling'
+            codebooks_pattern_cfg = omegaconf.OmegaConf.create(
+                {'modeling': q_modeling, 'delay': {'delays': list(range(n_q))}}
+            )
+        pattern_provider = get_codebooks_pattern_provider(n_q, codebooks_pattern_cfg)
+        return LMModel(
+            pattern_provider=pattern_provider,
+            condition_provider=condition_provider,
+            fuser=fuser,
+            cfg_dropout=cfg_prob,
+            cfg_coef=cfg_coef,
+            attribute_dropout=attribute_dropout,
+            dtype=getattr(torch, cfg.dtype),
+            device=cfg.device,
+            **kwargs
+        ).to(cfg.device)
+    else:
+        raise KeyError(f'Unexpected LM model {cfg.lm_model}')
+
+
+
+def get_quantizer(quantizer: str, cfg: omegaconf.dictconfig.DictConfig, dimension: int) ‑> BaseQuantizer +
+
+
+
+ +Expand source code + +
def get_quantizer(quantizer: str, cfg: omegaconf.DictConfig, dimension: int) -> qt.BaseQuantizer:
+    klass = {
+        'no_quant': qt.DummyQuantizer,
+        'rvq': qt.ResidualVectorQuantizer
+    }[quantizer]
+    kwargs = dict_from_config(getattr(cfg, quantizer))
+    if quantizer != 'no_quant':
+        kwargs['dimension'] = dimension
+    return klass(**kwargs)
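+
+
+# Sketch: building an RVQ quantizer from a minimal config. Only 'n_q' and
+# 'bins' are set here; other ResidualVectorQuantizer options keep their defaults.
+_rvq_cfg = omegaconf.OmegaConf.create({'rvq': {'n_q': 4, 'bins': 1024}})
+_example_quantizer = get_quantizer('rvq', _rvq_cfg, dimension=128)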
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/models/encodec.html b/docs/audiocraft/models/encodec.html new file mode 100644 index 00000000..6413ca4d --- /dev/null +++ b/docs/audiocraft/models/encodec.html @@ -0,0 +1,1306 @@ + + + + + + +audiocraft.models.encodec API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.models.encodec

+
+
+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from abc import ABC, abstractmethod
+import typing as tp
+
+from einops import rearrange
+import torch
+from torch import nn
+
+from .. import quantization as qt
+
+
+class CompressionModel(ABC, nn.Module):
+
+    @abstractmethod
+    def forward(self, x: torch.Tensor) -> qt.QuantizedResult:
+        ...
+
+    @abstractmethod
+    def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
+        """See `EncodecModel.encode`"""
+        ...
+
+    @abstractmethod
+    def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
+        """See `EncodecModel.decode`"""
+        ...
+
+    @property
+    @abstractmethod
+    def channels(self) -> int:
+        ...
+
+    @property
+    @abstractmethod
+    def frame_rate(self) -> int:
+        ...
+
+    @property
+    @abstractmethod
+    def sample_rate(self) -> int:
+        ...
+
+    @property
+    @abstractmethod
+    def cardinality(self) -> int:
+        ...
+
+    @property
+    @abstractmethod
+    def num_codebooks(self) -> int:
+        ...
+
+    @property
+    @abstractmethod
+    def total_codebooks(self) -> int:
+        ...
+
+    @abstractmethod
+    def set_num_codebooks(self, n: int):
+        """Set the active number of codebooks used by the quantizer.
+        """
+        ...
+
+
+class EncodecModel(CompressionModel):
+    """Encodec model operating on the raw waveform.
+
+    Args:
+        encoder (nn.Module): Encoder network.
+        decoder (nn.Module): Decoder network.
+        quantizer (qt.BaseQuantizer): Quantizer network.
+        frame_rate (int): Frame rate for the latent representation.
+        sample_rate (int): Audio sample rate.
+        channels (int): Number of audio channels.
+        causal (bool): Whether to use a causal version of the model.
+        renormalize (bool): Whether to renormalize the audio before running the model.
+    """
+    # we need assignment to override the property in the abstract class,
+    # I couldn't find a better way...
+    frame_rate: int = 0
+    sample_rate: int = 0
+    channels: int = 0
+
+    def __init__(self,
+                 encoder: nn.Module,
+                 decoder: nn.Module,
+                 quantizer: qt.BaseQuantizer,
+                 frame_rate: int,
+                 sample_rate: int,
+                 channels: int,
+                 causal: bool = False,
+                 renormalize: bool = False):
+        super().__init__()
+        self.encoder = encoder
+        self.decoder = decoder
+        self.quantizer = quantizer
+        self.frame_rate = frame_rate
+        self.sample_rate = sample_rate
+        self.channels = channels
+        self.renormalize = renormalize
+        self.causal = causal
+        if self.causal:
+            # we force disabling here to avoid handling linear overlap of segments
+            # as supported in original EnCodec codebase.
+            assert not self.renormalize, 'Causal model does not support renormalize'
+
+    @property
+    def total_codebooks(self):
+        """Total number of quantizer codebooks available.
+        """
+        return self.quantizer.total_codebooks
+
+    @property
+    def num_codebooks(self):
+        """Active number of codebooks used by the quantizer.
+        """
+        return self.quantizer.num_codebooks
+
+    def set_num_codebooks(self, n: int):
+        """Set the active number of codebooks used by the quantizer.
+        """
+        self.quantizer.set_num_codebooks(n)
+
+    @property
+    def cardinality(self):
+        """Cardinality of each codebook.
+        """
+        return self.quantizer.bins
+
+    def preprocess(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
+        scale: tp.Optional[torch.Tensor]
+        if self.renormalize:
+            mono = x.mean(dim=1, keepdim=True)
+            volume = mono.pow(2).mean(dim=2, keepdim=True).sqrt()
+            scale = 1e-8 + volume
+            x = x / scale
+            scale = scale.view(-1, 1)
+        else:
+            scale = None
+        return x, scale
+
+    def postprocess(self,
+                    x: torch.Tensor,
+                    scale: tp.Optional[torch.Tensor] = None) -> torch.Tensor:
+        if scale is not None:
+            assert self.renormalize
+            x = x * scale.view(-1, 1, 1)
+        return x
+
+    def forward(self, x: torch.Tensor) -> qt.QuantizedResult:
+        assert x.dim() == 3
+        length = x.shape[-1]
+        x, scale = self.preprocess(x)
+
+        emb = self.encoder(x)
+        q_res = self.quantizer(emb, self.frame_rate)
+        out = self.decoder(q_res.x)
+
+        # remove extra padding added by the encoder and decoder
+        assert out.shape[-1] >= length, (out.shape[-1], length)
+        out = out[..., :length]
+
+        q_res.x = self.postprocess(out, scale)
+
+        return q_res
+
+    def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
+        """Encode the given input tensor to quantized representation along with scale parameter.
+
+        Args:
+            x (torch.Tensor): Float tensor of shape [B, C, T]
+
+        Returns:
+            codes, scale (tp.Tuple[torch.Tensor, torch.Tensor]): Tuple composed of:
+                codes, an int tensor of shape [B, K, T] with K the number of codebooks used and T the timestep.
+                scale, a float tensor containing the scale for audio renormalization.
+        """
+        assert x.dim() == 3
+        x, scale = self.preprocess(x)
+        emb = self.encoder(x)
+        codes = self.quantizer.encode(emb)
+        return codes, scale
+
+    def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
+        """Decode the given codes to a reconstructed representation, using the scale to perform
+        audio denormalization if needed.
+
+        Args:
+            codes (torch.Tensor): Int tensor of shape [B, K, T]
+            scale (tp.Optional[torch.Tensor]): Float tensor containing the scale value.
+
+        Returns:
+            out (torch.Tensor): Float tensor of shape [B, C, T], the reconstructed audio.
+        """
+        emb = self.quantizer.decode(codes)
+        out = self.decoder(emb)
+        out = self.postprocess(out, scale)
+        # out contains extra padding added by the encoder and decoder
+        return out
+
+
+class FlattenedCompressionModel(CompressionModel):
+    """Wraps a CompressionModel and flatten its codebooks, e.g.
+    instead of returning [B, K, T], return [B, S, T * (K // S)] with
+    S the number of codebooks per step, and `K // S` the number of 'virtual steps'
+    for each real time step.
+
+    Args:
+        model (CompressionModel): compression model to wrap.
+        codebooks_per_step (int): number of codebooks to keep per step,
+            this must divide the number of codebooks provided by the wrapped model.
+        extend_cardinality (bool): if True, and for instance if codebooks_per_step = 1,
+            if each codebook has a cardinality N, then the first codebook will
+            use the range [0, N - 1], and the second [N, 2 N - 1] etc.
+            On decoding, this can lead to potentially invalid sequences.
+            Any invalid entry will be silently remapped to the proper range
+            with a modulo.
+    """
+    def __init__(self, model: CompressionModel, codebooks_per_step: int = 1,
+                 extend_cardinality: bool = True):
+        super().__init__()
+        self.model = model
+        self.codebooks_per_step = codebooks_per_step
+        self.extend_cardinality = extend_cardinality
+
+    @property
+    def total_codebooks(self):
+        return self.model.total_codebooks
+
+    @property
+    def num_codebooks(self):
+        """Active number of codebooks used by the quantizer.
+
+        ..Warning:: this reports the number of codebooks after the flattening
+        of the codebooks!
+        """
+        assert self.model.num_codebooks % self.codebooks_per_step == 0
+        return self.codebooks_per_step
+
+    def set_num_codebooks(self, n: int):
+        """Set the active number of codebooks used by the quantizer.
+
+        ..Warning:: this sets the number of codebooks **before** the flattening
+        of the codebooks.
+        """
+        assert n % self.codebooks_per_step == 0
+        self.model.set_num_codebooks(n)
+
+    @property
+    def num_virtual_steps(self) -> int:
+        """Return the number of virtual steps, e.g. one real step
+        will be split into that many steps.
+        """
+        return self.model.num_codebooks // self.codebooks_per_step
+
+    @property
+    def frame_rate(self) -> int:
+        return self.model.frame_rate * self.num_virtual_steps
+
+    @property
+    def sample_rate(self) -> int:
+        return self.model.sample_rate
+
+    @property
+    def channels(self) -> int:
+        return self.model.channels
+
+    @property
+    def cardinality(self):
+        """Cardinality of each codebook.
+        """
+        if self.extend_cardinality:
+            return self.model.cardinality * self.num_virtual_steps
+        else:
+            return self.model.cardinality
+
+    def forward(self, x: torch.Tensor) -> qt.QuantizedResult:
+        raise NotImplementedError("Not supported, use encode and decode.")
+
+    def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
+        indices, scales = self.model.encode(x)
+        B, K, T = indices.shape
+        indices = rearrange(indices, 'b (k v) t -> b k t v', k=self.codebooks_per_step)
+        if self.extend_cardinality:
+            for virtual_step in range(1, self.num_virtual_steps):
+                indices[..., virtual_step] += self.model.cardinality * virtual_step
+        indices = rearrange(indices, 'b k t v -> b k (t v)')
+        return (indices, scales)
+
+    def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
+        B, K, T = codes.shape
+        assert T % self.num_virtual_steps == 0
+        codes = rearrange(codes, 'b k (t v) -> b (k v) t', v=self.num_virtual_steps)
+        # We silently ignore potential errors from the LM when
+        # using extend_cardinality.
+        codes = codes % self.model.cardinality
+        return self.model.decode(codes, scale)
+
+
+
+
+
+
+
+
+
+

Classes

+
+
+class CompressionModel +(*args, **kwargs) +
+
+

Helper class that provides a standard way to create an ABC using +inheritance.

+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class CompressionModel(ABC, nn.Module):
+
+    @abstractmethod
+    def forward(self, x: torch.Tensor) -> qt.QuantizedResult:
+        ...
+
+    @abstractmethod
+    def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
+        """See `EncodecModel.encode`"""
+        ...
+
+    @abstractmethod
+    def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
+        """See `EncodecModel.decode`"""
+        ...
+
+    @property
+    @abstractmethod
+    def channels(self) -> int:
+        ...
+
+    @property
+    @abstractmethod
+    def frame_rate(self) -> int:
+        ...
+
+    @property
+    @abstractmethod
+    def sample_rate(self) -> int:
+        ...
+
+    @property
+    @abstractmethod
+    def cardinality(self) -> int:
+        ...
+
+    @property
+    @abstractmethod
+    def num_codebooks(self) -> int:
+        ...
+
+    @property
+    @abstractmethod
+    def total_codebooks(self) -> int:
+        ...
+
+    @abstractmethod
+    def set_num_codebooks(self, n: int):
+        """Set the active number of codebooks used by the quantizer.
+        """
+        ...
+
+

Ancestors

+
    +
  • abc.ABC
  • +
  • torch.nn.modules.module.Module
  • +
+

Subclasses

+ +

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Instance variables

+
+
var cardinality : int
+
+
+
+ +Expand source code + +
@property
+@abstractmethod
+def cardinality(self) -> int:
+    ...
+
+
+
var channels : int
+
+
+
+ +Expand source code + +
@property
+@abstractmethod
+def channels(self) -> int:
+    ...
+
+
+
var frame_rate : int
+
+
+
+ +Expand source code + +
@property
+@abstractmethod
+def frame_rate(self) -> int:
+    ...
+
+
+
var num_codebooks : int
+
+
+
+ +Expand source code + +
@property
+@abstractmethod
+def num_codebooks(self) -> int:
+    ...
+
+
+
var sample_rate : int
+
+
+
+ +Expand source code + +
@property
+@abstractmethod
+def sample_rate(self) -> int:
+    ...
+
+
+
var total_codebooks : int
+
+
+
+ +Expand source code + +
@property
+@abstractmethod
+def total_codebooks(self) -> int:
+    ...
+
+
+
+

Methods

+
+
+def decode(self, codes: torch.Tensor, scale: Optional[torch.Tensor] = None) +
+
+ +
+ +Expand source code + +
@abstractmethod
+def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
+    """See `EncodecModel.decode`"""
+    ...
+
+
+
+def encode(self, x: torch.Tensor) ‑> Tuple[torch.Tensor, Optional[torch.Tensor]] +
+
+ +
+ +Expand source code + +
@abstractmethod
+def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
+    """See `EncodecModel.encode`"""
+    ...
+
+
+
+def forward(self, x: torch.Tensor) ‑> QuantizedResult +
+
+

Defines the computation performed at every call.

+

Should be overridden by all subclasses.

+
+

Note

+

Although the recipe for forward pass needs to be defined within +this function, one should call the :class:Module instance afterwards +instead of this since the former takes care of running the +registered hooks while the latter silently ignores them.

+
+
+ +Expand source code + +
@abstractmethod
+def forward(self, x: torch.Tensor) -> qt.QuantizedResult:
+    ...
+
+
+
+def set_num_codebooks(self, n: int) +
+
+

Set the active number of codebooks used by the quantizer.

+
+ +Expand source code + +
@abstractmethod
+def set_num_codebooks(self, n: int):
+    """Set the active number of codebooks used by the quantizer.
+    """
+    ...
+
+
+
+
+
+class EncodecModel +(encoder: torch.nn.modules.module.Module, decoder: torch.nn.modules.module.Module, quantizer: BaseQuantizer, frame_rate: int, sample_rate: int, channels: int, causal: bool = False, renormalize: bool = False) +
+
+

Encodec model operating on the raw waveform.

+

Args

+
+
encoder : nn.Module
+
Encoder network.
+
decoder : nn.Module
+
Decoder network.
+
quantizer : qt.BaseQuantizer
+
Quantizer network.
+
frame_rate : int
+
Frame rate for the latent representation.
+
sample_rate : int
+
Audio sample rate.
+
channels : int
+
Number of audio channels.
+
causal : bool
+
Whether to use a causal version of the model.
+
renormalize : bool
+
Whether to renormalize the audio before running the model.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class EncodecModel(CompressionModel):
+    """Encodec model operating on the raw waveform.
+
+    Args:
+        encoder (nn.Module): Encoder network.
+        decoder (nn.Module): Decoder network.
+        quantizer (qt.BaseQuantizer): Quantizer network.
+        frame_rate (int): Frame rate for the latent representation.
+        sample_rate (int): Audio sample rate.
+        channels (int): Number of audio channels.
+        causal (bool): Whether to use a causal version of the model.
+        renormalize (bool): Whether to renormalize the audio before running the model.
+    """
+    # we need assignment to override the property in the abstract class,
+    # I couldn't find a better way...
+    frame_rate: int = 0
+    sample_rate: int = 0
+    channels: int = 0
+
+    def __init__(self,
+                 encoder: nn.Module,
+                 decoder: nn.Module,
+                 quantizer: qt.BaseQuantizer,
+                 frame_rate: int,
+                 sample_rate: int,
+                 channels: int,
+                 causal: bool = False,
+                 renormalize: bool = False):
+        super().__init__()
+        self.encoder = encoder
+        self.decoder = decoder
+        self.quantizer = quantizer
+        self.frame_rate = frame_rate
+        self.sample_rate = sample_rate
+        self.channels = channels
+        self.renormalize = renormalize
+        self.causal = causal
+        if self.causal:
+            # we force disabling here to avoid handling linear overlap of segments
+            # as supported in original EnCodec codebase.
+            assert not self.renormalize, 'Causal model does not support renormalize'
+
+    @property
+    def total_codebooks(self):
+        """Total number of quantizer codebooks available.
+        """
+        return self.quantizer.total_codebooks
+
+    @property
+    def num_codebooks(self):
+        """Active number of codebooks used by the quantizer.
+        """
+        return self.quantizer.num_codebooks
+
+    def set_num_codebooks(self, n: int):
+        """Set the active number of codebooks used by the quantizer.
+        """
+        self.quantizer.set_num_codebooks(n)
+
+    @property
+    def cardinality(self):
+        """Cardinality of each codebook.
+        """
+        return self.quantizer.bins
+
+    def preprocess(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
+        scale: tp.Optional[torch.Tensor]
+        if self.renormalize:
+            mono = x.mean(dim=1, keepdim=True)
+            volume = mono.pow(2).mean(dim=2, keepdim=True).sqrt()
+            scale = 1e-8 + volume
+            x = x / scale
+            scale = scale.view(-1, 1)
+        else:
+            scale = None
+        return x, scale
+
+    def postprocess(self,
+                    x: torch.Tensor,
+                    scale: tp.Optional[torch.Tensor] = None) -> torch.Tensor:
+        if scale is not None:
+            assert self.renormalize
+            x = x * scale.view(-1, 1, 1)
+        return x
+
+    def forward(self, x: torch.Tensor) -> qt.QuantizedResult:
+        assert x.dim() == 3
+        length = x.shape[-1]
+        x, scale = self.preprocess(x)
+
+        emb = self.encoder(x)
+        q_res = self.quantizer(emb, self.frame_rate)
+        out = self.decoder(q_res.x)
+
+        # remove extra padding added by the encoder and decoder
+        assert out.shape[-1] >= length, (out.shape[-1], length)
+        out = out[..., :length]
+
+        q_res.x = self.postprocess(out, scale)
+
+        return q_res
+
+    def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
+        """Encode the given input tensor to quantized representation along with scale parameter.
+
+        Args:
+            x (torch.Tensor): Float tensor of shape [B, C, T]
+
+        Returns:
+            codes, scale (tp.Tuple[torch.Tensor, torch.Tensor]): Tuple composed of:
+                codes, an int tensor of shape [B, K, T] with K the number of codebooks used and T the timestep.
+                scale, a float tensor containing the scale for audio renormalization.
+        """
+        assert x.dim() == 3
+        x, scale = self.preprocess(x)
+        emb = self.encoder(x)
+        codes = self.quantizer.encode(emb)
+        return codes, scale
+
+    def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
+        """Decode the given codes to a reconstructed representation, using the scale to perform
+        audio denormalization if needed.
+
+        Args:
+            codes (torch.Tensor): Int tensor of shape [B, K, T]
+            scale (tp.Optional[torch.Tensor]): Float tensor containing the scale value.
+
+        Returns:
+            out (torch.Tensor): Float tensor of shape [B, C, T], the reconstructed audio.
+        """
+        emb = self.quantizer.decode(codes)
+        out = self.decoder(emb)
+        out = self.postprocess(out, scale)
+        # out contains extra padding added by the encoder and decoder
+        return out
+
+

Ancestors

+ +

Class variables

+
+
var channels : int
+
+
+
+
var frame_rate : int
+
+
+
+
var sample_rate : int
+
+
+
+
+

Instance variables

+
+
var cardinality
+
+

Cardinality of each codebook.

+
+ +Expand source code + +
@property
+def cardinality(self):
+    """Cardinality of each codebook.
+    """
+    return self.quantizer.bins
+
+
+
var num_codebooks
+
+

Active number of codebooks used by the quantizer.

+
+ +Expand source code + +
@property
+def num_codebooks(self):
+    """Active number of codebooks used by the quantizer.
+    """
+    return self.quantizer.num_codebooks
+
+
+
var total_codebooks
+
+

Total number of quantizer codebooks available.

+
+ +Expand source code + +
@property
+def total_codebooks(self):
+    """Total number of quantizer codebooks available.
+    """
+    return self.quantizer.total_codebooks
+
+
+
+

Methods

+
+
+def decode(self, codes: torch.Tensor, scale: Optional[torch.Tensor] = None) +
+
+

Decode the given codes to a reconstructed representation, using the scale to perform +audio denormalization if needed.

+

Args

+
+
codes : torch.Tensor
+
Int tensor of shape [B, K, T]
+
scale : tp.Optional[torch.Tensor]
+
Float tensor containing the scale value.
+
+

Returns

+

out (torch.Tensor): Float tensor of shape [B, C, T], the reconstructed audio.

+
+ +Expand source code + +
def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
+    """Decode the given codes to a reconstructed representation, using the scale to perform
+    audio denormalization if needed.
+
+    Args:
+        codes (torch.Tensor): Int tensor of shape [B, K, T]
+        scale (tp.Optional[torch.Tensor]): Float tensor containing the scale value.
+
+    Returns:
+        out (torch.Tensor): Float tensor of shape [B, C, T], the reconstructed audio.
+    """
+    emb = self.quantizer.decode(codes)
+    out = self.decoder(emb)
+    out = self.postprocess(out, scale)
+    # out contains extra padding added by the encoder and decoder
+    return out
+
+
+
+def encode(self, x: torch.Tensor) ‑> Tuple[torch.Tensor, Optional[torch.Tensor]] +
+
+

Encode the given input tensor to quantized representation along with scale parameter.

+

Args

+
+
x : torch.Tensor
+
Float tensor of shape [B, C, T]
+
+

Returns

+

codes, scale (tp.Tuple[torch.Tensor, torch.Tensor]): Tuple composed of: +codes, an int tensor of shape [B, K, T] with K the number of codebooks used and T the timestep. +scale, a float tensor containing the scale for audio renormalization.

+
+ +Expand source code + +
def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
+    """Encode the given input tensor to quantized representation along with scale parameter.
+
+    Args:
+        x (torch.Tensor): Float tensor of shape [B, C, T]
+
+    Returns:
+        codes, scale (tp.Tuple[torch.Tensor, torch.Tensor]): Tuple composed of:
+            codes, an int tensor of shape [B, K, T] with K the number of codebooks used and T the timestep.
+            scale, a float tensor containing the scale for audio renormalization.
+    """
+    assert x.dim() == 3
+    x, scale = self.preprocess(x)
+    emb = self.encoder(x)
+    codes = self.quantizer.encode(emb)
+    return codes, scale
+
+
+
+def postprocess(self, x: torch.Tensor, scale: Optional[torch.Tensor] = None) ‑> torch.Tensor +
+
+
+
+ +Expand source code + +
def postprocess(self,
+                x: torch.Tensor,
+                scale: tp.Optional[torch.Tensor] = None) -> torch.Tensor:
+    if scale is not None:
+        assert self.renormalize
+        x = x * scale.view(-1, 1, 1)
+    return x
+
+
+
+def preprocess(self, x: torch.Tensor) ‑> Tuple[torch.Tensor, Optional[torch.Tensor]] +
+
+
+
+ +Expand source code + +
def preprocess(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
+    scale: tp.Optional[torch.Tensor]
+    if self.renormalize:
+        mono = x.mean(dim=1, keepdim=True)
+        volume = mono.pow(2).mean(dim=2, keepdim=True).sqrt()
+        scale = 1e-8 + volume
+        x = x / scale
+        scale = scale.view(-1, 1)
+    else:
+        scale = None
+    return x, scale
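+
+
+# Standalone numeric check of the renormalization math (sketch): for a constant
+# 0.5-amplitude stereo signal the mono RMS is 0.5, so scale ≈ 0.5 and the
+# normalized waveform has RMS ≈ 1.
+import torch
+_x = torch.full((1, 2, 16000), 0.5)
+_mono = _x.mean(dim=1, keepdim=True)
+_volume = _mono.pow(2).mean(dim=2, keepdim=True).sqrt()
+_scale = 1e-8 + _volume        # ≈ 0.5, shape [1, 1, 1] before the .view(-1, 1)
+_x_norm = _x / _scale          # RMS ≈ 1.0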
+
+
+
+

Inherited members

+ +
+
+class FlattenedCompressionModel +(model: CompressionModel, codebooks_per_step: int = 1, extend_cardinality: bool = True) +
+
+

Wraps a CompressionModel and flattens its codebooks, e.g. +instead of returning [B, K, T], return [B, S, T * (K // S)] with +S the number of codebooks per step, and K // S the number of 'virtual steps' +for each real time step.

+

Args

+
+
model : CompressionModel
+
compression model to wrap.
+
codebooks_per_step : int
+
number of codebooks to keep per step, +this must divide the number of codebooks provided by the wrapped model.
+
extend_cardinality : bool
+
if True, and for instance if codebooks_per_step = 1, +if each codebook has a cardinality N, then the first codebook will +use the range [0, N - 1], and the second [N, 2 N - 1] etc. +On decoding, this can lead to potentially invalid sequences. +Any invalid entry will be silently remapped to the proper range +with a modulo.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class FlattenedCompressionModel(CompressionModel):
+    """Wraps a CompressionModel and flatten its codebooks, e.g.
+    instead of returning [B, K, T], return [B, S, T * (K // S)] with
+    S the number of codebooks per step, and `K // S` the number of 'virtual steps'
+    for each real time step.
+
+    Args:
+        model (CompressionModel): compression model to wrap.
+        codebooks_per_step (int): number of codebooks to keep per step,
+            this must divide the number of codebooks provided by the wrapped model.
+        extend_cardinality (bool): if True, and for instance if codebooks_per_step = 1,
+            if each codebook has a cardinality N, then the first codebook will
+            use the range [0, N - 1], and the second [N, 2 N - 1] etc.
+            On decoding, this can lead to potentially invalid sequences.
+            Any invalid entry will be silently remapped to the proper range
+            with a modulo.
+    """
+    def __init__(self, model: CompressionModel, codebooks_per_step: int = 1,
+                 extend_cardinality: bool = True):
+        super().__init__()
+        self.model = model
+        self.codebooks_per_step = codebooks_per_step
+        self.extend_cardinality = extend_cardinality
+
+    @property
+    def total_codebooks(self):
+        return self.model.total_codebooks
+
+    @property
+    def num_codebooks(self):
+        """Active number of codebooks used by the quantizer.
+
+        ..Warning:: this reports the number of codebooks after the flattening
+        of the codebooks!
+        """
+        assert self.model.num_codebooks % self.codebooks_per_step == 0
+        return self.codebooks_per_step
+
+    def set_num_codebooks(self, n: int):
+        """Set the active number of codebooks used by the quantizer.
+
+        ..Warning:: this sets the number of codebooks **before** the flattening
+        of the codebooks.
+        """
+        assert n % self.codebooks_per_step == 0
+        self.model.set_num_codebooks(n)
+
+    @property
+    def num_virtual_steps(self) -> int:
+        """Return the number of virtual steps, e.g. one real step
+        will be split into that many steps.
+        """
+        return self.model.num_codebooks // self.codebooks_per_step
+
+    @property
+    def frame_rate(self) -> int:
+        return self.model.frame_rate * self.num_virtual_steps
+
+    @property
+    def sample_rate(self) -> int:
+        return self.model.sample_rate
+
+    @property
+    def channels(self) -> int:
+        return self.model.channels
+
+    @property
+    def cardinality(self):
+        """Cardinality of each codebook.
+        """
+        if self.extend_cardinality:
+            return self.model.cardinality * self.num_virtual_steps
+        else:
+            return self.model.cardinality
+
+    def forward(self, x: torch.Tensor) -> qt.QuantizedResult:
+        raise NotImplementedError("Not supported, use encode and decode.")
+
+    def encode(self, x: torch.Tensor) -> tp.Tuple[torch.Tensor, tp.Optional[torch.Tensor]]:
+        indices, scales = self.model.encode(x)
+        B, K, T = indices.shape
+        indices = rearrange(indices, 'b (k v) t -> b k t v', k=self.codebooks_per_step)
+        if self.extend_cardinality:
+            for virtual_step in range(1, self.num_virtual_steps):
+                indices[..., virtual_step] += self.model.cardinality * virtual_step
+        indices = rearrange(indices, 'b k t v -> b k (t v)')
+        return (indices, scales)
+
+    def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
+        B, K, T = codes.shape
+        assert T % self.num_virtual_steps == 0
+        codes = rearrange(codes, 'b k (t v) -> b (k v) t', v=self.num_virtual_steps)
+        # We silently ignore potential errors from the LM when
+        # using extend_cardinality.
+        codes = codes % self.model.cardinality
+        return self.model.decode(codes, scale)
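+
+
+# Worked example of the flattening itself (standalone sketch, no model needed):
+# with 4 codebooks of cardinality N=1024 and codebooks_per_step=1 there are 4
+# virtual steps, codes [B, 4, T] become [B, 1, 4*T], and with extend_cardinality
+# the v-th virtual step is shifted into the range [v*N, (v+1)*N).
+import torch
+from einops import rearrange
+_codes = torch.randint(0, 1024, (2, 4, 5))              # [B, K, T]
+_flat = rearrange(_codes, 'b (k v) t -> b k t v', k=1)  # [2, 1, 5, 4]
+for _v in range(1, 4):
+    _flat[..., _v] += 1024 * _v                         # extend the cardinality
+_flat = rearrange(_flat, 'b k t v -> b k (t v)')        # [2, 1, 20]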
+
+

Ancestors

+ +

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Instance variables

+
+
var cardinality
+
+

Cardinality of each codebook.

+
+ +Expand source code + +
@property
+def cardinality(self):
+    """Cardinality of each codebook.
+    """
+    if self.extend_cardinality:
+        return self.model.cardinality * self.num_virtual_steps
+    else:
+        return self.model.cardinality
+
+
+
var channels : int
+
+
+
+ +Expand source code + +
@property
+def channels(self) -> int:
+    return self.model.channels
+
+
+
var frame_rate : int
+
+
+
+ +Expand source code + +
@property
+def frame_rate(self) -> int:
+    return self.model.frame_rate * self.num_virtual_steps
+
+
+
var num_codebooks
+
+

Active number of codebooks used by the quantizer.

+
+

Warning: this reports the number of codebooks after the flattening

+
+

of the codebooks!

+
+ +Expand source code + +
@property
+def num_codebooks(self):
+    """Active number of codebooks used by the quantizer.
+
+    ..Warning:: this reports the number of codebooks after the flattening
+    of the codebooks!
+    """
+    assert self.model.num_codebooks % self.codebooks_per_step == 0
+    return self.codebooks_per_step
+
+
+
var num_virtual_steps : int
+
+

Return the number of virtual steps, e.g. one real step +will be split into that many steps.

+
+ +Expand source code + +
@property
+def num_virtual_steps(self) -> int:
+    """Return the number of virtual steps, e.g. one real step
+    will be split into that many steps.
+    """
+    return self.model.num_codebooks // self.codebooks_per_step
+
+
+
var sample_rate : int
+
+
+
+ +Expand source code + +
@property
+def sample_rate(self) -> int:
+    return self.model.sample_rate
+
+
+
var total_codebooks
+
+
+
+ +Expand source code + +
@property
+def total_codebooks(self):
+    return self.model.total_codebooks
+
+
+
+

Methods

+
+
+def set_num_codebooks(self, n: int) +
+
+

Set the active number of codebooks used by the quantizer.

+
+

Warning: this sets the number of codebooks before the flattening

+
+

of the codebooks.

+
+ +Expand source code + +
def set_num_codebooks(self, n: int):
+    """Set the active number of codebooks used by the quantizer.
+
+    ..Warning:: this sets the number of codebooks **before** the flattening
+    of the codebooks.
+    """
+    assert n % self.codebooks_per_step == 0
+    self.model.set_num_codebooks(n)
+
+
+
+

Inherited members

+ +
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/models/index.html b/docs/audiocraft/models/index.html new file mode 100644 index 00000000..f088aa04 --- /dev/null +++ b/docs/audiocraft/models/index.html @@ -0,0 +1,104 @@ + + + + + + +audiocraft.models API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.models

+
+
+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# flake8: noqa
+from .musicgen import MusicGen
+from .lm import LMModel
+from .encodec import CompressionModel, EncodecModel
+
+
+
+

Sub-modules

+
+
audiocraft.models.builders
+
+

All the functions to build the relevant models and modules +from the Hydra config.

+
+
audiocraft.models.encodec
+
+
+
+
audiocraft.models.lm
+
+
+
+
audiocraft.models.loaders
+
+

Utility functions to load from the checkpoints. +Each checkpoint is a torch.saved dict with the following keys: +- 'xp.cfg': the hydra config as dumped …

+
+
audiocraft.models.musicgen
+
+

Main model for using MusicGen. This will combine all the required components +and provide easy access to the generation API.

+
+
+
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/models/lm.html b/docs/audiocraft/models/lm.html new file mode 100644 index 00000000..0f6a515b --- /dev/null +++ b/docs/audiocraft/models/lm.html @@ -0,0 +1,1721 @@ + + + + + + +audiocraft.models.lm API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.models.lm

+
+
+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass
+from functools import partial
+import logging
+import math
+import typing as tp
+
+import torch
+from torch import nn
+
+from ..utils import utils
+from ..modules.streaming import StreamingModule, State
+from ..modules.transformer import StreamingTransformer, create_norm_fn
+from ..modules.conditioners import (
+    ConditionFuser,
+    ClassifierFreeGuidanceDropout,
+    AttributeDropout,
+    ConditioningProvider,
+    ConditioningAttributes,
+    ConditionType,
+)
+from ..modules.codebooks_patterns import CodebooksPatternProvider
+from ..modules.activations import get_activation_fn
+
+
+logger = logging.getLogger(__name__)
+ConditionTensors = tp.Dict[str, ConditionType]
+CFGConditions = tp.Union[ConditionTensors, tp.Tuple[ConditionTensors, ConditionTensors]]
+
+
+def get_init_fn(method: str, input_dim: int, init_depth: tp.Optional[int] = None):
+    """LM layer initialization.
+    Inspired from xlformers: https://github.com/fairinternal/xlformers
+
+    Args:
+        method (str): Method name for init function. Valid options are:
+            'gaussian', 'uniform'.
+        input_dim (int): Input dimension of the initialized module.
+        init_depth (Optional[int]): Optional init depth value used to rescale
+            the standard deviation if defined.
+    """
+    # Compute std
+    std = 1 / math.sqrt(input_dim)
+    # Rescale with depth
+    if init_depth is not None:
+        std = std / math.sqrt(2 * init_depth)
+
+    if method == 'gaussian':
+        return partial(
+            torch.nn.init.trunc_normal_, mean=0.0, std=std, a=-3 * std, b=3 * std
+        )
+    elif method == 'uniform':
+        bound = math.sqrt(3) * std  # ensure the standard deviation is `std`
+        return partial(torch.nn.init.uniform_, a=-bound, b=bound)
+    else:
+        raise ValueError("Unsupported layer initialization method")
+
+
+def init_layer(m: nn.Module,
+               method: str,
+               init_depth: tp.Optional[int] = None,
+               zero_bias_init: bool = False):
+    """Wrapper around ``get_init_fn`` for proper initialization of LM modules.
+
+    Args:
+        m (nn.Module): Module to initialize.
+        method (str): Method name for the init function.
+        init_depth (Optional[int]): Optional init depth value used to rescale
+            the standard deviation if defined.
+        zero_bias_init (bool): Whether to initialize the bias to 0 or not.
+    """
+    if isinstance(m, nn.Linear):
+        init_fn = get_init_fn(method, m.in_features, init_depth=init_depth)
+        if m.weight.device.type == 'cpu' and m.weight.dtype == torch.float16:
+            weight = m.weight.float()
+            init_fn(weight)
+            m.weight.data[:] = weight.half()
+        else:
+            init_fn(m.weight)
+        if zero_bias_init and m.bias is not None:
+            nn.init.constant_(m.bias, 0)
+    elif isinstance(m, nn.Embedding):
+        init_fn = get_init_fn(method, m.embedding_dim, init_depth=None)
+        if m.weight.device.type == 'cpu' and m.weight.dtype == torch.float16:
+            weight = m.weight.float()
+            init_fn(weight)
+            m.weight.data[:] = weight.half()
+        else:
+            init_fn(m.weight)
+
+
+class ScaledEmbedding(nn.Embedding):
+    """Boost learning rate for embeddings (with `scale`).
+    """
+    def __init__(self, *args, lr=None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.lr = lr
+
+    def make_optim_group(self):
+        group = {"params": list(self.parameters())}
+        if self.lr is not None:
+            group["lr"] = self.lr
+        return group
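+
+
+# Illustrative use of the embedding-specific learning rate (arbitrary values):
+# the embedding parameters form their own optimizer param group with lr=1e-3,
+# while the optimizer default (1e-4 here) applies to any other parameters.
+_example_emb = ScaledEmbedding(2048, 128, lr=1e-3)
+_example_opt = torch.optim.AdamW([_example_emb.make_optim_group()], lr=1e-4)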
+
+
+@dataclass
+class LMOutput:
+    # The logits are already re-aligned with the input codes
+    # hence no extra shift is required, e.g. when computing CE
+    logits: torch.Tensor  # [B, K, T, card]
+    mask: torch.Tensor  # [B, K, T]
+
+
+class LMModel(StreamingModule):
+    """Transformer-based language model on multiple streams of codes.
+
+    Args:
+        pattern_provider (CodebooksPatternProvider): Pattern provider for codebook interleaving.
+        condition_provider (MusicConditioningProvider): Conditioning provider from metadata.
+        fuser (ConditionFuser): Fuser handling the fusing of conditions with language model input.
+        n_q (int): Number of parallel streams to model.
+        card (int): Cardinality, vocabulary size.
+        dim (int): Dimension of the transformer encoder.
+        num_heads (int): Number of heads for the transformer encoder.
+        hidden_scale (int): Scale for hidden feed forward dimension of the transformer encoder.
+        norm (str): Normalization method.
+        norm_first (bool): Use pre-norm instead of post-norm.
+        emb_lr (Optional[float]): Embedding-specific learning rate.
+        bias_proj (bool): Use bias for output projections.
+        weight_init (Optional[str]): Method for weight initialization.
+        depthwise_init (Optional[str]): Method for depthwise weight initialization.
+        zero_bias_init (bool): If true and bias in Linears, initialize bias to zeros.
+        cfg_dropout (float): Classifier-free guidance dropout.
+        cfg_coef (float): Classifier-free guidance coefficient.
+        attribute_dropout (dict): Attribute dropout probabilities.
+        two_step_cfg (bool): Whether to run classifier-free guidance with 2 distinct steps.
+        **kwargs: Additional parameters for the transformer encoder.
+    """
+    def __init__(self, pattern_provider: CodebooksPatternProvider, condition_provider: ConditioningProvider,
+                 fuser: ConditionFuser, n_q: int = 8, card: int = 1024, dim: int = 128, num_heads: int = 8,
+                 hidden_scale: int = 4, norm: str = 'layer_norm', norm_first: bool = False,
+                 emb_lr: tp.Optional[float] = None, bias_proj: bool = True,
+                 weight_init: tp.Optional[str] = None, depthwise_init: tp.Optional[str] = None,
+                 zero_bias_init: bool = False, cfg_dropout: float = 0, cfg_coef: float = 1.0,
+                 attribute_dropout: tp.Dict[str, tp.Dict[str, float]] = {}, two_step_cfg: bool = False,
+                 **kwargs):
+        super().__init__()
+        self.cfg_coef = cfg_coef
+        self.cfg_dropout = ClassifierFreeGuidanceDropout(p=cfg_dropout)
+        self.att_dropout = AttributeDropout(p=attribute_dropout)
+        self.condition_provider = condition_provider
+        self.fuser = fuser
+        self.card = card
+        embed_dim = self.card + 1
+        self.n_q = n_q
+        self.dim = dim
+        self.pattern_provider = pattern_provider
+        self.two_step_cfg = two_step_cfg
+        self.emb = nn.ModuleList([ScaledEmbedding(embed_dim, dim, lr=emb_lr) for _ in range(n_q)])
+        if 'activation' in kwargs:
+            kwargs['activation'] = get_activation_fn(kwargs['activation'])
+        self.transformer = StreamingTransformer(
+            d_model=dim, num_heads=num_heads, dim_feedforward=int(hidden_scale * dim),
+            norm=norm, norm_first=norm_first, **kwargs)
+        self.out_norm: tp.Optional[nn.Module] = None
+        if norm_first:
+            self.out_norm = create_norm_fn(norm, dim)
+        self.linears = nn.ModuleList([nn.Linear(dim, self.card, bias=bias_proj) for _ in range(n_q)])
+        self._init_weights(weight_init, depthwise_init, zero_bias_init)
+        self._fsdp: tp.Optional[nn.Module]
+        self.__dict__['_fsdp'] = None
+
+    def _init_weights(self, weight_init: tp.Optional[str], depthwise_init: tp.Optional[str], zero_bias_init: bool):
+        """Initialization of the transformer module weights.
+
+        Args:
+            weight_init (Optional[str]): Weight initialization strategy. See ``get_init_fn`` for valid options.
+            depthwise_init (Optional[str]): Depthwise initialization strategy. The following options are valid:
+                'current' where the depth corresponds to the current layer index or 'global' where the total number
+                of layers is used as depth. If not set, no depthwise initialization strategy is used.
+            zero_bias_init (bool): Whether to initialize bias to zero or not.
+        """
+        assert depthwise_init is None or depthwise_init in ['current', 'global']
+        assert depthwise_init is None or weight_init is not None, \
+            "If 'depthwise_init' is defined, a 'weight_init' method should be provided."
+        assert not zero_bias_init or weight_init is not None, \
+            "If 'zero_bias_init', a 'weight_init' method should be provided"
+
+        if weight_init is None:
+            return
+
+        for emb_layer in self.emb:
+            init_layer(emb_layer, method=weight_init, init_depth=None, zero_bias_init=zero_bias_init)
+
+        for layer_idx, tr_layer in enumerate(self.transformer.layers):
+            depth = None
+            if depthwise_init == 'current':
+                depth = layer_idx + 1
+            elif depthwise_init == 'global':
+                depth = len(self.transformer.layers)
+            init_fn = partial(init_layer, method=weight_init, init_depth=depth, zero_bias_init=zero_bias_init)
+            tr_layer.apply(init_fn)
+
+        for linear in self.linears:
+            init_layer(linear, method=weight_init, init_depth=None, zero_bias_init=zero_bias_init)
+
+    @property
+    def special_token_id(self) -> int:
+        return self.card
+
+    @property
+    def num_codebooks(self) -> int:
+        return self.n_q
+
+    def forward(self, sequence: torch.Tensor,
+                conditions: tp.List[ConditioningAttributes],
+                condition_tensors: tp.Optional[ConditionTensors] = None) -> torch.Tensor:
+        """Apply language model on sequence and conditions.
+        Given a sequence tensor of shape [B, K, S] with K the number of codebooks and
+        S the sequence steps, return the logits with shape [B, K, S, card].
+
+        Args:
+            sequence (torch.Tensor): Indices of the codes to model, of shape [B, K, S].
+            conditions (list[ConditioningAttributes]): conditionings to use when modeling
+                the given codes. Note that when evaluating multiple times with the same conditioning
+                you should pre-compute those and pass them as `condition_tensors`.
+            condition_tensors (dict[str, ConditionType] or None): pre-computed conditioning
+                tensors, see `conditions`.
+        Returns:
+            torch.Tensor: Logits.
+        """
+        B, K, S = sequence.shape
+        assert K == self.num_codebooks, 'Sequence shape must match the specified number of codebooks'
+        input_ = sum([self.emb[k](sequence[:, k]) for k in range(K)])
+        if condition_tensors is None:
+            assert not self._is_streaming, "Conditions tensors should be precomputed when streaming."
+            # apply dropout modules
+            conditions = self.cfg_dropout(conditions)
+            conditions = self.att_dropout(conditions)
+            tokenized = self.condition_provider.tokenize(conditions)
+            # encode conditions and fuse, both have a streaming cache to not recompute when generating.
+            condition_tensors = self.condition_provider(tokenized)
+        else:
+            assert not conditions, "Shouldn't pass both conditions and condition_tensors."
+
+        input_, cross_attention_input = self.fuser(input_, condition_tensors)
+
+        out = self.transformer(input_, cross_attention_src=cross_attention_input)
+        if self.out_norm:
+            out = self.out_norm(out)
+        logits = torch.stack([self.linears[k](out) for k in range(K)], dim=1)  # [B, K, S, card]
+
+        # remove the prefix from the model outputs
+        if len(self.fuser.fuse2cond['prepend']) > 0:
+            logits = logits[:, :, -S:]
+
+        return logits  # [B, K, S, card]
+
+    def compute_predictions(
+            self, codes: torch.Tensor,
+            conditions: tp.List[ConditioningAttributes],
+            condition_tensors: tp.Optional[ConditionTensors] = None) -> LMOutput:
+        """Given an input tensor of codes [B, K, T] and list of conditions, runs the model
+        forward using the specified codes interleaving pattern.
+
+        Args:
+            codes (torch.Tensor): Input codes of shape [B, K, T] with B the batch size,
+                K the number of codebooks and T the number of timesteps.
+            conditions (list[ConditioningAttributes]): conditionings to use when modeling
+                the given codes. Note that when evaluating multiple times with the same conditioning
+                you should pre-compute those and pass them as `condition_tensors`.
+            condition_tensors (dict[str, ConditionType] or None): pre-computed conditioning
+                tensors, see `conditions`.
+        Returns:
+            LMOutput: Language model outputs
+                logits (torch.Tensor) of shape [B, K, T, card] corresponding to the provided codes,
+                    i.e. the first item corresponds to logits to predict the first code, meaning that
+                    no additional shifting of codes and logits is required.
+                mask (torch.Tensor) of shape [B, K, T], mask over valid and invalid positions.
+                    Given the specified interleaving strategies, parts of the logits and codes should
+                    not be considered as valid predictions because of invalid context.
+        """
+        B, K, T = codes.shape
+        codes = codes.contiguous()
+        # map codes [B, K, T] into pattern sequence [B, K, S] using special_token_id for masked tokens
+        pattern = self.pattern_provider.get_pattern(T)
+        sequence_codes, sequence_indexes, sequence_mask = pattern.build_pattern_sequence(
+            codes, self.special_token_id, keep_only_valid_steps=True
+        )
+        # apply model on pattern sequence
+        model = self if self._fsdp is None else self._fsdp
+        logits = model(sequence_codes, conditions, condition_tensors)  # [B, K, S, card]
+        # map back the logits on pattern sequence to logits on original codes: [B, K, S, card] -> [B, K, T, card]
+        # and provide the corresponding mask over invalid positions of tokens
+        logits = logits.permute(0, 3, 1, 2)  # [B, card, K, S]
+        # note: we use nans as special token to make it obvious if we feed unexpected logits
+        logits, logits_indexes, logits_mask = pattern.revert_pattern_logits(
+            logits, float('nan'), keep_only_valid_steps=True
+        )
+        logits = logits.permute(0, 2, 3, 1)  # [B, K, T, card]
+        logits_mask = logits_mask[None, :, :].expand(B, -1, -1)  # [K, T] -> [B, K, T]
+        return LMOutput(logits, logits_mask)
+
+    def _sample_next_token(self,
+                           sequence: torch.Tensor,
+                           cfg_conditions: CFGConditions,
+                           unconditional_state: State,
+                           use_sampling: bool = False,
+                           temp: float = 1.0,
+                           top_k: int = 0,
+                           top_p: float = 0.0,
+                           cfg_coef: tp.Optional[float] = None) -> torch.Tensor:
+        """Sample next token from the model given a sequence and a set of conditions. The model supports
+        multiple sampling strategies (greedy sampling, softmax, top-k, top-p...).
+
+        Args:
+            sequence (torch.Tensor): Current sequence of shape [B, K, S]
+                with K corresponding to the number of codebooks and S the number of sequence steps.
+                S = 1 in streaming mode, except for the first step that contains a bigger prompt.
+            cfg_conditions (Dict[str, ConditionType]): Set of conditions. If CFG is used,
+                should be twice the batch size, being the concatenation of the conditions + null conditions.
+            use_sampling (bool): Whether to use a sampling strategy or not.
+            temp (float): Sampling temperature.
+            top_k (int): K for "top-k" sampling.
+            top_p (float): P for "top-p" sampling.
+            cfg_coef (float): classifier free guidance coefficient
+        Returns:
+            next_token (torch.Tensor): Next token tensor of shape [B, K, 1].
+        """
+        B = sequence.shape[0]
+        cfg_coef = self.cfg_coef if cfg_coef is None else cfg_coef
+        model = self if self._fsdp is None else self._fsdp
+        if self.two_step_cfg and cfg_conditions != {}:
+            assert isinstance(cfg_conditions, tuple)
+            condition_tensors, null_condition_tensors = cfg_conditions
+            cond_logits = model(sequence, conditions=[], condition_tensors=condition_tensors)
+            state = self.get_streaming_state()
+            self.set_streaming_state(unconditional_state)
+            uncond_logits = model(sequence, conditions=[], condition_tensors=null_condition_tensors)
+            unconditional_state.update(self.get_streaming_state())
+            self.set_streaming_state(state)
+            logits = uncond_logits + (cond_logits - uncond_logits) * self.cfg_coef
+        else:
+            assert isinstance(cfg_conditions, dict)
+            condition_tensors = cfg_conditions
+            if condition_tensors:
+                # Preparing for CFG, predicting both conditional and unconditional logits.
+                sequence = torch.cat([sequence, sequence], dim=0)
+            all_logits = model(
+                sequence,
+                conditions=[], condition_tensors=condition_tensors)
+            if condition_tensors:
+                cond_logits, uncond_logits = all_logits.split(B, dim=0)  # [B, K, T, card]
+                logits = uncond_logits + (cond_logits - uncond_logits) * cfg_coef
+            else:
+                logits = all_logits
+
+        logits = logits.permute(0, 1, 3, 2)  # [B, K, card, T]
+        logits = logits[..., -1]  # [B x K x card]
+
+        # Apply softmax for sampling if temp > 0. Else, do greedy sampling to avoid zero division error.
+        if use_sampling and temp > 0.0:
+            probs = torch.softmax(logits / temp, dim=-1)
+            if top_p > 0.0:
+                next_token = utils.sample_top_p(probs, p=top_p)
+            elif top_k > 0:
+                next_token = utils.sample_top_k(probs, k=top_k)
+            else:
+                next_token = utils.multinomial(probs, num_samples=1)
+        else:
+            next_token = torch.argmax(logits, dim=-1, keepdim=True)
+
+        return next_token
+
+    @torch.no_grad()
+    def generate(self,
+                 prompt: tp.Optional[torch.Tensor] = None,
+                 conditions: tp.List[ConditioningAttributes] = [],
+                 num_samples: tp.Optional[int] = None,
+                 max_gen_len: int = 256,
+                 use_sampling: bool = True,
+                 temp: float = 1.0,
+                 top_k: int = 250,
+                 top_p: float = 0.0,
+                 cfg_coef: tp.Optional[float] = None,
+                 two_step_cfg: tp.Optional[bool] = None,
+                 remove_prompts: bool = False,
+                 check: bool = False,
+                 callback: tp.Optional[tp.Callable[[int, int], None]] = None) -> torch.Tensor:
+        """Generate tokens sampling from the model given a prompt or unconditionally. Generation can
+        be performed in a greedy fashion or using sampling with top K and top P strategies.
+
+        Args:
+            prompt (Optional[torch.Tensor]): Prompt tokens of shape [B, K, T].
+            conditions (list[ConditioningAttributes]): List of conditions (empty for unconditional generation).
+            num_samples (int or None): Number of samples to generate when no prompt and no conditions are given.
+            max_gen_len (int): Maximum generation length.
+            use_sampling (bool): Whether to use a sampling strategy or not.
+            temp (float): Sampling temperature.
+            top_k (int): K for "top-k" sampling.
+            top_p (float): P for "top-p" sampling.
+            remove_prompts (bool): Whether to remove prompts from generation or not.
+        Returns:
+            torch.Tensor: Generated tokens.
+        """
+        assert not self.training, "generation shouldn't be used in training mode."
+        first_param = next(iter(self.parameters()))
+        device = first_param.device
+
+        # Checking all input shapes are consistent.
+        possible_num_samples = []
+        if num_samples is not None:
+            possible_num_samples.append(num_samples)
+        elif prompt is not None:
+            possible_num_samples.append(prompt.shape[0])
+        elif conditions:
+            possible_num_samples.append(len(conditions))
+        else:
+            possible_num_samples.append(1)
+        assert all(x == possible_num_samples[0] for x in possible_num_samples), "Inconsistent input shapes"
+        num_samples = possible_num_samples[0]
+
+        # below we create two sets of conditions: one conditional and one unconditional
+        # to do that we merge the regular condition together with the null condition
+        # we then do 1 forward pass instead of 2.
+        # the reason for that is two-fold:
+        # 1. it is about x2 faster than doing 2 forward passes
+        # 2. avoid the streaming API treating the 2 passes as part of different time steps
+        # We also support doing two different passes, in particular to ensure that
+        # the padding structure is exactly the same between train and test.
+        # With a batch size of 1, this can be slower though.
+        cfg_conditions: CFGConditions
+        two_step_cfg = self.two_step_cfg if two_step_cfg is None else two_step_cfg
+        if conditions:
+            null_conditions = ClassifierFreeGuidanceDropout(p=1.0)(conditions)
+            if two_step_cfg:
+                cfg_conditions = (
+                    self.condition_provider(self.condition_provider.tokenize(conditions)),
+                    self.condition_provider(self.condition_provider.tokenize(null_conditions)),
+                )
+            else:
+                conditions = conditions + null_conditions
+                tokenized = self.condition_provider.tokenize(conditions)
+                cfg_conditions = self.condition_provider(tokenized)
+        else:
+            cfg_conditions = {}
+
+        if prompt is None:
+            assert num_samples > 0
+            prompt = torch.zeros((num_samples, self.num_codebooks, 0), dtype=torch.long, device=device)
+
+        B, K, T = prompt.shape
+        start_offset = T
+        assert start_offset < max_gen_len
+
+        pattern = self.pattern_provider.get_pattern(max_gen_len)
+        # this token is used as default value for codes that are not generated yet
+        unknown_token = -1
+
+        # we generate codes up to the max_gen_len that will be mapped to the pattern sequence
+        gen_codes = torch.full((B, K, max_gen_len), unknown_token, dtype=torch.long, device=device)
+        # filling the gen_codes with the prompt if needed
+        gen_codes[..., :start_offset] = prompt
+        # create the gen_sequence with proper interleaving from the pattern: [B, K, S]
+        gen_sequence, indexes, mask = pattern.build_pattern_sequence(gen_codes, self.special_token_id)
+        # retrieve the start_offset in the sequence:
+        # it is the first sequence step that contains the `start_offset` timestep
+        start_offset_sequence = pattern.get_first_step_with_timesteps(start_offset)
+        assert start_offset_sequence is not None
+
+        with self.streaming():
+            unconditional_state = self.get_streaming_state()
+            prev_offset = 0
+            gen_sequence_len = gen_sequence.shape[-1]  # gen_sequence shape is [B, K, S]
+            for offset in range(start_offset_sequence, gen_sequence_len):
+                # get current sequence (note that the streaming API is providing the caching over previous offsets)
+                curr_sequence = gen_sequence[..., prev_offset:offset]
+                curr_mask = mask[None, ..., prev_offset:offset].expand(B, -1, -1)
+                if check:
+                    # check coherence between mask and sequence
+                    assert (curr_sequence == torch.where(curr_mask, curr_sequence, self.special_token_id)).all()
+                    # should never happen as gen_sequence is filled progressively
+                    assert not (curr_sequence == unknown_token).any()
+                # sample next token from the model, next token shape is [B, K, 1]
+                next_token = self._sample_next_token(
+                    curr_sequence, cfg_conditions, unconditional_state, use_sampling, temp, top_k, top_p,
+                    cfg_coef=cfg_coef)
+                # ensure the tokens that should be masked are properly set to special_token_id
+                # as the model never outputs special_token_id
+                valid_mask = mask[..., offset:offset+1].expand(B, -1, -1)
+                next_token[~valid_mask] = self.special_token_id
+                # ensure we don't overwrite prompt tokens, we only write over unknown tokens
+                # (then mask tokens should be left as is as well, which is correct)
+                gen_sequence[..., offset:offset+1] = torch.where(
+                    gen_sequence[..., offset:offset+1] == unknown_token,
+                    next_token, gen_sequence[..., offset:offset+1]
+                )
+                prev_offset = offset
+                if callback is not None:
+                    callback(1 + offset - start_offset_sequence, gen_sequence_len - start_offset_sequence)
+        unconditional_state.clear()
+
+        # ensure sequence has been entirely filled
+        assert not (gen_sequence == unknown_token).any()
+        # ensure gen_sequence pattern and mask are matching
+        # which means the gen_sequence is valid according to the pattern
+        assert (
+            gen_sequence == torch.where(mask[None, ...].expand(B, -1, -1), gen_sequence, self.special_token_id)
+        ).all()
+        # get back the codes, trimming the prompt if needed and cutting potentially incomplete timesteps
+        out_codes, out_indexes, out_mask = pattern.revert_pattern_sequence(gen_sequence, special_token=unknown_token)
+
+        # sanity checks over the returned codes and corresponding masks
+        assert (out_codes[..., :max_gen_len] != unknown_token).all()
+        assert (out_mask[..., :max_gen_len] == 1).all()
+
+        out_start_offset = start_offset if remove_prompts else 0
+        out_codes = out_codes[..., out_start_offset:max_gen_len]
+
+        # ensure the returned codes are all valid
+        assert (out_codes >= 0).all() and (out_codes <= self.card).all()
+        return out_codes
+
+
+
+
+
+
+
+

Functions

+
+
+def get_init_fn(method: str, input_dim: int, init_depth: Optional[int] = None) +
+
+

LM layer initialization. +Inspired by xlformers: https://github.com/fairinternal/xlformers

+

Args

+
+
method : str
+
Method name for init function. Valid options are: +'gaussian', 'uniform'.
+
input_dim : int
+
Input dimension of the initialized module.
+
init_depth : Optional[int]
+
Optional init depth value used to rescale +the standard deviation if defined.
+
+
+ +Expand source code + +
def get_init_fn(method: str, input_dim: int, init_depth: tp.Optional[int] = None):
+    """LM layer initialization.
+    Inspired by xlformers: https://github.com/fairinternal/xlformers
+
+    Args:
+        method (str): Method name for init function. Valid options are:
+            'gaussian', 'uniform'.
+        input_dim (int): Input dimension of the initialized module.
+        init_depth (Optional[int]): Optional init depth value used to rescale
+            the standard deviation if defined.
+    """
+    # Compute std
+    std = 1 / math.sqrt(input_dim)
+    # Rescale with depth
+    if init_depth is not None:
+        std = std / math.sqrt(2 * init_depth)
+
+    if method == 'gaussian':
+        return partial(
+            torch.nn.init.trunc_normal_, mean=0.0, std=std, a=-3 * std, b=3 * std
+        )
+    elif method == 'uniform':
+        bound = math.sqrt(3) * std  # ensure the standard deviation is `std`
+        return partial(torch.nn.init.uniform_, a=-bound, b=bound)
+    else:
+        raise ValueError("Unsupported layer initialization method")
+
+
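+
+A minimal usage sketch (the layer, its dimensions and the depth value are made up for the example; the import path audiocraft.models.lm is assumed):
+
+from torch import nn
+
+from audiocraft.models.lm import get_init_fn  # module path assumed for this example
+
+# Hypothetical 512 -> 1024 projection sitting at depth 12 of some transformer.
+linear = nn.Linear(512, 1024)
+init_fn = get_init_fn('gaussian', input_dim=linear.in_features, init_depth=12)
+init_fn(linear.weight)       # in-place truncated normal, std = 1 / sqrt(512 * 2 * 12)
+nn.init.zeros_(linear.bias)
+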
+
+def init_layer(m: torch.nn.modules.module.Module, method: str, init_depth: Optional[int] = None, zero_bias_init: bool = False) +
+
+

Wrapper around get_init_fn() for proper initialization of LM modules.

+

Args

+
+
m : nn.Module
+
Module to initialize.
+
method : str
+
Method name for the init function.
+
init_depth : Optional[int]
+
Optional init depth value used to rescale +the standard deviation if defined.
+
zero_bias_init : bool
+
Whether to initialize the bias to 0 or not.
+
+
+ +Expand source code + +
def init_layer(m: nn.Module,
+               method: str,
+               init_depth: tp.Optional[int] = None,
+               zero_bias_init: bool = False):
+    """Wrapper around ``get_init_fn`` for proper initialization of LM modules.
+
+    Args:
+        m (nn.Module): Module to initialize.
+        method (str): Method name for the init function.
+        init_depth (Optional[int]): Optional init depth value used to rescale
+            the standard deviation if defined.
+        zero_bias_init (bool): Whether to initialize the bias to 0 or not.
+    """
+    if isinstance(m, nn.Linear):
+        init_fn = get_init_fn(method, m.in_features, init_depth=init_depth)
+        if m.weight.device.type == 'cpu' and m.weight.dtype == torch.float16:
+            weight = m.weight.float()
+            init_fn(weight)
+            m.weight.data[:] = weight.half()
+        else:
+            init_fn(m.weight)
+        if zero_bias_init and m.bias is not None:
+            nn.init.constant_(m.bias, 0)
+    elif isinstance(m, nn.Embedding):
+        init_fn = get_init_fn(method, m.embedding_dim, init_depth=None)
+        if m.weight.device.type == 'cpu' and m.weight.dtype == torch.float16:
+            weight = m.weight.float()
+            init_fn(weight)
+            m.weight.data[:] = weight.half()
+        else:
+            init_fn(m.weight)
+
+
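+
+A companion sketch, mirroring how LMModel._init_weights applies this helper with functools.partial and Module.apply (toy block and depth value; the import path audiocraft.models.lm is assumed):
+
+from functools import partial
+
+from torch import nn
+
+from audiocraft.models.lm import init_layer  # module path assumed for this example
+
+# Toy block; children that are neither Linear nor Embedding are simply ignored by init_layer.
+block = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 64))
+block.apply(partial(init_layer, method='uniform', init_depth=2, zero_bias_init=True))
+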
+
+
+
+

Classes

+
+
+class LMModel +(pattern_provider: CodebooksPatternProvider, condition_provider: ConditioningProvider, fuser: ConditionFuser, n_q: int = 8, card: int = 1024, dim: int = 128, num_heads: int = 8, hidden_scale: int = 4, norm: str = 'layer_norm', norm_first: bool = False, emb_lr: Optional[float] = None, bias_proj: bool = True, weight_init: Optional[str] = None, depthwise_init: Optional[str] = None, zero_bias_init: bool = False, cfg_dropout: float = 0, cfg_coef: float = 1.0, attribute_dropout: Dict[str, Dict[str, float]] = {}, two_step_cfg: bool = False, **kwargs) +
+
+

Transformer-based language model on multiple streams of codes.

+

Args

+
+
pattern_provider : CodebooksPatternProvider
+
Pattern provider for codebook interleaving.
+
condition_provider : ConditioningProvider
+
Conditioning provider from metadata.
+
fuser : ConditionFuser
+
Fuser handling the fusing of conditions with language model input.
+
n_q : int
+
Number of parallel streams to model.
+
card : int
+
Cardinality, vocabulary size.
+
dim : int
+
Dimension of the transformer encoder.
+
num_heads : int
+
Number of heads for the transformer encoder.
+
hidden_scale : int
+
Scale for hidden feed forward dimension of the transformer encoder.
+
norm : str
+
Normalization method.
+
norm_first : bool
+
Use pre-norm instead of post-norm.
+
emb_lr : Optional[float]
+
Embedding-specific learning rate.
+
bias_proj : bool
+
Use bias for output projections.
+
weight_init : Optional[str]
+
Method for weight initialization.
+
depthwise_init : Optional[str]
+
Method for depthwise weight initialization.
+
zero_bias_init : bool
+
If true and the Linear layers have a bias, initialize the bias to zeros.
+
cfg_dropout : float
+
Classifier-free guidance dropout.
+
cfg_coef : float
+
Classifier-free guidance coefficient.
+
attribute_dropout : dict
+
Attribute dropout probabilities.
+
two_step_cfg : bool
+
Whether to run classifier-free guidance with 2 distinct steps.
+
**kwargs
+
Additional parameters for the transformer encoder.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class LMModel(StreamingModule):
+    """Transformer-based language model on multiple streams of codes.
+
+    Args:
+        pattern_provider (CodebooksPatternProvider): Pattern provider for codebook interleaving.
+        condition_provider (ConditioningProvider): Conditioning provider from metadata.
+        fuser (ConditionFuser): Fuser handling the fusing of conditions with language model input.
+        n_q (int): Number of parallel streams to model.
+        card (int): Cardinality, vocabulary size.
+        dim (int): Dimension of the transformer encoder.
+        num_heads (int): Number of heads for the transformer encoder.
+        hidden_scale (int): Scale for hidden feed forward dimension of the transformer encoder.
+        norm (str): Normalization method.
+        norm_first (bool): Use pre-norm instead of post-norm.
+        emb_lr (Optional[float]): Embedding-specific learning rate.
+        bias_proj (bool): Use bias for output projections.
+        weight_init (Optional[str]): Method for weight initialization.
+        depthwise_init (Optional[str]): Method for depthwise weight initialization.
+        zero_bias_init (bool): If true and the Linear layers have a bias, initialize the bias to zeros.
+        cfg_dropout (float): Classifier-free guidance dropout.
+        cfg_coef (float): Classifier-free guidance coefficient.
+        attribute_dropout (dict): Attribute dropout probabilities.
+        two_step_cfg (bool): Whether to run classifier-free guidance with 2 distinct steps.
+        **kwargs: Additional parameters for the transformer encoder.
+    """
+    def __init__(self, pattern_provider: CodebooksPatternProvider, condition_provider: ConditioningProvider,
+                 fuser: ConditionFuser, n_q: int = 8, card: int = 1024, dim: int = 128, num_heads: int = 8,
+                 hidden_scale: int = 4, norm: str = 'layer_norm', norm_first: bool = False,
+                 emb_lr: tp.Optional[float] = None, bias_proj: bool = True,
+                 weight_init: tp.Optional[str] = None, depthwise_init: tp.Optional[str] = None,
+                 zero_bias_init: bool = False, cfg_dropout: float = 0, cfg_coef: float = 1.0,
+                 attribute_dropout: tp.Dict[str, tp.Dict[str, float]] = {}, two_step_cfg: bool = False,
+                 **kwargs):
+        super().__init__()
+        self.cfg_coef = cfg_coef
+        self.cfg_dropout = ClassifierFreeGuidanceDropout(p=cfg_dropout)
+        self.att_dropout = AttributeDropout(p=attribute_dropout)
+        self.condition_provider = condition_provider
+        self.fuser = fuser
+        self.card = card
+        embed_dim = self.card + 1
+        self.n_q = n_q
+        self.dim = dim
+        self.pattern_provider = pattern_provider
+        self.two_step_cfg = two_step_cfg
+        self.emb = nn.ModuleList([ScaledEmbedding(embed_dim, dim, lr=emb_lr) for _ in range(n_q)])
+        if 'activation' in kwargs:
+            kwargs['activation'] = get_activation_fn(kwargs['activation'])
+        self.transformer = StreamingTransformer(
+            d_model=dim, num_heads=num_heads, dim_feedforward=int(hidden_scale * dim),
+            norm=norm, norm_first=norm_first, **kwargs)
+        self.out_norm: tp.Optional[nn.Module] = None
+        if norm_first:
+            self.out_norm = create_norm_fn(norm, dim)
+        self.linears = nn.ModuleList([nn.Linear(dim, self.card, bias=bias_proj) for _ in range(n_q)])
+        self._init_weights(weight_init, depthwise_init, zero_bias_init)
+        self._fsdp: tp.Optional[nn.Module]
+        self.__dict__['_fsdp'] = None
+
+    def _init_weights(self, weight_init: tp.Optional[str], depthwise_init: tp.Optional[str], zero_bias_init: bool):
+        """Initialization of the transformer module weights.
+
+        Args:
+            weight_init (Optional[str]): Weight initialization strategy. See ``get_init_fn`` for valid options.
+            depthwise_init (Optional[str]): Depthwise initialization strategy. The following options are valid:
+                'current' where the depth corresponds to the current layer index or 'global' where the total number
+                of layers is used as the depth. If not set, no depthwise initialization strategy is used.
+            zero_bias_init (bool): Whether to initialize bias to zero or not.
+        """
+        assert depthwise_init is None or depthwise_init in ['current', 'global']
+        assert depthwise_init is None or weight_init is not None, \
+            "If 'depthwise_init' is defined, a 'weight_init' method should be provided."
+        assert not zero_bias_init or weight_init is not None, \
+            "If 'zero_bias_init', a 'weight_init' method should be provided"
+
+        if weight_init is None:
+            return
+
+        for emb_layer in self.emb:
+            init_layer(emb_layer, method=weight_init, init_depth=None, zero_bias_init=zero_bias_init)
+
+        for layer_idx, tr_layer in enumerate(self.transformer.layers):
+            depth = None
+            if depthwise_init == 'current':
+                depth = layer_idx + 1
+            elif depthwise_init == 'global':
+                depth = len(self.transformer.layers)
+            init_fn = partial(init_layer, method=weight_init, init_depth=depth, zero_bias_init=zero_bias_init)
+            tr_layer.apply(init_fn)
+
+        for linear in self.linears:
+            init_layer(linear, method=weight_init, init_depth=None, zero_bias_init=zero_bias_init)
+
+    @property
+    def special_token_id(self) -> int:
+        return self.card
+
+    @property
+    def num_codebooks(self) -> int:
+        return self.n_q
+
+    def forward(self, sequence: torch.Tensor,
+                conditions: tp.List[ConditioningAttributes],
+                condition_tensors: tp.Optional[ConditionTensors] = None) -> torch.Tensor:
+        """Apply language model on sequence and conditions.
+        Given a tensor of sequence of shape [B, K, S] with K the number of codebooks and
+        S the sequence steps, return the logits with shape [B, card, K, S].
+
+        Args:
+            sequence (torch.Tensor): Indices of the codes to model.
+            conditions (list[ConditioningAttributes]): conditionings to use when modeling
+                the given codes. Note that when evaluating multiple times with the same conditioning
+                you should pre-compute those and pass them as `condition_tensors`.
+            condition_tensors (dict[str, ConditionType] or None): pre-computed conditioning
+                tensors, see `conditions`.
+        Returns:
+            torch.Tensor: Logits.
+        """
+        B, K, S = sequence.shape
+        assert K == self.num_codebooks, 'Sequence shape must match the specified number of codebooks'
+        input_ = sum([self.emb[k](sequence[:, k]) for k in range(K)])
+        if condition_tensors is None:
+            assert not self._is_streaming, "Conditions tensors should be precomputed when streaming."
+            # apply dropout modules
+            conditions = self.cfg_dropout(conditions)
+            conditions = self.att_dropout(conditions)
+            tokenized = self.condition_provider.tokenize(conditions)
+            # encode conditions and fuse, both have a streaming cache to not recompute when generating.
+            condition_tensors = self.condition_provider(tokenized)
+        else:
+            assert not conditions, "Shouldn't pass both conditions and condition_tensors."
+
+        input_, cross_attention_input = self.fuser(input_, condition_tensors)
+
+        out = self.transformer(input_, cross_attention_src=cross_attention_input)
+        if self.out_norm:
+            out = self.out_norm(out)
+        logits = torch.stack([self.linears[k](out) for k in range(K)], dim=1)  # [B, K, S, card]
+
+        # remove the prefix from the model outputs
+        if len(self.fuser.fuse2cond['prepend']) > 0:
+            logits = logits[:, :, -S:]
+
+        return logits  # [B, K, S, card]
+
+    def compute_predictions(
+            self, codes: torch.Tensor,
+            conditions: tp.List[ConditioningAttributes],
+            condition_tensors: tp.Optional[ConditionTensors] = None) -> LMOutput:
+        """Given an input tensor of codes [B, K, T] and list of conditions, runs the model
+        forward using the specified codes interleaving pattern.
+
+        Args:
+            codes (torch.Tensor): Input codes of shape [B, K, T] with B the batch size,
+                K the number of codebooks and T the number of timesteps.
+            conditions (list[ConditioningAttributes]): conditionings to use when modeling
+                the given codes. Note that when evaluating multiple times with the same conditioning
+                you should pre-compute those and pass them as `condition_tensors`.
+            condition_tensors (dict[str, ConditionType] or None): pre-computed conditioning
+                tensors, see `conditions`.
+        Returns:
+            LMOutput: Language model outputs
+                logits (torch.Tensor) of shape [B, K, T, card] corresponding to the provided codes,
+                    i.e. the first item corresponds to logits to predict the first code, meaning that
+                    no additional shifting of codes and logits is required.
+                mask (torch.Tensor) of shape [B, K, T], mask over valid and invalid positions.
+                    Given the specified interleaving strategies, parts of the logits and codes should
+                    not be considered as valid predictions because of invalid context.
+        """
+        B, K, T = codes.shape
+        codes = codes.contiguous()
+        # map codes [B, K, T] into pattern sequence [B, K, S] using special_token_id for masked tokens
+        pattern = self.pattern_provider.get_pattern(T)
+        sequence_codes, sequence_indexes, sequence_mask = pattern.build_pattern_sequence(
+            codes, self.special_token_id, keep_only_valid_steps=True
+        )
+        # apply model on pattern sequence
+        model = self if self._fsdp is None else self._fsdp
+        logits = model(sequence_codes, conditions, condition_tensors)  # [B, K, S, card]
+        # map back the logits on pattern sequence to logits on original codes: [B, K, S, card] -> [B, K, T, card]
+        # and provide the corresponding mask over invalid positions of tokens
+        logits = logits.permute(0, 3, 1, 2)  # [B, card, K, S]
+        # note: we use nans as special token to make it obvious if we feed unexpected logits
+        logits, logits_indexes, logits_mask = pattern.revert_pattern_logits(
+            logits, float('nan'), keep_only_valid_steps=True
+        )
+        logits = logits.permute(0, 2, 3, 1)  # [B, K, T, card]
+        logits_mask = logits_mask[None, :, :].expand(B, -1, -1)  # [K, T] -> [B, K, T]
+        return LMOutput(logits, logits_mask)
+
+    def _sample_next_token(self,
+                           sequence: torch.Tensor,
+                           cfg_conditions: CFGConditions,
+                           unconditional_state: State,
+                           use_sampling: bool = False,
+                           temp: float = 1.0,
+                           top_k: int = 0,
+                           top_p: float = 0.0,
+                           cfg_coef: tp.Optional[float] = None) -> torch.Tensor:
+        """Sample next token from the model given a sequence and a set of conditions. The model supports
+        multiple sampling strategies (greedy sampling, softmax, top-k, top-p...).
+
+        Args:
+            sequence (torch.Tensor): Current sequence of shape [B, K, S]
+                with K corresponding to the number of codebooks and S the number of sequence steps.
+                S = 1 in streaming mode, except for the first step that contains a bigger prompt.
+            cfg_conditions (CFGConditions): Set of conditions. If CFG is used,
+                should be twice the batch size, being the concatenation of the conditions + null conditions.
+            use_sampling (bool): Whether to use a sampling strategy or not.
+            temp (float): Sampling temperature.
+            top_k (int): K for "top-k" sampling.
+            top_p (float): P for "top-p" sampling.
+            cfg_coef (float): Classifier-free guidance coefficient.
+        Returns:
+            next_token (torch.Tensor): Next token tensor of shape [B, K, 1].
+        """
+        B = sequence.shape[0]
+        cfg_coef = self.cfg_coef if cfg_coef is None else cfg_coef
+        model = self if self._fsdp is None else self._fsdp
+        if self.two_step_cfg and cfg_conditions != {}:
+            assert isinstance(cfg_conditions, tuple)
+            condition_tensors, null_condition_tensors = cfg_conditions
+            cond_logits = model(sequence, conditions=[], condition_tensors=condition_tensors)
+            state = self.get_streaming_state()
+            self.set_streaming_state(unconditional_state)
+            uncond_logits = model(sequence, conditions=[], condition_tensors=null_condition_tensors)
+            unconditional_state.update(self.get_streaming_state())
+            self.set_streaming_state(state)
+            logits = uncond_logits + (cond_logits - uncond_logits) * cfg_coef
+        else:
+            assert isinstance(cfg_conditions, dict)
+            condition_tensors = cfg_conditions
+            if condition_tensors:
+                # Preparing for CFG, predicting both conditional and unconditional logits.
+                sequence = torch.cat([sequence, sequence], dim=0)
+            all_logits = model(
+                sequence,
+                conditions=[], condition_tensors=condition_tensors)
+            if condition_tensors:
+                cond_logits, uncond_logits = all_logits.split(B, dim=0)  # [B, K, T, card]
+                logits = uncond_logits + (cond_logits - uncond_logits) * cfg_coef
+            else:
+                logits = all_logits
+
+        logits = logits.permute(0, 1, 3, 2)  # [B, K, card, T]
+        logits = logits[..., -1]  # [B, K, card]
+
+        # Apply softmax for sampling if temp > 0. Else, do greedy sampling to avoid zero division error.
+        if use_sampling and temp > 0.0:
+            probs = torch.softmax(logits / temp, dim=-1)
+            if top_p > 0.0:
+                next_token = utils.sample_top_p(probs, p=top_p)
+            elif top_k > 0:
+                next_token = utils.sample_top_k(probs, k=top_k)
+            else:
+                next_token = utils.multinomial(probs, num_samples=1)
+        else:
+            next_token = torch.argmax(logits, dim=-1, keepdim=True)
+
+        return next_token
+
+    @torch.no_grad()
+    def generate(self,
+                 prompt: tp.Optional[torch.Tensor] = None,
+                 conditions: tp.List[ConditioningAttributes] = [],
+                 num_samples: tp.Optional[int] = None,
+                 max_gen_len: int = 256,
+                 use_sampling: bool = True,
+                 temp: float = 1.0,
+                 top_k: int = 250,
+                 top_p: float = 0.0,
+                 cfg_coef: tp.Optional[float] = None,
+                 two_step_cfg: tp.Optional[bool] = None,
+                 remove_prompts: bool = False,
+                 check: bool = False,
+                 callback: tp.Optional[tp.Callable[[int, int], None]] = None) -> torch.Tensor:
+        """Generate tokens sampling from the model given a prompt or unconditionally. Generation can
+        be performed in a greedy fashion or using sampling with top K and top P strategies.
+
+        Args:
+            prompt (Optional[torch.Tensor]): Prompt tokens of shape [B, K, T].
+            conditions (list[ConditioningAttributes]): List of conditions (empty for unconditional generation).
+            num_samples (int or None): Number of samples to generate when no prompt and no conditions are given.
+            max_gen_len (int): Maximum generation length.
+            use_sampling (bool): Whether to use a sampling strategy or not.
+            temp (float): Sampling temperature.
+            top_k (int): K for "top-k" sampling.
+            top_p (float): P for "top-p" sampling.
+            remove_prompts (bool): Whether to remove prompts from generation or not.
+        Returns:
+            torch.Tensor: Generated tokens.
+        """
+        assert not self.training, "generation shouldn't be used in training mode."
+        first_param = next(iter(self.parameters()))
+        device = first_param.device
+
+        # Checking all input shapes are consistent.
+        possible_num_samples = []
+        if num_samples is not None:
+            possible_num_samples.append(num_samples)
+        elif prompt is not None:
+            possible_num_samples.append(prompt.shape[0])
+        elif conditions:
+            possible_num_samples.append(len(conditions))
+        else:
+            possible_num_samples.append(1)
+        assert all(x == possible_num_samples[0] for x in possible_num_samples), "Inconsistent input shapes"
+        num_samples = possible_num_samples[0]
+
+        # below we create two sets of conditions: one conditional and one unconditional
+        # to do that we merge the regular condition together with the null condition
+        # we then do 1 forward pass instead of 2.
+        # the reason for that is two-fold:
+        # 1. it is about x2 faster than doing 2 forward passes
+        # 2. avoid the streaming API treating the 2 passes as part of different time steps
+        # We also support doing two different passes, in particular to ensure that
+        # the padding structure is exactly the same between train and test.
+        # With a batch size of 1, this can be slower though.
+        cfg_conditions: CFGConditions
+        two_step_cfg = self.two_step_cfg if two_step_cfg is None else two_step_cfg
+        if conditions:
+            null_conditions = ClassifierFreeGuidanceDropout(p=1.0)(conditions)
+            if two_step_cfg:
+                cfg_conditions = (
+                    self.condition_provider(self.condition_provider.tokenize(conditions)),
+                    self.condition_provider(self.condition_provider.tokenize(null_conditions)),
+                )
+            else:
+                conditions = conditions + null_conditions
+                tokenized = self.condition_provider.tokenize(conditions)
+                cfg_conditions = self.condition_provider(tokenized)
+        else:
+            cfg_conditions = {}
+
+        if prompt is None:
+            assert num_samples > 0
+            prompt = torch.zeros((num_samples, self.num_codebooks, 0), dtype=torch.long, device=device)
+
+        B, K, T = prompt.shape
+        start_offset = T
+        assert start_offset < max_gen_len
+
+        pattern = self.pattern_provider.get_pattern(max_gen_len)
+        # this token is used as default value for codes that are not generated yet
+        unknown_token = -1
+
+        # we generate codes up to the max_gen_len that will be mapped to the pattern sequence
+        gen_codes = torch.full((B, K, max_gen_len), unknown_token, dtype=torch.long, device=device)
+        # filling the gen_codes with the prompt if needed
+        gen_codes[..., :start_offset] = prompt
+        # create the gen_sequence with proper interleaving from the pattern: [B, K, S]
+        gen_sequence, indexes, mask = pattern.build_pattern_sequence(gen_codes, self.special_token_id)
+        # retrieve the start_offset in the sequence:
+        # it is the first sequence step that contains the `start_offset` timestep
+        start_offset_sequence = pattern.get_first_step_with_timesteps(start_offset)
+        assert start_offset_sequence is not None
+
+        with self.streaming():
+            unconditional_state = self.get_streaming_state()
+            prev_offset = 0
+            gen_sequence_len = gen_sequence.shape[-1]  # gen_sequence shape is [B, K, S]
+            for offset in range(start_offset_sequence, gen_sequence_len):
+                # get current sequence (note that the streaming API is providing the caching over previous offsets)
+                curr_sequence = gen_sequence[..., prev_offset:offset]
+                curr_mask = mask[None, ..., prev_offset:offset].expand(B, -1, -1)
+                if check:
+                    # check coherence between mask and sequence
+                    assert (curr_sequence == torch.where(curr_mask, curr_sequence, self.special_token_id)).all()
+                    # should never happen as gen_sequence is filled progressively
+                    assert not (curr_sequence == unknown_token).any()
+                # sample next token from the model, next token shape is [B, K, 1]
+                next_token = self._sample_next_token(
+                    curr_sequence, cfg_conditions, unconditional_state, use_sampling, temp, top_k, top_p,
+                    cfg_coef=cfg_coef)
+                # ensure the tokens that should be masked are properly set to special_token_id
+                # as the model never outputs special_token_id
+                valid_mask = mask[..., offset:offset+1].expand(B, -1, -1)
+                next_token[~valid_mask] = self.special_token_id
+                # ensure we don't overwrite prompt tokens, we only write over unknown tokens
+                # (then mask tokens should be left as is as well, which is correct)
+                gen_sequence[..., offset:offset+1] = torch.where(
+                    gen_sequence[..., offset:offset+1] == unknown_token,
+                    next_token, gen_sequence[..., offset:offset+1]
+                )
+                prev_offset = offset
+                if callback is not None:
+                    callback(1 + offset - start_offset_sequence, gen_sequence_len - start_offset_sequence)
+        unconditional_state.clear()
+
+        # ensure sequence has been entirely filled
+        assert not (gen_sequence == unknown_token).any()
+        # ensure gen_sequence pattern and mask are matching
+        # which means the gen_sequence is valid according to the pattern
+        assert (
+            gen_sequence == torch.where(mask[None, ...].expand(B, -1, -1), gen_sequence, self.special_token_id)
+        ).all()
+        # get back the codes, trimming the prompt if needed and cutting potentially incomplete timesteps
+        out_codes, out_indexes, out_mask = pattern.revert_pattern_sequence(gen_sequence, special_token=unknown_token)
+
+        # sanity checks over the returned codes and corresponding masks
+        assert (out_codes[..., :max_gen_len] != unknown_token).all()
+        assert (out_mask[..., :max_gen_len] == 1).all()
+
+        out_start_offset = start_offset if remove_prompts else 0
+        out_codes = out_codes[..., out_start_offset:max_gen_len]
+
+        # ensure the returned codes are all valid
+        assert (out_codes >= 0).all() and (out_codes <= self.card).all()
+        return out_codes
+
+

Ancestors

+ +

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Instance variables

+
+
var num_codebooks : int
+
+
+
+ +Expand source code + +
@property
+def num_codebooks(self) -> int:
+    return self.n_q
+
+
+
var special_token_id : int
+
+
+
+ +Expand source code + +
@property
+def special_token_id(self) -> int:
+    return self.card
+
+
+
+

Methods

+
+
+def compute_predictions(self, codes: torch.Tensor, conditions: List[ConditioningAttributes], condition_tensors: Optional[Dict[str, Tuple[torch.Tensor, torch.Tensor]]] = None) ‑> LMOutput +
+
+

Given an input tensor of codes [B, K, T] and list of conditions, runs the model +forward using the specified codes interleaving pattern.

+

Args

+
+
codes : torch.Tensor
+
Input codes of shape [B, K, T] with B the batch size, +K the number of codebooks and T the number of timesteps.
+
conditions : list[ConditioningAttributes]
+
conditionings to use when modeling +the given codes. Note that when evaluating multiple times with the same conditioning +you should pre-compute those and pass them as condition_tensors.
+
condition_tensors : dict[str, ConditionType] or None
+
pre-computed conditioning +tensors, see conditions.
+
+

Returns

+
+
LMOutput
+
Language model outputs +logits (torch.Tensor) of shape [B, K, T, card] corresponding to the provided codes, +i.e. the first item corresponds to logits to predict the first code, meaning that +no additional shifting of codes and logits is required. +mask (torch.Tensor) of shape [B, K, T], mask over valid and invalid positions. +Given the specified interleaving strategies, parts of the logits and codes should +not be considered as valid predictions because of invalid context.
+
+
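+
+The mask is what makes these outputs directly usable for a loss. A minimal consumption sketch with made-up shapes (the tensors below stand in for the result of output = lm.compute_predictions(codes, [], condition_tensors); this is not the repository's training code):
+
+import torch
+import torch.nn.functional as F
+
+B, K, T, card = 2, 4, 50, 1024
+logits = torch.randn(B, K, T, card)              # stands in for output.logits
+mask = torch.ones(B, K, T, dtype=torch.bool)     # stands in for output.mask
+codes = torch.randint(0, card, (B, K, T))
+
+# Logits are already aligned with the codes, so no shifting is needed;
+# positions flagged invalid by the mask are simply dropped from the loss.
+valid = mask.view(-1)
+ce = F.cross_entropy(logits.view(-1, card)[valid], codes.view(-1)[valid])
+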
+ +Expand source code + +
def compute_predictions(
+        self, codes: torch.Tensor,
+        conditions: tp.List[ConditioningAttributes],
+        condition_tensors: tp.Optional[ConditionTensors] = None) -> LMOutput:
+    """Given an input tensor of codes [B, K, T] and list of conditions, runs the model
+    forward using the specified codes interleaving pattern.
+
+    Args:
+        codes (torch.Tensor): Input codes of shape [B, K, T] with B the batch size,
+            K the number of codebooks and T the number of timesteps.
+        conditions (list[ConditioningAttributes]): conditionings to use when modeling
+            the given codes. Note that when evaluating multiple times with the same conditioning
+            you should pre-compute those and pass them as `condition_tensors`.
+        condition_tensors (dict[str, ConditionType] or None): pre-computed conditioning
+            tensors, see `conditions`.
+    Returns:
+        LMOutput: Language model outputs
+            logits (torch.Tensor) of shape [B, K, T, card] corresponding to the provided codes,
+                i.e. the first item corresponds to logits to predict the first code, meaning that
+                no additional shifting of codes and logits is required.
+            mask (torch.Tensor) of shape [B, K, T], mask over valid and invalid positions.
+                Given the specified interleaving strategies, parts of the logits and codes should
+                not be considered as valid predictions because of invalid context.
+    """
+    B, K, T = codes.shape
+    codes = codes.contiguous()
+    # map codes [B, K, T] into pattern sequence [B, K, S] using special_token_id for masked tokens
+    pattern = self.pattern_provider.get_pattern(T)
+    sequence_codes, sequence_indexes, sequence_mask = pattern.build_pattern_sequence(
+        codes, self.special_token_id, keep_only_valid_steps=True
+    )
+    # apply model on pattern sequence
+    model = self if self._fsdp is None else self._fsdp
+    logits = model(sequence_codes, conditions, condition_tensors)  # [B, K, S, card]
+    # map back the logits on pattern sequence to logits on original codes: [B, K, S, card] -> [B, K, T, card]
+    # and provide the corresponding mask over invalid positions of tokens
+    logits = logits.permute(0, 3, 1, 2)  # [B, card, K, S]
+    # note: we use nans as special token to make it obvious if we feed unexpected logits
+    logits, logits_indexes, logits_mask = pattern.revert_pattern_logits(
+        logits, float('nan'), keep_only_valid_steps=True
+    )
+    logits = logits.permute(0, 2, 3, 1)  # [B, K, T, card]
+    logits_mask = logits_mask[None, :, :].expand(B, -1, -1)  # [K, T] -> [B, K, T]
+    return LMOutput(logits, logits_mask)
+
+
+
+def forward(self, sequence: torch.Tensor, conditions: List[ConditioningAttributes], condition_tensors: Optional[Dict[str, Tuple[torch.Tensor, torch.Tensor]]] = None) ‑> torch.Tensor +
+
+

Apply language model on sequence and conditions. +Given a tensor of sequence of shape [B, K, S] with K the number of codebooks and +S the sequence steps, return the logits with shape [B, card, K, S].

+

Args

+
+
sequence : torch.Tensor
+
Indices of the codes to model.
+
conditions : list[ConditioningAttributes]
+
conditionings to use when modeling +the given codes. Note that when evaluating multiple times with the same conditioning +you should pre-compute those and pass them as condition_tensors.
+
condition_tensors : dict[str, ConditionType] or None
+
pre-computed conditioning +tensors, see conditions.
+
+

Returns

+
+
torch.Tensor
+
Logits.
+
+
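+
+The shape bookkeeping can be reproduced standalone: one embedding table per codebook is summed to build the transformer input, and one linear head per codebook projects the output back to logits. A toy sketch (dimensions are arbitrary and the transformer is replaced by an identity):
+
+import torch
+from torch import nn
+
+B, K, S, card, dim = 2, 4, 10, 1024, 128
+emb = nn.ModuleList([nn.Embedding(card + 1, dim) for _ in range(K)])   # +1 leaves room for the special token
+linears = nn.ModuleList([nn.Linear(dim, card) for _ in range(K)])      # one head per codebook
+
+sequence = torch.randint(0, card, (B, K, S))
+input_ = sum(emb[k](sequence[:, k]) for k in range(K))                 # [B, S, dim]
+out = input_                                                           # the real model runs a StreamingTransformer here
+logits = torch.stack([linears[k](out) for k in range(K)], dim=1)       # [B, K, S, card]
+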
+ +Expand source code + +
def forward(self, sequence: torch.Tensor,
+            conditions: tp.List[ConditioningAttributes],
+            condition_tensors: tp.Optional[ConditionTensors] = None) -> torch.Tensor:
+    """Apply language model on sequence and conditions.
+    Given a tensor of sequence of shape [B, K, S] with K the number of codebooks and
+    S the sequence steps, return the logits with shape [B, card, K, S].
+
+    Args:
+        sequence (torch.Tensor): Indices of the codes to model.
+        conditions (list[ConditioningAttributes]): conditionings to use when modeling
+            the given codes. Note that when evaluating multiple times with the same conditioning
+            you should pre-compute those and pass them as `condition_tensors`.
+        condition_tensors (dict[str, ConditionType] or None): pre-computed conditioning
+            tensors, see `conditions`.
+    Returns:
+        torch.Tensor: Logits.
+    """
+    B, K, S = sequence.shape
+    assert K == self.num_codebooks, 'Sequence shape must match the specified number of codebooks'
+    input_ = sum([self.emb[k](sequence[:, k]) for k in range(K)])
+    if condition_tensors is None:
+        assert not self._is_streaming, "Conditions tensors should be precomputed when streaming."
+        # apply dropout modules
+        conditions = self.cfg_dropout(conditions)
+        conditions = self.att_dropout(conditions)
+        tokenized = self.condition_provider.tokenize(conditions)
+        # encode conditions and fuse, both have a streaming cache to not recompute when generating.
+        condition_tensors = self.condition_provider(tokenized)
+    else:
+        assert not conditions, "Shouldn't pass both conditions and condition_tensors."
+
+    input_, cross_attention_input = self.fuser(input_, condition_tensors)
+
+    out = self.transformer(input_, cross_attention_src=cross_attention_input)
+    if self.out_norm:
+        out = self.out_norm(out)
+    logits = torch.stack([self.linears[k](out) for k in range(K)], dim=1)  # [B, K, S, card]
+
+    # remove the prefix from the model outputs
+    if len(self.fuser.fuse2cond['prepend']) > 0:
+        logits = logits[:, :, -S:]
+
+    return logits  # [B, K, S, card]
+
+
+
+def generate(self, prompt: Optional[torch.Tensor] = None, conditions: List[ConditioningAttributes] = [], num_samples: Optional[int] = None, max_gen_len: int = 256, use_sampling: bool = True, temp: float = 1.0, top_k: int = 250, top_p: float = 0.0, cfg_coef: Optional[float] = None, two_step_cfg: Optional[bool] = None, remove_prompts: bool = False, check: bool = False, callback: Optional[Callable[[int, int], None]] = None) ‑> torch.Tensor +
+
+

Generate tokens sampling from the model given a prompt or unconditionally. Generation can +be performed in a greedy fashion or using sampling with top K and top P strategies.

+

Args

+
+
prompt : Optional[torch.Tensor]
+
Prompt tokens of shape [B, K, T].
+
conditions : list[ConditioningAttributes]
+
List of conditions (empty for unconditional generation).
+
num_samples : int or None
+
Number of samples to generate when no prompt and no conditions are given.
+
max_gen_len : int
+
Maximum generation length.
+
use_sampling : bool
+
Whether to use a sampling strategy or not.
+
temp : float
+
Sampling temperature.
+
top_k : int
+
K for "top-k" sampling.
+
top_p : float
+
P for "top-p" sampling.
+
remove_prompts : bool
+
Whether to remove prompts from generation or not.
+
+

Returns

+
+
torch.Tensor
+
Generated tokens.
+
+
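+
+A calling sketch for orientation (lm is assumed to be an LMModel in eval mode and attributes a list of ConditioningAttributes, both prepared elsewhere, e.g. through the loaders and conditioners modules; the sampling values are illustrative):
+
+# `lm` and `attributes` are assumed to exist (see the note above); nothing here is downloaded or defined.
+codes = lm.generate(
+    prompt=None,                 # or a [B, K, T] tensor of prompt codes to continue from
+    conditions=attributes,
+    max_gen_len=500,             # roughly 10 s at the 50 Hz frame rate of the MusicGen EnCodec tokenizer
+    use_sampling=True, temp=1.0, top_k=250, top_p=0.0,
+    cfg_coef=3.0,
+)
+# codes: [B, lm.num_codebooks, 500] discrete tokens, to be decoded back to audio by the compression model.
+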
+ +Expand source code + +
@torch.no_grad()
+def generate(self,
+             prompt: tp.Optional[torch.Tensor] = None,
+             conditions: tp.List[ConditioningAttributes] = [],
+             num_samples: tp.Optional[int] = None,
+             max_gen_len: int = 256,
+             use_sampling: bool = True,
+             temp: float = 1.0,
+             top_k: int = 250,
+             top_p: float = 0.0,
+             cfg_coef: tp.Optional[float] = None,
+             two_step_cfg: tp.Optional[bool] = None,
+             remove_prompts: bool = False,
+             check: bool = False,
+             callback: tp.Optional[tp.Callable[[int, int], None]] = None) -> torch.Tensor:
+    """Generate tokens sampling from the model given a prompt or unconditionally. Generation can
+    be performed in a greedy fashion or using sampling with top K and top P strategies.
+
+    Args:
+        prompt (Optional[torch.Tensor]): Prompt tokens of shape [B, K, T].
+        conditions (list[ConditioningAttributes]): List of conditions (empty for unconditional generation).
+        num_samples (int or None): Number of samples to generate when no prompt and no conditions are given.
+        max_gen_len (int): Maximum generation length.
+        use_sampling (bool): Whether to use a sampling strategy or not.
+        temp (float): Sampling temperature.
+        top_k (int): K for "top-k" sampling.
+        top_p (float): P for "top-p" sampling.
+        remove_prompts (bool): Whether to remove prompts from generation or not.
+    Returns:
+        torch.Tensor: Generated tokens.
+    """
+    assert not self.training, "generation shouldn't be used in training mode."
+    first_param = next(iter(self.parameters()))
+    device = first_param.device
+
+    # Checking all input shapes are consistent.
+    possible_num_samples = []
+    if num_samples is not None:
+        possible_num_samples.append(num_samples)
+    elif prompt is not None:
+        possible_num_samples.append(prompt.shape[0])
+    elif conditions:
+        possible_num_samples.append(len(conditions))
+    else:
+        possible_num_samples.append(1)
+    assert all(x == possible_num_samples[0] for x in possible_num_samples), "Inconsistent input shapes"
+    num_samples = possible_num_samples[0]
+
+    # below we create two sets of conditions: one conditional and one unconditional
+    # to do that we merge the regular condition together with the null condition
+    # we then do 1 forward pass instead of 2.
+    # the reason for that is two-fold:
+    # 1. it is about x2 faster than doing 2 forward passes
+    # 2. avoid the streaming API treating the 2 passes as part of different time steps
+    # We also support doing two different passes, in particular to ensure that
+    # the padding structure is exactly the same between train and test.
+    # With a batch size of 1, this can be slower though.
+    cfg_conditions: CFGConditions
+    two_step_cfg = self.two_step_cfg if two_step_cfg is None else two_step_cfg
+    if conditions:
+        null_conditions = ClassifierFreeGuidanceDropout(p=1.0)(conditions)
+        if two_step_cfg:
+            cfg_conditions = (
+                self.condition_provider(self.condition_provider.tokenize(conditions)),
+                self.condition_provider(self.condition_provider.tokenize(null_conditions)),
+            )
+        else:
+            conditions = conditions + null_conditions
+            tokenized = self.condition_provider.tokenize(conditions)
+            cfg_conditions = self.condition_provider(tokenized)
+    else:
+        cfg_conditions = {}
+
+    if prompt is None:
+        assert num_samples > 0
+        prompt = torch.zeros((num_samples, self.num_codebooks, 0), dtype=torch.long, device=device)
+
+    B, K, T = prompt.shape
+    start_offset = T
+    assert start_offset < max_gen_len
+
+    pattern = self.pattern_provider.get_pattern(max_gen_len)
+    # this token is used as default value for codes that are not generated yet
+    unknown_token = -1
+
+    # we generate codes up to the max_gen_len that will be mapped to the pattern sequence
+    gen_codes = torch.full((B, K, max_gen_len), unknown_token, dtype=torch.long, device=device)
+    # filling the gen_codes with the prompt if needed
+    gen_codes[..., :start_offset] = prompt
+    # create the gen_sequence with proper interleaving from the pattern: [B, K, S]
+    gen_sequence, indexes, mask = pattern.build_pattern_sequence(gen_codes, self.special_token_id)
+    # retrieve the start_offset in the sequence:
+    # it is the first sequence step that contains the `start_offset` timestep
+    start_offset_sequence = pattern.get_first_step_with_timesteps(start_offset)
+    assert start_offset_sequence is not None
+
+    with self.streaming():
+        unconditional_state = self.get_streaming_state()
+        prev_offset = 0
+        gen_sequence_len = gen_sequence.shape[-1]  # gen_sequence shape is [B, K, S]
+        for offset in range(start_offset_sequence, gen_sequence_len):
+            # get current sequence (note that the streaming API is providing the caching over previous offsets)
+            curr_sequence = gen_sequence[..., prev_offset:offset]
+            curr_mask = mask[None, ..., prev_offset:offset].expand(B, -1, -1)
+            if check:
+                # check coherence between mask and sequence
+                assert (curr_sequence == torch.where(curr_mask, curr_sequence, self.special_token_id)).all()
+                # should never happen as gen_sequence is filled progressively
+                assert not (curr_sequence == unknown_token).any()
+            # sample next token from the model, next token shape is [B, K, 1]
+            next_token = self._sample_next_token(
+                curr_sequence, cfg_conditions, unconditional_state, use_sampling, temp, top_k, top_p,
+                cfg_coef=cfg_coef)
+            # ensure the tokens that should be masked are properly set to special_token_id
+            # as the model never outputs special_token_id
+            valid_mask = mask[..., offset:offset+1].expand(B, -1, -1)
+            next_token[~valid_mask] = self.special_token_id
+            # ensure we don't overwrite prompt tokens, we only write over unknown tokens
+            # (then mask tokens should be left as is as well, which is correct)
+            gen_sequence[..., offset:offset+1] = torch.where(
+                gen_sequence[..., offset:offset+1] == unknown_token,
+                next_token, gen_sequence[..., offset:offset+1]
+            )
+            prev_offset = offset
+            if callback is not None:
+                callback(1 + offset - start_offset_sequence, gen_sequence_len - start_offset_sequence)
+    unconditional_state.clear()
+
+    # ensure sequence has been entirely filled
+    assert not (gen_sequence == unknown_token).any()
+    # ensure gen_sequence pattern and mask are matching
+    # which means the gen_sequence is valid according to the pattern
+    assert (
+        gen_sequence == torch.where(mask[None, ...].expand(B, -1, -1), gen_sequence, self.special_token_id)
+    ).all()
+    # get back the codes, trimming the prompt if needed and cutting potentially incomplete timesteps
+    out_codes, out_indexes, out_mask = pattern.revert_pattern_sequence(gen_sequence, special_token=unknown_token)
+
+    # sanity checks over the returned codes and corresponding masks
+    assert (out_codes[..., :max_gen_len] != unknown_token).all()
+    assert (out_mask[..., :max_gen_len] == 1).all()
+
+    out_start_offset = start_offset if remove_prompts else 0
+    out_codes = out_codes[..., out_start_offset:max_gen_len]
+
+    # ensure the returned codes are all valid
+    assert (out_codes >= 0).all() and (out_codes <= self.card).all()
+    return out_codes
+
+
+
+

Inherited members

+ +
+
+class LMOutput +(logits: torch.Tensor, mask: torch.Tensor) +
+
+

LMOutput(logits: torch.Tensor, mask: torch.Tensor)

+
+ +Expand source code + +
class LMOutput:
+    # The logits are already re-aligned with the input codes
+    # hence no extra shift is required, e.g. when computing CE
+    logits: torch.Tensor  # [B, K, T, card]
+    mask: torch.Tensor  # [B, K, T]
+
+

Class variables

+
+
var logits : torch.Tensor
+
+
+
+
var mask : torch.Tensor
+
+
+
+
+
+
+class ScaledEmbedding +(*args, lr=None, **kwargs) +
+
+

Boost learning rate for embeddings (with scale).

+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class ScaledEmbedding(nn.Embedding):
+    """Boost learning rate for embeddings (with `scale`).
+    """
+    def __init__(self, *args, lr=None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.lr = lr
+
+    def make_optim_group(self):
+        group = {"params": list(self.parameters())}
+        if self.lr is not None:
+            group["lr"] = self.lr
+        return group
+
+
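+
+A small sketch of how the per-module learning rate is meant to be consumed by an optimizer (optimizer choice and values are arbitrary; the import path audiocraft.models.lm is assumed):
+
+import torch
+
+from audiocraft.models.lm import ScaledEmbedding  # module path assumed for this example
+
+emb = ScaledEmbedding(1024 + 1, 128, lr=1e-3)      # embedding table with its own learning rate
+head = torch.nn.Linear(128, 1024)
+
+optimizer = torch.optim.AdamW(
+    [emb.make_optim_group(),                       # {'params': [...], 'lr': 1e-3}
+     {'params': head.parameters()}],               # falls back to the optimizer default lr
+    lr=1e-4,
+)
+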

Ancestors

+
    +
  • torch.nn.modules.sparse.Embedding
  • +
  • torch.nn.modules.module.Module
  • +
+

Class variables

+
+
var embedding_dim : int
+
+
+
+
var freeze : bool
+
+
+
+
var max_norm : Optional[float]
+
+
+
+
var norm_type : float
+
+
+
+
var num_embeddings : int
+
+
+
+
var padding_idx : Optional[int]
+
+
+
+
var scale_grad_by_freq : bool
+
+
+
+
var sparse : bool
+
+
+
+
var weight : torch.Tensor
+
+
+
+
+

Methods

+
+
+def make_optim_group(self) +
+
+
+
+ +Expand source code + +
def make_optim_group(self):
+    group = {"params": list(self.parameters())}
+    if self.lr is not None:
+        group["lr"] = self.lr
+    return group
+
+
+
+
+
+
+
+ +
+
+
+
\ No newline at end of file
diff --git a/docs/audiocraft/models/loaders.html b/docs/audiocraft/models/loaders.html
new file mode 100644
index 00000000..64f4bf06
--- /dev/null
+++ b/docs/audiocraft/models/loaders.html
@@ -0,0 +1,217 @@
+
+
+
+
+
+
+audiocraft.models.loaders API documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

Module audiocraft.models.loaders

+
+
+

Utility functions to load from the checkpoints. +Each checkpoint is a dict saved with torch.save and contains the following keys: +- 'xp.cfg': the hydra config as dumped during training. This should be used +to rebuild the object using the audiocraft.models.builders functions, +- 'best_state': a readily loadable best state for the model, including +the conditioner. The model obtained from xp.cfg should be compatible +with this state dict. In the case of an LM, the encodec model would not be +bundled along but instead provided separately.

+

Those functions also support loading from a remote location with the Torch Hub API. +They also support overriding some parameters, in particular the device and dtype +of the returned model.

+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Utility functions to load from the checkpoints.
+Each checkpoint is a dict saved with `torch.save` and contains the following keys:
+- 'xp.cfg': the hydra config as dumped during training. This should be used
+    to rebuild the object using the audiocraft.models.builders functions,
+- 'best_state': a readily loadable best state for the model, including
+    the conditioner. The model obtained from `xp.cfg` should be compatible
+    with this state dict. In the case of an LM, the encodec model would not be
+    bundled along but instead provided separately.
+
+Those functions also support loading from a remote location with the Torch Hub API.
+They also support overriding some parameters, in particular the device and dtype
+of the returned model.
+"""
+
+from pathlib import Path
+from huggingface_hub import hf_hub_download
+import typing as tp
+import os
+
+from omegaconf import OmegaConf
+import torch
+
+from . import builders
+
+
+HF_MODEL_CHECKPOINTS_MAP = {
+    "small": "facebook/musicgen-small",
+    "medium": "facebook/musicgen-medium",
+    "large": "facebook/musicgen-large",
+    "melody": "facebook/musicgen-melody",
+}
+
+
+def _get_state_dict(
+    file_or_url_or_id: tp.Union[Path, str],
+    filename: tp.Optional[str] = None,
+    device='cpu',
+    cache_dir: tp.Optional[str] = None,
+):
+    # Return the state dict either from a file or url
+    file_or_url_or_id = str(file_or_url_or_id)
+    assert isinstance(file_or_url_or_id, str)
+
+    if os.path.isfile(file_or_url_or_id):
+        return torch.load(file_or_url_or_id, map_location=device)
+
+    if os.path.isdir(file_or_url_or_id):
+        file = f"{file_or_url_or_id}/{filename}"
+        return torch.load(file, map_location=device)
+
+    elif file_or_url_or_id.startswith('https://'):
+        return torch.hub.load_state_dict_from_url(file_or_url_or_id, map_location=device, check_hash=True)
+
+    elif file_or_url_or_id in HF_MODEL_CHECKPOINTS_MAP:
+        assert filename is not None, "filename needs to be defined if using HF checkpoints"
+
+        repo_id = HF_MODEL_CHECKPOINTS_MAP[file_or_url_or_id]
+        file = hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=cache_dir)
+        return torch.load(file, map_location=device)
+
+    else:
+        raise ValueError(f"{file_or_url_or_id} is not a valid name, path or link that can be loaded.")
+
+
+def load_compression_model(file_or_url_or_id: tp.Union[Path, str], device='cpu', cache_dir: tp.Optional[str] = None):
+    pkg = _get_state_dict(file_or_url_or_id, filename="compression_state_dict.bin", cache_dir=cache_dir)
+    cfg = OmegaConf.create(pkg['xp.cfg'])
+    cfg.device = str(device)
+    model = builders.get_compression_model(cfg)
+    model.load_state_dict(pkg['best_state'])
+    model.eval()
+    return model
+
+
+def load_lm_model(file_or_url_or_id: tp.Union[Path, str], device='cpu', cache_dir: tp.Optional[str] = None):
+    pkg = _get_state_dict(file_or_url_or_id, filename="state_dict.bin", cache_dir=cache_dir)
+    cfg = OmegaConf.create(pkg['xp.cfg'])
+    cfg.device = str(device)
+    if cfg.device == 'cpu':
+        cfg.dtype = 'float32'
+    else:
+        cfg.dtype = 'float16'
+    model = builders.get_lm_model(cfg)
+    model.load_state_dict(pkg['best_state'])
+    model.eval()
+    model.cfg = cfg
+    return model
+
+
+
+
+
+
+
+

Functions

+
+
+def load_compression_model(file_or_url_or_id: Union[str, pathlib.Path], device='cpu', cache_dir: Optional[str] = None) +
+
+
+
+ +Expand source code + +
def load_compression_model(file_or_url_or_id: tp.Union[Path, str], device='cpu', cache_dir: tp.Optional[str] = None):
+    pkg = _get_state_dict(file_or_url_or_id, filename="compression_state_dict.bin", cache_dir=cache_dir)
+    cfg = OmegaConf.create(pkg['xp.cfg'])
+    cfg.device = str(device)
+    model = builders.get_compression_model(cfg)
+    model.load_state_dict(pkg['best_state'])
+    model.eval()
+    return model
+
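A hedged usage sketch: loading the compression model for a released checkpoint id (this downloads compression_state_dict.bin from the Hugging Face Hub, so network access is assumed).

compression_model = load_compression_model('small', device='cpu')
print(compression_model.sample_rate, compression_model.channels)   # codec audio configuration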
+
+
+def load_lm_model(file_or_url_or_id: Union[str, pathlib.Path], device='cpu', cache_dir: Optional[str] = None) +
+
+
+
+ +Expand source code + +
def load_lm_model(file_or_url_or_id: tp.Union[Path, str], device='cpu', cache_dir: tp.Optional[str] = None):
+    pkg = _get_state_dict(file_or_url_or_id, filename="state_dict.bin", cache_dir=cache_dir)
+    cfg = OmegaConf.create(pkg['xp.cfg'])
+    cfg.device = str(device)
+    if cfg.device == 'cpu':
+        cfg.dtype = 'float32'
+    else:
+        cfg.dtype = 'float16'
+    model = builders.get_lm_model(cfg)
+    model.load_state_dict(pkg['best_state'])
+    model.eval()
+    model.cfg = cfg
+    return model
+
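A matching sketch for the language model, under the same assumptions; state_dict.bin is fetched and the dtype is forced to float32 because the target device is CPU.

lm = load_lm_model('small', device='cpu')
assert lm.cfg.dtype == 'float32'    # set by load_lm_model whenever cfg.device == 'cpu'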
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/models/musicgen.html b/docs/audiocraft/models/musicgen.html new file mode 100644 index 00000000..e6a3a2c8 --- /dev/null +++ b/docs/audiocraft/models/musicgen.html @@ -0,0 +1,1135 @@ + + + + + + +audiocraft.models.musicgen API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.models.musicgen

+
+
+

Main model for using MusicGen. This will combine all the required components +and provide easy access to the generation API.

+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Main model for using MusicGen. This will combine all the required components
+and provide easy access to the generation API.
+"""
+
+import os
+import typing as tp
+
+import torch
+
+from .encodec import CompressionModel
+from .lm import LMModel
+from .builders import get_debug_compression_model, get_debug_lm_model
+from .loaders import load_compression_model, load_lm_model, HF_MODEL_CHECKPOINTS_MAP
+from ..data.audio_utils import convert_audio
+from ..modules.conditioners import ConditioningAttributes, WavCondition
+from ..utils.autocast import TorchAutocast
+
+
+MelodyList = tp.List[tp.Optional[torch.Tensor]]
+MelodyType = tp.Union[torch.Tensor, MelodyList]
+
+
+class MusicGen:
+    """MusicGen main model with convenient generation API.
+
+    Args:
+        name (str): name of the model.
+        compression_model (CompressionModel): Compression model
+            used to map audio to invertible discrete representations.
+        lm (LMModel): Language model over discrete representations.
+    """
+    def __init__(self, name: str, compression_model: CompressionModel, lm: LMModel,
+                 max_duration: float = 30):
+        self.name = name
+        self.compression_model = compression_model
+        self.lm = lm
+        self.max_duration = max_duration
+        self.device = next(iter(lm.parameters())).device
+        self.generation_params: dict = {}
+        self.set_generation_params(duration=15)  # 15 seconds by default
+        self._progress_callback: tp.Optional[tp.Callable[[int, int], None]] = None
+        if self.device.type == 'cpu':
+            self.autocast = TorchAutocast(enabled=False)
+        else:
+            self.autocast = TorchAutocast(
+                enabled=True, device_type=self.device.type, dtype=torch.float16)
+
+    @property
+    def frame_rate(self) -> int:
+        """Roughly the number of AR steps per seconds."""
+        return self.compression_model.frame_rate
+
+    @property
+    def sample_rate(self) -> int:
+        """Sample rate of the generated audio."""
+        return self.compression_model.sample_rate
+
+    @property
+    def audio_channels(self) -> int:
+        """Audio channels of the generated audio."""
+        return self.compression_model.channels
+
+    @staticmethod
+    def get_pretrained(name: str = 'melody', device=None):
+        """Return pretrained model, we provide four models:
+        - small (300M), text to music, # see: https://huggingface.co/facebook/musicgen-small
+        - medium (1.5B), text to music, # see: https://huggingface.co/facebook/musicgen-medium
+        - melody (1.5B), text to music and text+melody to music, # see: https://huggingface.co/facebook/musicgen-melody
+        - large (3.3B), text to music, # see: https://huggingface.co/facebook/musicgen-large
+        """
+
+        if device is None:
+            if torch.cuda.device_count():
+                device = 'cuda'
+            else:
+                device = 'cpu'
+
+        if name == 'debug':
+            # used only for unit tests
+            compression_model = get_debug_compression_model(device)
+            lm = get_debug_lm_model(device)
+            return MusicGen(name, compression_model, lm)
+
+        if name not in HF_MODEL_CHECKPOINTS_MAP:
+            if not os.path.isfile(name) and not os.path.isdir(name):
+                raise ValueError(
+                    f"{name} is not a valid checkpoint name. "
+                    f"Choose one of {', '.join(HF_MODEL_CHECKPOINTS_MAP.keys())}"
+                )
+
+        cache_dir = os.environ.get('MUSICGEN_ROOT', None)
+        compression_model = load_compression_model(name, device=device, cache_dir=cache_dir)
+        lm = load_lm_model(name, device=device, cache_dir=cache_dir)
+        if name == 'melody':
+            lm.condition_provider.conditioners['self_wav'].match_len_on_eval = True
+
+        return MusicGen(name, compression_model, lm)
+
+    def set_generation_params(self, use_sampling: bool = True, top_k: int = 250,
+                              top_p: float = 0.0, temperature: float = 1.0,
+                              duration: float = 30.0, cfg_coef: float = 3.0,
+                              two_step_cfg: bool = False, extend_stride: float = 18):
+        """Set the generation parameters for MusicGen.
+
+        Args:
+            use_sampling (bool, optional): Use sampling if True, else do argmax decoding. Defaults to True.
+            top_k (int, optional): top_k used for sampling. Defaults to 250.
+            top_p (float, optional): top_p used for sampling; when set to 0, top_k is used. Defaults to 0.0.
+            temperature (float, optional): Softmax temperature parameter. Defaults to 1.0.
+            duration (float, optional): Duration of the generated waveform. Defaults to 30.0.
+            cfg_coef (float, optional): Coefficient used for classifier free guidance. Defaults to 3.0.
+            two_step_cfg (bool, optional): If True, performs 2 forward passes for Classifier Free Guidance,
+                instead of batching together the two. This has some impact on how things
+                are padded but seems to have little impact in practice.
+            extend_stride: when doing extended generation (i.e. more than 30 seconds), by how much
+                should we extend the audio each time. Larger values will mean less context is
+                preserved, and shorter values will require extra computation.
+        """
+        assert extend_stride < self.max_duration, "Cannot stride by more than max generation duration."
+        self.extend_stride = extend_stride
+        self.duration = duration
+        self.generation_params = {
+            'use_sampling': use_sampling,
+            'temp': temperature,
+            'top_k': top_k,
+            'top_p': top_p,
+            'cfg_coef': cfg_coef,
+            'two_step_cfg': two_step_cfg,
+        }
+
+    def set_custom_progress_callback(self, progress_callback: tp.Optional[tp.Callable[[int, int], None]] = None):
+        """Override the default progress callback."""
+        self._progress_callback = progress_callback
+
+    def generate_unconditional(self, num_samples: int, progress: bool = False) -> torch.Tensor:
+        """Generate samples in an unconditional manner.
+
+        Args:
+            num_samples (int): Number of samples to be generated.
+            progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
+        """
+        descriptions: tp.List[tp.Optional[str]] = [None] * num_samples
+        attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, None)
+        return self._generate_tokens(attributes, prompt_tokens, progress)
+
+    def generate(self, descriptions: tp.List[str], progress: bool = False) -> torch.Tensor:
+        """Generate samples conditioned on text.
+
+        Args:
+            descriptions (tp.List[str]): A list of strings used as text conditioning.
+            progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
+        """
+        attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, None)
+        assert prompt_tokens is None
+        return self._generate_tokens(attributes, prompt_tokens, progress)
+
+    def generate_with_chroma(self, descriptions: tp.List[str], melody_wavs: MelodyType,
+                             melody_sample_rate: int, progress: bool = False) -> torch.Tensor:
+        """Generate samples conditioned on text and melody.
+
+        Args:
+            descriptions (tp.List[str]): A list of strings used as text conditioning.
+            melody_wavs: (torch.Tensor or list of Tensor): A batch of waveforms used as
+                melody conditioning. Should have shape [B, C, T] with B matching the description length,
+                C=1 or 2. It can be [C, T] if there is a single description. It can also be
+                a list of [C, T] tensors.
+            melody_sample_rate: (int): Sample rate of the melody waveforms.
+            progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
+        """
+        if isinstance(melody_wavs, torch.Tensor):
+            if melody_wavs.dim() == 2:
+                melody_wavs = melody_wavs[None]
+            if melody_wavs.dim() != 3:
+                raise ValueError("Melody wavs should have a shape [B, C, T].")
+            melody_wavs = list(melody_wavs)
+        else:
+            for melody in melody_wavs:
+                if melody is not None:
+                    assert melody.dim() == 2, "One melody in the list has the wrong number of dims."
+
+        melody_wavs = [
+            convert_audio(wav, melody_sample_rate, self.sample_rate, self.audio_channels)
+            if wav is not None else None
+            for wav in melody_wavs]
+        attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions=descriptions, prompt=None,
+                                                                        melody_wavs=melody_wavs)
+        assert prompt_tokens is None
+        return self._generate_tokens(attributes, prompt_tokens, progress)
+
+    def generate_continuation(self, prompt: torch.Tensor, prompt_sample_rate: int,
+                              descriptions: tp.Optional[tp.List[tp.Optional[str]]] = None,
+                              progress: bool = False) -> torch.Tensor:
+        """Generate samples conditioned on audio prompts.
+
+        Args:
+            prompt (torch.Tensor): A batch of waveforms used for continuation.
+                Prompt should be [B, C, T], or [C, T] if only one sample is generated.
+            prompt_sample_rate (int): Sampling rate of the given audio waveforms.
+            descriptions (tp.List[str], optional): A list of strings used as text conditioning. Defaults to None.
+            progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
+        """
+        if prompt.dim() == 2:
+            prompt = prompt[None]
+        if prompt.dim() != 3:
+            raise ValueError("prompt should have 3 dimensions: [B, C, T] (C = 1).")
+        prompt = convert_audio(prompt, prompt_sample_rate, self.sample_rate, self.audio_channels)
+        if descriptions is None:
+            descriptions = [None] * len(prompt)
+        attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, prompt)
+        assert prompt_tokens is not None
+        return self._generate_tokens(attributes, prompt_tokens, progress)
+
+    @torch.no_grad()
+    def _prepare_tokens_and_attributes(
+            self,
+            descriptions: tp.Sequence[tp.Optional[str]],
+            prompt: tp.Optional[torch.Tensor],
+            melody_wavs: tp.Optional[MelodyList] = None,
+    ) -> tp.Tuple[tp.List[ConditioningAttributes], tp.Optional[torch.Tensor]]:
+        """Prepare model inputs.
+
+        Args:
+            descriptions (tp.List[str]): A list of strings used as text conditioning.
+            prompt (torch.Tensor): A batch of waveforms used for continuation.
+            melody_wavs (tp.Optional[torch.Tensor], optional): A batch of waveforms
+                used as melody conditioning. Defaults to None.
+        """
+        attributes = [
+            ConditioningAttributes(text={'description': description})
+            for description in descriptions]
+
+        if melody_wavs is None:
+            for attr in attributes:
+                attr.wav['self_wav'] = WavCondition(
+                    torch.zeros((1, 1), device=self.device),
+                    torch.tensor([0], device=self.device),
+                    path='null_wav')  # type: ignore
+        else:
+            if self.name != "melody":
+                raise RuntimeError("This model doesn't support melody conditioning. "
+                                   "Use the `melody` model.")
+            assert len(melody_wavs) == len(descriptions), \
+                f"number of melody wavs must match number of descriptions! " \
+                f"got melody len={len(melody_wavs)}, and descriptions len={len(descriptions)}"
+            for attr, melody in zip(attributes, melody_wavs):
+                if melody is None:
+                    attr.wav['self_wav'] = WavCondition(
+                        torch.zeros((1, 1), device=self.device),
+                        torch.tensor([0], device=self.device),
+                        path='null_wav')  # type: ignore
+                else:
+                    attr.wav['self_wav'] = WavCondition(
+                        melody.to(device=self.device),
+                        torch.tensor([melody.shape[-1]], device=self.device))
+
+        if prompt is not None:
+            if descriptions is not None:
+                assert len(descriptions) == len(prompt), "Prompt and nb. descriptions don't match"
+            prompt = prompt.to(self.device)
+            prompt_tokens, scale = self.compression_model.encode(prompt)
+            assert scale is None
+        else:
+            prompt_tokens = None
+        return attributes, prompt_tokens
+
+    def _generate_tokens(self, attributes: tp.List[ConditioningAttributes],
+                         prompt_tokens: tp.Optional[torch.Tensor], progress: bool = False) -> torch.Tensor:
+        """Generate discrete audio tokens given audio prompt and/or conditions.
+
+        Args:
+            attributes (tp.List[ConditioningAttributes]): Conditions used for generation (text/melody).
+            prompt_tokens (tp.Optional[torch.Tensor]): Audio prompt used for continuation.
+            progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
+        Returns:
+            torch.Tensor: Generated audio, of shape [B, C, T], T is defined by the generation params.
+        """
+        total_gen_len = int(self.duration * self.frame_rate)
+        max_prompt_len = int(min(self.duration, self.max_duration) * self.frame_rate)
+        current_gen_offset: int = 0
+
+        def _progress_callback(generated_tokens: int, tokens_to_generate: int):
+            generated_tokens += current_gen_offset
+            if self._progress_callback is not None:
+                # Note that total_gen_len might be quite wrong depending on the
+                # codebook pattern used, but with delay it is almost accurate.
+                self._progress_callback(generated_tokens, total_gen_len)
+            else:
+                print(f'{generated_tokens: 6d} / {total_gen_len: 6d}', end='\r')
+
+        if prompt_tokens is not None:
+            assert max_prompt_len >= prompt_tokens.shape[-1], \
+                "Prompt is longer than audio to generate"
+
+        callback = None
+        if progress:
+            callback = _progress_callback
+
+        if self.duration <= self.max_duration:
+            # generate by sampling from LM, simple case.
+            with self.autocast:
+                gen_tokens = self.lm.generate(
+                    prompt_tokens, attributes,
+                    callback=callback, max_gen_len=total_gen_len, **self.generation_params)
+
+        else:
+            # now this gets a bit messier, we need to handle prompts,
+            # melody conditioning etc.
+            ref_wavs = [attr.wav['self_wav'] for attr in attributes]
+            all_tokens = []
+            if prompt_tokens is None:
+                prompt_length = 0
+            else:
+                all_tokens.append(prompt_tokens)
+                prompt_length = prompt_tokens.shape[-1]
+
+            stride_tokens = int(self.frame_rate * self.extend_stride)
+
+            while current_gen_offset + prompt_length < total_gen_len:
+                time_offset = current_gen_offset / self.frame_rate
+                chunk_duration = min(self.duration - time_offset, self.max_duration)
+                max_gen_len = int(chunk_duration * self.frame_rate)
+                for attr, ref_wav in zip(attributes, ref_wavs):
+                    wav_length = ref_wav.length.item()
+                    if wav_length == 0:
+                        continue
+                    # We will extend the wav periodically if it is not long enough.
+                    # we have to do it here rather than in conditioners.py as otherwise
+                    # we wouldn't have the full wav.
+                    initial_position = int(time_offset * self.sample_rate)
+                    wav_target_length = int(self.max_duration * self.sample_rate)
+                    positions = torch.arange(initial_position,
+                                             initial_position + wav_target_length, device=self.device)
+                    attr.wav['self_wav'] = WavCondition(
+                        ref_wav[0][:, positions % wav_length],
+                        torch.full_like(ref_wav[1], wav_target_length))
+                with self.autocast:
+                    gen_tokens = self.lm.generate(
+                        prompt_tokens, attributes,
+                        callback=callback, max_gen_len=max_gen_len, **self.generation_params)
+                if prompt_tokens is None:
+                    all_tokens.append(gen_tokens)
+                else:
+                    all_tokens.append(gen_tokens[:, :, prompt_tokens.shape[-1]:])
+                prompt_tokens = gen_tokens[:, :, stride_tokens:]
+                prompt_length = prompt_tokens.shape[-1]
+                current_gen_offset += stride_tokens
+
+            gen_tokens = torch.cat(all_tokens, dim=-1)
+
+        # generate audio
+        assert gen_tokens.dim() == 3
+        with torch.no_grad():
+            gen_audio = self.compression_model.decode(gen_tokens, None)
+        return gen_audio
+
+
+
+
+
+
+
+
+
+

Classes

+
+
+class MusicGen +(name: str, compression_model: CompressionModel, lm: LMModel, max_duration: float = 30) +
+
+

MusicGen main model with convenient generation API.

+

Args

+
+
name : str
+
name of the model.
+
compression_model : CompressionModel
+
Compression model +used to map audio to invertible discrete representations.
+
lm : LMModel
+
Language model over discrete representations.
+
+
+ +Expand source code + +
class MusicGen:
+    """MusicGen main model with convenient generation API.
+
+    Args:
+        name (str): name of the model.
+        compression_model (CompressionModel): Compression model
+            used to map audio to invertible discrete representations.
+        lm (LMModel): Language model over discrete representations.
+    """
+    def __init__(self, name: str, compression_model: CompressionModel, lm: LMModel,
+                 max_duration: float = 30):
+        self.name = name
+        self.compression_model = compression_model
+        self.lm = lm
+        self.max_duration = max_duration
+        self.device = next(iter(lm.parameters())).device
+        self.generation_params: dict = {}
+        self.set_generation_params(duration=15)  # 15 seconds by default
+        self._progress_callback: tp.Optional[tp.Callable[[int, int], None]] = None
+        if self.device.type == 'cpu':
+            self.autocast = TorchAutocast(enabled=False)
+        else:
+            self.autocast = TorchAutocast(
+                enabled=True, device_type=self.device.type, dtype=torch.float16)
+
+    @property
+    def frame_rate(self) -> int:
+        """Roughly the number of AR steps per seconds."""
+        return self.compression_model.frame_rate
+
+    @property
+    def sample_rate(self) -> int:
+        """Sample rate of the generated audio."""
+        return self.compression_model.sample_rate
+
+    @property
+    def audio_channels(self) -> int:
+        """Audio channels of the generated audio."""
+        return self.compression_model.channels
+
+    @staticmethod
+    def get_pretrained(name: str = 'melody', device=None):
+        """Return pretrained model, we provide four models:
+        - small (300M), text to music, # see: https://huggingface.co/facebook/musicgen-small
+        - medium (1.5B), text to music, # see: https://huggingface.co/facebook/musicgen-medium
+        - melody (1.5B), text to music and text+melody to music, # see: https://huggingface.co/facebook/musicgen-melody
+        - large (3.3B), text to music, # see: https://huggingface.co/facebook/musicgen-large
+        """
+
+        if device is None:
+            if torch.cuda.device_count():
+                device = 'cuda'
+            else:
+                device = 'cpu'
+
+        if name == 'debug':
+            # used only for unit tests
+            compression_model = get_debug_compression_model(device)
+            lm = get_debug_lm_model(device)
+            return MusicGen(name, compression_model, lm)
+
+        if name not in HF_MODEL_CHECKPOINTS_MAP:
+            if not os.path.isfile(name) and not os.path.isdir(name):
+                raise ValueError(
+                    f"{name} is not a valid checkpoint name. "
+                    f"Choose one of {', '.join(HF_MODEL_CHECKPOINTS_MAP.keys())}"
+                )
+
+        cache_dir = os.environ.get('MUSICGEN_ROOT', None)
+        compression_model = load_compression_model(name, device=device, cache_dir=cache_dir)
+        lm = load_lm_model(name, device=device, cache_dir=cache_dir)
+        if name == 'melody':
+            lm.condition_provider.conditioners['self_wav'].match_len_on_eval = True
+
+        return MusicGen(name, compression_model, lm)
+
+    def set_generation_params(self, use_sampling: bool = True, top_k: int = 250,
+                              top_p: float = 0.0, temperature: float = 1.0,
+                              duration: float = 30.0, cfg_coef: float = 3.0,
+                              two_step_cfg: bool = False, extend_stride: float = 18):
+        """Set the generation parameters for MusicGen.
+
+        Args:
+            use_sampling (bool, optional): Use sampling if True, else do argmax decoding. Defaults to True.
+            top_k (int, optional): top_k used for sampling. Defaults to 250.
+            top_p (float, optional): top_p used for sampling; when set to 0, top_k is used. Defaults to 0.0.
+            temperature (float, optional): Softmax temperature parameter. Defaults to 1.0.
+            duration (float, optional): Duration of the generated waveform. Defaults to 30.0.
+            cfg_coef (float, optional): Coefficient used for classifier free guidance. Defaults to 3.0.
+            two_step_cfg (bool, optional): If True, performs 2 forward passes for Classifier Free Guidance,
+                instead of batching together the two. This has some impact on how things
+                are padded but seems to have little impact in practice.
+            extend_stride: when doing extended generation (i.e. more than 30 seconds), by how much
+                should we extend the audio each time. Larger values will mean less context is
+                preserved, and shorter values will require extra computation.
+        """
+        assert extend_stride < self.max_duration, "Cannot stride by more than max generation duration."
+        self.extend_stride = extend_stride
+        self.duration = duration
+        self.generation_params = {
+            'use_sampling': use_sampling,
+            'temp': temperature,
+            'top_k': top_k,
+            'top_p': top_p,
+            'cfg_coef': cfg_coef,
+            'two_step_cfg': two_step_cfg,
+        }
+
+    def set_custom_progress_callback(self, progress_callback: tp.Optional[tp.Callable[[int, int], None]] = None):
+        """Override the default progress callback."""
+        self._progress_callback = progress_callback
+
+    def generate_unconditional(self, num_samples: int, progress: bool = False) -> torch.Tensor:
+        """Generate samples in an unconditional manner.
+
+        Args:
+            num_samples (int): Number of samples to be generated.
+            progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
+        """
+        descriptions: tp.List[tp.Optional[str]] = [None] * num_samples
+        attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, None)
+        return self._generate_tokens(attributes, prompt_tokens, progress)
+
+    def generate(self, descriptions: tp.List[str], progress: bool = False) -> torch.Tensor:
+        """Generate samples conditioned on text.
+
+        Args:
+            descriptions (tp.List[str]): A list of strings used as text conditioning.
+            progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
+        """
+        attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, None)
+        assert prompt_tokens is None
+        return self._generate_tokens(attributes, prompt_tokens, progress)
+
+    def generate_with_chroma(self, descriptions: tp.List[str], melody_wavs: MelodyType,
+                             melody_sample_rate: int, progress: bool = False) -> torch.Tensor:
+        """Generate samples conditioned on text and melody.
+
+        Args:
+            descriptions (tp.List[str]): A list of strings used as text conditioning.
+            melody_wavs: (torch.Tensor or list of Tensor): A batch of waveforms used as
+                melody conditioning. Should have shape [B, C, T] with B matching the description length,
+                C=1 or 2. It can be [C, T] if there is a single description. It can also be
+                a list of [C, T] tensors.
+            melody_sample_rate: (int): Sample rate of the melody waveforms.
+            progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
+        """
+        if isinstance(melody_wavs, torch.Tensor):
+            if melody_wavs.dim() == 2:
+                melody_wavs = melody_wavs[None]
+            if melody_wavs.dim() != 3:
+                raise ValueError("Melody wavs should have a shape [B, C, T].")
+            melody_wavs = list(melody_wavs)
+        else:
+            for melody in melody_wavs:
+                if melody is not None:
+                    assert melody.dim() == 2, "One melody in the list has the wrong number of dims."
+
+        melody_wavs = [
+            convert_audio(wav, melody_sample_rate, self.sample_rate, self.audio_channels)
+            if wav is not None else None
+            for wav in melody_wavs]
+        attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions=descriptions, prompt=None,
+                                                                        melody_wavs=melody_wavs)
+        assert prompt_tokens is None
+        return self._generate_tokens(attributes, prompt_tokens, progress)
+
+    def generate_continuation(self, prompt: torch.Tensor, prompt_sample_rate: int,
+                              descriptions: tp.Optional[tp.List[tp.Optional[str]]] = None,
+                              progress: bool = False) -> torch.Tensor:
+        """Generate samples conditioned on audio prompts.
+
+        Args:
+            prompt (torch.Tensor): A batch of waveforms used for continuation.
+                Prompt should be [B, C, T], or [C, T] if only one sample is generated.
+            prompt_sample_rate (int): Sampling rate of the given audio waveforms.
+            descriptions (tp.List[str], optional): A list of strings used as text conditioning. Defaults to None.
+            progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
+        """
+        if prompt.dim() == 2:
+            prompt = prompt[None]
+        if prompt.dim() != 3:
+            raise ValueError("prompt should have 3 dimensions: [B, C, T] (C = 1).")
+        prompt = convert_audio(prompt, prompt_sample_rate, self.sample_rate, self.audio_channels)
+        if descriptions is None:
+            descriptions = [None] * len(prompt)
+        attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, prompt)
+        assert prompt_tokens is not None
+        return self._generate_tokens(attributes, prompt_tokens, progress)
+
+    @torch.no_grad()
+    def _prepare_tokens_and_attributes(
+            self,
+            descriptions: tp.Sequence[tp.Optional[str]],
+            prompt: tp.Optional[torch.Tensor],
+            melody_wavs: tp.Optional[MelodyList] = None,
+    ) -> tp.Tuple[tp.List[ConditioningAttributes], tp.Optional[torch.Tensor]]:
+        """Prepare model inputs.
+
+        Args:
+            descriptions (tp.List[str]): A list of strings used as text conditioning.
+            prompt (torch.Tensor): A batch of waveforms used for continuation.
+            melody_wavs (tp.Optional[torch.Tensor], optional): A batch of waveforms
+                used as melody conditioning. Defaults to None.
+        """
+        attributes = [
+            ConditioningAttributes(text={'description': description})
+            for description in descriptions]
+
+        if melody_wavs is None:
+            for attr in attributes:
+                attr.wav['self_wav'] = WavCondition(
+                    torch.zeros((1, 1), device=self.device),
+                    torch.tensor([0], device=self.device),
+                    path='null_wav')  # type: ignore
+        else:
+            if self.name != "melody":
+                raise RuntimeError("This model doesn't support melody conditioning. "
+                                   "Use the `melody` model.")
+            assert len(melody_wavs) == len(descriptions), \
+                f"number of melody wavs must match number of descriptions! " \
+                f"got melody len={len(melody_wavs)}, and descriptions len={len(descriptions)}"
+            for attr, melody in zip(attributes, melody_wavs):
+                if melody is None:
+                    attr.wav['self_wav'] = WavCondition(
+                        torch.zeros((1, 1), device=self.device),
+                        torch.tensor([0], device=self.device),
+                        path='null_wav')  # type: ignore
+                else:
+                    attr.wav['self_wav'] = WavCondition(
+                        melody.to(device=self.device),
+                        torch.tensor([melody.shape[-1]], device=self.device))
+
+        if prompt is not None:
+            if descriptions is not None:
+                assert len(descriptions) == len(prompt), "Prompt and nb. descriptions don't match"
+            prompt = prompt.to(self.device)
+            prompt_tokens, scale = self.compression_model.encode(prompt)
+            assert scale is None
+        else:
+            prompt_tokens = None
+        return attributes, prompt_tokens
+
+    def _generate_tokens(self, attributes: tp.List[ConditioningAttributes],
+                         prompt_tokens: tp.Optional[torch.Tensor], progress: bool = False) -> torch.Tensor:
+        """Generate discrete audio tokens given audio prompt and/or conditions.
+
+        Args:
+            attributes (tp.List[ConditioningAttributes]): Conditions used for generation (text/melody).
+            prompt_tokens (tp.Optional[torch.Tensor]): Audio prompt used for continuation.
+            progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
+        Returns:
+            torch.Tensor: Generated audio, of shape [B, C, T], T is defined by the generation params.
+        """
+        total_gen_len = int(self.duration * self.frame_rate)
+        max_prompt_len = int(min(self.duration, self.max_duration) * self.frame_rate)
+        current_gen_offset: int = 0
+
+        def _progress_callback(generated_tokens: int, tokens_to_generate: int):
+            generated_tokens += current_gen_offset
+            if self._progress_callback is not None:
+                # Note that total_gen_len might be quite wrong depending on the
+                # codebook pattern used, but with delay it is almost accurate.
+                self._progress_callback(generated_tokens, total_gen_len)
+            else:
+                print(f'{generated_tokens: 6d} / {total_gen_len: 6d}', end='\r')
+
+        if prompt_tokens is not None:
+            assert max_prompt_len >= prompt_tokens.shape[-1], \
+                "Prompt is longer than audio to generate"
+
+        callback = None
+        if progress:
+            callback = _progress_callback
+
+        if self.duration <= self.max_duration:
+            # generate by sampling from LM, simple case.
+            with self.autocast:
+                gen_tokens = self.lm.generate(
+                    prompt_tokens, attributes,
+                    callback=callback, max_gen_len=total_gen_len, **self.generation_params)
+
+        else:
+            # now this gets a bit messier, we need to handle prompts,
+            # melody conditioning etc.
+            ref_wavs = [attr.wav['self_wav'] for attr in attributes]
+            all_tokens = []
+            if prompt_tokens is None:
+                prompt_length = 0
+            else:
+                all_tokens.append(prompt_tokens)
+                prompt_length = prompt_tokens.shape[-1]
+
+            stride_tokens = int(self.frame_rate * self.extend_stride)
+
+            while current_gen_offset + prompt_length < total_gen_len:
+                time_offset = current_gen_offset / self.frame_rate
+                chunk_duration = min(self.duration - time_offset, self.max_duration)
+                max_gen_len = int(chunk_duration * self.frame_rate)
+                for attr, ref_wav in zip(attributes, ref_wavs):
+                    wav_length = ref_wav.length.item()
+                    if wav_length == 0:
+                        continue
+                    # We will extend the wav periodically if it is not long enough.
+                    # we have to do it here rather than in conditioners.py as otherwise
+                    # we wouldn't have the full wav.
+                    initial_position = int(time_offset * self.sample_rate)
+                    wav_target_length = int(self.max_duration * self.sample_rate)
+                    positions = torch.arange(initial_position,
+                                             initial_position + wav_target_length, device=self.device)
+                    attr.wav['self_wav'] = WavCondition(
+                        ref_wav[0][:, positions % wav_length],
+                        torch.full_like(ref_wav[1], wav_target_length))
+                with self.autocast:
+                    gen_tokens = self.lm.generate(
+                        prompt_tokens, attributes,
+                        callback=callback, max_gen_len=max_gen_len, **self.generation_params)
+                if prompt_tokens is None:
+                    all_tokens.append(gen_tokens)
+                else:
+                    all_tokens.append(gen_tokens[:, :, prompt_tokens.shape[-1]:])
+                prompt_tokens = gen_tokens[:, :, stride_tokens:]
+                prompt_length = prompt_tokens.shape[-1]
+                current_gen_offset += stride_tokens
+
+            gen_tokens = torch.cat(all_tokens, dim=-1)
+
+        # generate audio
+        assert gen_tokens.dim() == 3
+        with torch.no_grad():
+            gen_audio = self.compression_model.decode(gen_tokens, None)
+        return gen_audio
+
+

Static methods

+
+
+def get_pretrained(name: str = 'melody', device=None) +
+
+

Return a pretrained model. We provide four models: +- small (300M), text to music, # see: https://huggingface.co/facebook/musicgen-small +- medium (1.5B), text to music, # see: https://huggingface.co/facebook/musicgen-medium +- melody (1.5B), text to music and text+melody to music, # see: https://huggingface.co/facebook/musicgen-melody +- large (3.3B), text to music, # see: https://huggingface.co/facebook/musicgen-large

+
+ +Expand source code + +
@staticmethod
+def get_pretrained(name: str = 'melody', device=None):
+    """Return pretrained model, we provide four models:
+    - small (300M), text to music, # see: https://huggingface.co/facebook/musicgen-small
+    - medium (1.5B), text to music, # see: https://huggingface.co/facebook/musicgen-medium
+    - melody (1.5B), text to music and text+melody to music, # see: https://huggingface.co/facebook/musicgen-melody
+    - large (3.3B), text to music, # see: https://huggingface.co/facebook/musicgen-large
+    """
+
+    if device is None:
+        if torch.cuda.device_count():
+            device = 'cuda'
+        else:
+            device = 'cpu'
+
+    if name == 'debug':
+        # used only for unit tests
+        compression_model = get_debug_compression_model(device)
+        lm = get_debug_lm_model(device)
+        return MusicGen(name, compression_model, lm)
+
+    if name not in HF_MODEL_CHECKPOINTS_MAP:
+        if not os.path.isfile(name) and not os.path.isdir(name):
+            raise ValueError(
+                f"{name} is not a valid checkpoint name. "
+                f"Choose one of {', '.join(HF_MODEL_CHECKPOINTS_MAP.keys())}"
+            )
+
+    cache_dir = os.environ.get('MUSICGEN_ROOT', None)
+    compression_model = load_compression_model(name, device=device, cache_dir=cache_dir)
+    lm = load_lm_model(name, device=device, cache_dir=cache_dir)
+    if name == 'melody':
+        lm.condition_provider.conditioners['self_wav'].match_len_on_eval = True
+
+    return MusicGen(name, compression_model, lm)
+
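A minimal sketch of typical use (downloading the checkpoint requires network access; 'small' is the lightest of the four released models).

from audiocraft.models.musicgen import MusicGen

model = MusicGen.get_pretrained('small', device='cpu')
print(model.sample_rate, model.frame_rate, model.audio_channels)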
+
+
+

Instance variables

+
+
var audio_channels : int
+
+

Audio channels of the generated audio.

+
+ +Expand source code + +
@property
+def audio_channels(self) -> int:
+    """Audio channels of the generated audio."""
+    return self.compression_model.channels
+
+
+
var frame_rate : int
+
+

Roughly the number of AR steps per second.

+
+ +Expand source code + +
@property
+def frame_rate(self) -> int:
+    """Roughly the number of AR steps per seconds."""
+    return self.compression_model.frame_rate
+
+
+
var sample_rate : int
+
+

Sample rate of the generated audio.

+
+ +Expand source code + +
@property
+def sample_rate(self) -> int:
+    """Sample rate of the generated audio."""
+    return self.compression_model.sample_rate
+
+
+
+

Methods

+
+
+def generate(self, descriptions: List[str], progress: bool = False) ‑> torch.Tensor +
+
+

Generate samples conditioned on text.

+

Args

+
+
descriptions : tp.List[str]
+
A list of strings used as text conditioning.
+
progress : bool, optional
+
Flag to display progress of the generation process. Defaults to False.
+
+
+ +Expand source code + +
def generate(self, descriptions: tp.List[str], progress: bool = False) -> torch.Tensor:
+    """Generate samples conditioned on text.
+
+    Args:
+        descriptions (tp.List[str]): A list of strings used as text conditioning.
+        progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
+    """
+    attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, None)
+    assert prompt_tokens is None
+    return self._generate_tokens(attributes, prompt_tokens, progress)
+
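Sketch of text-conditioned generation, assuming model is a MusicGen instance returned by get_pretrained as above; the description string is purely illustrative.

model.set_generation_params(duration=8)                    # 8 seconds per sample
wav = model.generate(['80s synth-pop with a driving bass line'], progress=True)
print(wav.shape)                                           # [B, C, T] at model.sample_rate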
+
+
+def generate_continuation(self, prompt: torch.Tensor, prompt_sample_rate: int, descriptions: Optional[List[Optional[str]]] = None, progress: bool = False) ‑> torch.Tensor +
+
+

Generate samples conditioned on audio prompts.

+

Args

+
+
prompt : torch.Tensor
+
A batch of waveforms used for continuation. +Prompt should be [B, C, T], or [C, T] if only one sample is generated.
+
prompt_sample_rate : int
+
Sampling rate of the given audio waveforms.
+
descriptions : tp.List[str], optional
+
A list of strings used as text conditioning. Defaults to None.
+
progress : bool, optional
+
Flag to display progress of the generation process. Defaults to False.
+
+
+ +Expand source code + +
def generate_continuation(self, prompt: torch.Tensor, prompt_sample_rate: int,
+                          descriptions: tp.Optional[tp.List[tp.Optional[str]]] = None,
+                          progress: bool = False) -> torch.Tensor:
+    """Generate samples conditioned on audio prompts.
+
+    Args:
+        prompt (torch.Tensor): A batch of waveforms used for continuation.
+            Prompt should be [B, C, T], or [C, T] if only one sample is generated.
+        prompt_sample_rate (int): Sampling rate of the given audio waveforms.
+        descriptions (tp.List[str], optional): A list of strings used as text conditioning. Defaults to None.
+        progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
+    """
+    if prompt.dim() == 2:
+        prompt = prompt[None]
+    if prompt.dim() != 3:
+        raise ValueError("prompt should have 3 dimensions: [B, C, T] (C = 1).")
+    prompt = convert_audio(prompt, prompt_sample_rate, self.sample_rate, self.audio_channels)
+    if descriptions is None:
+        descriptions = [None] * len(prompt)
+    attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, prompt)
+    assert prompt_tokens is not None
+    return self._generate_tokens(attributes, prompt_tokens, progress)
+
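Sketch of audio continuation, under the same assumption about model; the prompt here is two seconds of random noise purely for illustration (it is resampled internally, so any prompt sample rate works) and must stay shorter than the configured generation duration.

import torch

prompt_sr = 32000                                 # hypothetical prompt sample rate
prompt = torch.randn(1, 1, 2 * prompt_sr)         # [B, C, T]: one 2-second mono prompt
wav = model.generate_continuation(prompt, prompt_sample_rate=prompt_sr,
                                  descriptions=['calm piano outro'])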
+
+
+def generate_unconditional(self, num_samples: int, progress: bool = False) ‑> torch.Tensor +
+
+

Generate samples in an unconditional manner.

+

Args

+
+
num_samples : int
+
Number of samples to be generated.
+
progress : bool, optional
+
Flag to display progress of the generation process. Defaults to False.
+
+
+ +Expand source code + +
def generate_unconditional(self, num_samples: int, progress: bool = False) -> torch.Tensor:
+    """Generate samples in an unconditional manner.
+
+    Args:
+        num_samples (int): Number of samples to be generated.
+        progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
+    """
+    descriptions: tp.List[tp.Optional[str]] = [None] * num_samples
+    attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions, None)
+    return self._generate_tokens(attributes, prompt_tokens, progress)
+
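Sketch, same assumption about model: draw two samples with no conditioning at all, using whatever generation parameters are currently set.

wavs = model.generate_unconditional(num_samples=2, progress=True)   # shape [2, C, T]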
+
+
+def generate_with_chroma(self, descriptions: List[str], melody_wavs: Union[torch.Tensor, List[Optional[torch.Tensor]]], melody_sample_rate: int, progress: bool = False) ‑> torch.Tensor +
+
+

Generate samples conditioned on text and melody.

+

Args

+
+
descriptions : tp.List[str]
+
A list of strings used as text conditioning.
+
melody_wavs
+
(torch.Tensor or list of Tensor): A batch of waveforms used as +melody conditioning. Should have shape [B, C, T] with B matching the description length, +C=1 or 2. It can be [C, T] if there is a single description. It can also be +a list of [C, T] tensors.
+
melody_sample_rate
+
(int): Sample rate of the melody waveforms.
+
progress : bool, optional
+
Flag to display progress of the generation process. Defaults to False.
+
+
+ +Expand source code + +
def generate_with_chroma(self, descriptions: tp.List[str], melody_wavs: MelodyType,
+                         melody_sample_rate: int, progress: bool = False) -> torch.Tensor:
+    """Generate samples conditioned on text and melody.
+
+    Args:
+        descriptions (tp.List[str]): A list of strings used as text conditioning.
+        melody_wavs: (torch.Tensor or list of Tensor): A batch of waveforms used as
+            melody conditioning. Should have shape [B, C, T] with B matching the description length,
+            C=1 or 2. It can be [C, T] if there is a single description. It can also be
+            a list of [C, T] tensors.
+        melody_sample_rate: (int): Sample rate of the melody waveforms.
+        progress (bool, optional): Flag to display progress of the generation process. Defaults to False.
+    """
+    if isinstance(melody_wavs, torch.Tensor):
+        if melody_wavs.dim() == 2:
+            melody_wavs = melody_wavs[None]
+        if melody_wavs.dim() != 3:
+            raise ValueError("Melody wavs should have a shape [B, C, T].")
+        melody_wavs = list(melody_wavs)
+    else:
+        for melody in melody_wavs:
+            if melody is not None:
+                assert melody.dim() == 2, "One melody in the list has the wrong number of dims."
+
+    melody_wavs = [
+        convert_audio(wav, melody_sample_rate, self.sample_rate, self.audio_channels)
+        if wav is not None else None
+        for wav in melody_wavs]
+    attributes, prompt_tokens = self._prepare_tokens_and_attributes(descriptions=descriptions, prompt=None,
+                                                                    melody_wavs=melody_wavs)
+    assert prompt_tokens is None
+    return self._generate_tokens(attributes, prompt_tokens, progress)
+
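Sketch of melody conditioning; this only works with the 'melody' checkpoint, and 'melody.wav' is a placeholder path for any reference recording.

import torchaudio
from audiocraft.models.musicgen import MusicGen

melody_model = MusicGen.get_pretrained('melody')
melody, sr = torchaudio.load('melody.wav')        # [C, T] reference waveform
wav = melody_model.generate_with_chroma(['upbeat folk tune'], melody, sr)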
+
+
+def set_custom_progress_callback(self, progress_callback: Optional[Callable[[int, int], None]] = None) +
+
+

Override the default progress callback.

+
+ +Expand source code + +
def set_custom_progress_callback(self, progress_callback: tp.Optional[tp.Callable[[int, int], None]] = None):
+    """Override the default progress callback."""
+    self._progress_callback = progress_callback
+
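Sketch: replace the default console counter with a percentage readout, assuming model as above. The callback receives the number of tokens generated so far and the total to generate, and it is only invoked when progress=True is passed to the generate* methods.

def on_progress(generated: int, total: int) -> None:
    print(f'{100 * generated / total:5.1f}%', end='\r')

model.set_custom_progress_callback(on_progress)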
+
+
+def set_generation_params(self, use_sampling: bool = True, top_k: int = 250, top_p: float = 0.0, temperature: float = 1.0, duration: float = 30.0, cfg_coef: float = 3.0, two_step_cfg: bool = False, extend_stride: float = 18) +
+
+

Set the generation parameters for MusicGen.

+

Args

+
+
use_sampling : bool, optional
+
Use sampling if True, else do argmax decoding. Defaults to True.
+
top_k : int, optional
+
top_k used for sampling. Defaults to 250.
+
top_p : float, optional
+
top_p used for sampling; when set to 0, top_k is used. Defaults to 0.0.
+
temperature : float, optional
+
Softmax temperature parameter. Defaults to 1.0.
+
duration : float, optional
+
Duration of the generated waveform. Defaults to 30.0.
+
cfg_coef : float, optional
+
Coefficient used for classifier free guidance. Defaults to 3.0.
+
two_step_cfg : bool, optional
+
If True, performs 2 forward passes for Classifier Free Guidance, +instead of batching together the two. This has some impact on how things +are padded but seems to have little impact in practice.
+
extend_stride
+
when doing extended generation (i.e. more than 30 seconds), by how much +should we extend the audio each time. Larger values will mean less context is +preserved, and shorter values will require extra computation.
+
+
+ +Expand source code + +
def set_generation_params(self, use_sampling: bool = True, top_k: int = 250,
+                          top_p: float = 0.0, temperature: float = 1.0,
+                          duration: float = 30.0, cfg_coef: float = 3.0,
+                          two_step_cfg: bool = False, extend_stride: float = 18):
+    """Set the generation parameters for MusicGen.
+
+    Args:
+        use_sampling (bool, optional): Use sampling if True, else do argmax decoding. Defaults to True.
+        top_k (int, optional): top_k used for sampling. Defaults to 250.
+        top_p (float, optional): top_p used for sampling; when set to 0, top_k is used. Defaults to 0.0.
+        temperature (float, optional): Softmax temperature parameter. Defaults to 1.0.
+        duration (float, optional): Duration of the generated waveform. Defaults to 30.0.
+        cfg_coef (float, optional): Coefficient used for classifier free guidance. Defaults to 3.0.
+        two_step_cfg (bool, optional): If True, performs 2 forward passes for Classifier Free Guidance,
+            instead of batching together the two. This has some impact on how things
+            are padded but seems to have little impact in practice.
+        extend_stride: when doing extended generation (i.e. more than 30 seconds), by how much
+            should we extend the audio each time. Larger values will mean less context is
+            preserved, and shorter values will require extra computation.
+    """
+    assert extend_stride < self.max_duration, "Cannot stride by more than max generation duration."
+    self.extend_stride = extend_stride
+    self.duration = duration
+    self.generation_params = {
+        'use_sampling': use_sampling,
+        'temp': temperature,
+        'top_k': top_k,
+        'top_p': top_p,
+        'cfg_coef': cfg_coef,
+        'two_step_cfg': two_step_cfg,
+    }
+
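Sketch, same assumption about model: switch to nucleus sampling for 10-second clips with a milder guidance coefficient; per the docstring above, a non-zero top_p takes precedence over top_k.

model.set_generation_params(duration=10, use_sampling=True,
                            top_p=0.9, temperature=1.0, cfg_coef=2.0)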
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/modules/activations.html b/docs/audiocraft/modules/activations.html new file mode 100644 index 00000000..08efaf8a --- /dev/null +++ b/docs/audiocraft/modules/activations.html @@ -0,0 +1,523 @@ + + + + + + +audiocraft.modules.activations API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.modules.activations

+
+
+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+from typing import Union, Callable
+
+
+class CustomGLU(nn.Module):
+    """Custom Gated Linear Unit activation.
+    Applies a modified gated linear unit :math:`a * f(b)` where :math:`a` is the first half
+    of the input matrices, :math:`b` is the second half, and :math:`f` is a provided activation
+    function (i.e. sigmoid, swish, etc.).
+
+    Args:
+        activation (nn.Module): The custom activation to apply in the Gated Linear Unit
+        dim (int): the dimension on which to split the input. Default: -1
+
+    Shape:
+        - Input: :math:`(\ast_1, N, \ast_2)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(\ast_1, M, \ast_2)` where :math:`M=N/2`
+
+    Examples::
+        >>> m = CustomGLU(nn.Sigmoid())
+        >>> input = torch.randn(4, 2)
+        >>> output = m(input)
+    """
+    def __init__(self, activation: nn.Module, dim: int = -1):
+        super(CustomGLU, self).__init__()
+        self.dim = dim
+        self.activation = activation
+
+    def forward(self, x: Tensor):
+        assert x.shape[self.dim] % 2 == 0  # M = N / 2
+        a, b = torch.chunk(x, 2, dim=self.dim)
+        return a * self.activation(b)
+
+
+class SwiGLU(CustomGLU):
+    """SiLU Gated Linear Unit activation.
+    Applies SiLU Gated Linear Unit :math:`a * SiLU(b)` where :math:`a` is
+    the first half of the input matrices, :math:`b` is the second half.
+
+    Args:
+        dim (int): the dimension on which to split the input. Default: -1
+    """
+    def __init__(self, dim: int = -1):
+        super(SwiGLU, self).__init__(nn.SiLU(), dim)
+
+
+class GeGLU(CustomGLU):
+    """GeLU Gated Linear Unit activation.
+    Applies GeLU Gated Linear Unit :math:`a * GELU(b)` where :math:`a` is
+    the first half of the input matrices, :math:`b` is the second half.
+
+    Args:
+        dim (int): the dimension on which to split the input. Default: -1
+    """
+    def __init__(self, dim: int = -1):
+        super(GeGLU, self).__init__(nn.GELU(), dim)
+
+
+class ReGLU(CustomGLU):
+    """ReLU Gated Linear Unit activation.
+    Applies ReLU Gated Linear Unit :math:`a * ReLU(b)` where :math:`a` is
+    the first half of the input matrices, :math:`b` is the second half.
+
+    Args:
+        dim (int): the dimension on which to split the input. Default: -1
+    """
+    def __init__(self, dim: int = -1):
+        super(ReGLU, self).__init__(nn.ReLU(), dim)
+
+
+def get_activation_fn(
+    activation: Union[str, Callable[[Tensor], Tensor]]
+) -> Union[str, Callable[[Tensor], Tensor]]:
+    """Helper function to map an activation string to the activation class.
+    If the supplied activation is not a string that is recognized, the activation is passed back.
+
+    Args:
+        activation (Union[str, Callable[[Tensor], Tensor]]): Activation to check
+    """
+    if isinstance(activation, str):
+        if activation == "reglu":
+            return ReGLU()
+        elif activation == "geglu":
+            return GeGLU()
+        elif activation == "swiglu":
+            return SwiGLU()
+    return activation
+
+
+
+
+
+
+
+

Functions

+
+
+def get_activation_fn(activation: Union[str, Callable[[torch.Tensor], torch.Tensor]]) ‑> Union[str, Callable[[torch.Tensor], torch.Tensor]] +
+
+

Helper function to map an activation string to the activation class. +If the supplied activation is not a string that is recognized, the activation is passed back.

+

Args

+
+
activation : Union[str, Callable[[Tensor], Tensor]]
+
Activation to check
+
+
+ +Expand source code + +
def get_activation_fn(
+    activation: Union[str, Callable[[Tensor], Tensor]]
+) -> Union[str, Callable[[Tensor], Tensor]]:
+    """Helper function to map an activation string to the activation class.
+    If the supplied activation is not a string that is recognized, the activation is passed back.
+
+    Args:
+        activation (Union[str, Callable[[Tensor], Tensor]]): Activation to check
+    """
+    if isinstance(activation, str):
+        if activation == "reglu":
+            return ReGLU()
+        elif activation == "geglu":
+            return GeGLU()
+        elif activation == "swiglu":
+            return SwiGLU()
+    return activation
+
+
+
+
+
+

Classes

+
+
+class CustomGLU +(activation: torch.nn.modules.module.Module, dim: int = -1) +
+
+

Custom Gated Linear Unit activation. +Applies a modified gated linear unit :math:a * f(b) where :math:a is the first half +of the input matrices, :math:b is the second half, and :math:f is a provided activation +function (i.e. sigmoid, swish, etc.).

+

Args

+
+
activation : nn.Module
+
The custom activation to apply in the Gated Linear Unit
+
dim : int
+
the dimension on which to split the input. Default: -1
+
+

Shape

+
    +
  • Input: :math:(\ast_1, N, \ast_2) where * means, any number of additional dimensions
  • Output: :math:(\ast_1, M, \ast_2) where :math:M=N/2
+

Examples:: +>>> m = CustomGLU(nn.Sigmoid()) +>>> input = torch.randn(4, 2) +>>> output = m(input)

+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class CustomGLU(nn.Module):
+    """Custom Gated Linear Unit activation.
+    Applies a modified gated linear unit :math:`a * f(b)` where :math:`a` is the first half
+    of the input matrices, :math:`b` is the second half, and :math:`f` is a provided activation
+    function (i.e. sigmoid, swish, etc.).
+
+    Args:
+        activation (nn.Module): The custom activation to apply in the Gated Linear Unit
+        dim (int): the dimension on which to split the input. Default: -1
+
+    Shape:
+        - Input: :math:`(\ast_1, N, \ast_2)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(\ast_1, M, \ast_2)` where :math:`M=N/2`
+
+    Examples::
+        >>> m = CustomGLU(nn.Sigmoid())
+        >>> input = torch.randn(4, 2)
+        >>> output = m(input)
+    """
+    def __init__(self, activation: nn.Module, dim: int = -1):
+        super(CustomGLU, self).__init__()
+        self.dim = dim
+        self.activation = activation
+
+    def forward(self, x: Tensor):
+        assert x.shape[self.dim] % 2 == 0  # M = N / 2
+        a, b = torch.chunk(x, 2, dim=self.dim)
+        return a * self.activation(b)
+
+
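+# Quick shape sketch (relies on the imports and ``CustomGLU`` defined above): the split
+# happens along ``dim`` and that dimension is halved in the output.
+glu = CustomGLU(nn.Tanh(), dim=1)
+inp = torch.randn(2, 10, 5)      # dim=1 has size 10, which must be even
+out = glu(inp)
+assert out.shape == (2, 5, 5)    # a * tanh(b) with a, b of shape (2, 5, 5)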

Ancestors

+
    +
  • torch.nn.modules.module.Module
  • +
+

Subclasses

+ +

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def forward(self, x: torch.Tensor) ‑> torch.Tensor +
+
+

Defines the computation performed at every call.

+

Should be overridden by all subclasses.

+
+

Note

+

Although the recipe for forward pass needs to be defined within +this function, one should call the :class:Module instance afterwards +instead of this since the former takes care of running the +registered hooks while the latter silently ignores them.

+
+
+ +Expand source code + +
def forward(self, x: Tensor):
+    assert x.shape[self.dim] % 2 == 0  # M = N / 2
+    a, b = torch.chunk(x, 2, dim=self.dim)
+    return a * self.activation(b)
+
+
+
+
+
+class GeGLU +(dim: int = -1) +
+
+

GeLU Gated Linear Unit activation. +Applies GeLU Gated Linear Unit :math:a * GELU(b) where :math:a is +the first half of the input matrices, :math:b is the second half.

+

Args

+
+
dim : int
+
the dimension on which to split the input. Default: -1
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class GeGLU(CustomGLU):
+    """GeLU Gated Linear Unit activation.
+    Applies GeLU Gated Linear Unit :math:`a * GELU(b)` where :math:`a` is
+    the first half of the input matrices, :math:`b` is the second half.
+
+    Args:
+        dim (int): the dimension on which to split the input. Default: -1
+    """
+    def __init__(self, dim: int = -1):
+        super(GeGLU, self).__init__(nn.GELU(), dim)
+
+

Ancestors

+
    +
  • CustomGLU
  • +
  • torch.nn.modules.module.Module
  • +
+

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Inherited members

+ +
+
+class ReGLU +(dim: int = -1) +
+
+

ReLU Gated Linear Unit activation. +Applies ReLU Gated Linear Unit :math:a * ReLU(b) where :math:a is +the first half of the input matrices, :math:b is the second half.

+

Args

+
+
dim : int
+
the dimension on which to split the input. Default: -1
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class ReGLU(CustomGLU):
+    """ReLU Gated Linear Unit activation.
+    Applies ReLU Gated Linear Unit :math:`a * ReLU(b)` where :math:`a` is
+    the first half of the input matrices, :math:`b` is the second half.
+
+    Args:
+        dim (int): the dimension on which to split the input. Default: -1
+    """
+    def __init__(self, dim: int = -1):
+        super(ReGLU, self).__init__(nn.ReLU(), dim)
+
+

Ancestors

+
    +
  • CustomGLU
  • +
  • torch.nn.modules.module.Module
  • +
+

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Inherited members

+ +
+
+class SwiGLU +(dim: int = -1) +
+
+

SiLU Gated Linear Unit activation. +Applies SiLU Gated Linear Unit :math:a * SiLU(b) where :math:a is +the first half of the input matrices, :math:b is the second half.

+

Args

+
+
dim : int
+
the dimension on which to split the input. Default: -1
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class SwiGLU(CustomGLU):
+    """SiLU Gated Linear Unit activation.
+    Applies SiLU Gated Linear Unit :math:`a * SiLU(b)` where :math:`a` is
+    the first half of the input matrices, :math:`b` is the second half.
+
+    Args:
+        dim (int): the dimension on which to split the input. Default: -1
+    """
+    def __init__(self, dim: int = -1):
+        super(SwiGLU, self).__init__(nn.SiLU(), dim)
+
+

Ancestors

+
    +
  • CustomGLU
  • +
  • torch.nn.modules.module.Module
  • +
+

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Inherited members

+ +
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/modules/codebooks_patterns.html b/docs/audiocraft/modules/codebooks_patterns.html new file mode 100644 index 00000000..d3339e1b --- /dev/null +++ b/docs/audiocraft/modules/codebooks_patterns.html @@ -0,0 +1,1818 @@ + + + + + + +audiocraft.modules.codebooks_patterns API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.modules.codebooks_patterns

+
+
+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import namedtuple
+from dataclasses import dataclass
+from functools import lru_cache
+import logging
+import typing as tp
+
+from abc import ABC, abstractmethod
+import torch
+
+LayoutCoord = namedtuple('LayoutCoord', ['t', 'q'])  # (timestep, codebook index)
+PatternLayout = tp.List[tp.List[LayoutCoord]]  # Sequence of coordinates
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class Pattern:
+    """Base implementation of a pattern over a sequence with multiple codebooks.
+
+    The codebook pattern consists of a layout, defining for each sequence step
+    the list of coordinates of each codebook timestep in the resulting interleaved sequence.
+    The first item of the pattern is always an empty list in order to properly insert a special token
+    to start with. For convenience, we also keep track of ``n_q`` the number of codebooks used for the pattern
+    and ``timesteps`` the number of timesteps corresponding to the original sequence.
+
+    The pattern provides convenient methods to build and revert interleaved sequences from it:
+    ``build_pattern_sequence`` maps a given dense input tensor of a multi-codebook sequence from [B, K, T]
+        to the interleaved sequence of shape [B, K, S] applying the pattern, with B being the batch size,
+        K being the number of codebooks, T the number of original timesteps and S the number of sequence steps
+        for the output sequence. The unfilled positions are replaced with a special token and the built sequence
+        is returned along with a mask indicating valid tokens.
+    ``revert_pattern_sequence`` maps back an interleaved sequence of shape [B, K, S] to the original alignment
+        of codebooks across timesteps to an output tensor of shape [B, K, T], using again a special token and a mask
+        to fill and specify invalid positions if needed.
+    See the dedicated methods for more details.
+    """
+    # Pattern layout, for each sequence step, we have a list of coordinates
+    # corresponding to the original codebook timestep and position.
+    # The first list is always an empty list in order to properly insert
+    # a special token to start with.
+    layout: PatternLayout
+    timesteps: int
+    n_q: int
+
+    def __post_init__(self):
+        assert len(self.layout) > 0
+        assert self.layout[0] == []
+        self._validate_layout()
+        self._build_reverted_sequence_scatter_indexes = lru_cache(100)(self._build_reverted_sequence_scatter_indexes)
+        self._build_pattern_sequence_scatter_indexes = lru_cache(100)(self._build_pattern_sequence_scatter_indexes)
+        logger.info("New pattern, time steps: %d, sequence steps: %d", self.timesteps, len(self.layout))
+
+    def _validate_layout(self):
+        """Runs checks on the layout to ensure a valid pattern is defined.
+        A pattern is considered invalid if:
+            - Multiple timesteps for the same codebook are defined in the same sequence step
+            - The timesteps for a given codebook are not in ascending order as we advance in the sequence
+              (this would mean that we have future timesteps before past timesteps).
+        """
+        q_timesteps = {q: 0 for q in range(self.n_q)}
+        for s, seq_coords in enumerate(self.layout):
+            if len(seq_coords) > 0:
+                qs = set()
+                for coord in seq_coords:
+                    qs.add(coord.q)
+                    last_q_timestep = q_timesteps[coord.q]
+                    assert coord.t >= last_q_timestep, \
+                        f"Past timesteps are found in the sequence for codebook = {coord.q} at step {s}"
+                    q_timesteps[coord.q] = coord.t
+                # each sequence step contains at max 1 coordinate per codebook
+                assert len(qs) == len(seq_coords), \
+                    f"Multiple entries for a same codebook are found at step {s}"
+
+    @property
+    def num_sequence_steps(self):
+        return len(self.layout) - 1
+
+    @property
+    def max_delay(self):
+        max_t_in_seq_coords = 0
+        for seq_coords in self.layout[1:]:
+            for coords in seq_coords:
+                max_t_in_seq_coords = max(max_t_in_seq_coords, coords.t + 1)
+        return max_t_in_seq_coords - self.timesteps
+
+    @property
+    def valid_layout(self):
+        valid_step = len(self.layout) - self.max_delay
+        return self.layout[:valid_step]
+
+    def get_sequence_coords_with_timestep(self, t: int, q: tp.Optional[int] = None):
+        """Get codebook coordinates in the layout that corresponds to the specified timestep t
+        and optionally to the codebook q. Coordinates are returned as a tuple with the sequence step
+        and the actual codebook coordinates.
+        """
+        assert t <= self.timesteps, "provided timesteps is greater than the pattern's number of timesteps"
+        if q is not None:
+            assert q <= self.n_q, "provided number of codebooks is greater than the pattern's number of codebooks"
+        coords = []
+        for s, seq_codes in enumerate(self.layout):
+            for code in seq_codes:
+                if code.t == t and (q is None or code.q == q):
+                    coords.append((s, code))
+        return coords
+
+    def get_steps_with_timestep(self, t: int, q: tp.Optional[int] = None) -> tp.List[int]:
+        return [step for step, coords in self.get_sequence_coords_with_timestep(t, q)]
+
+    def get_first_step_with_timesteps(self, t: int, q: tp.Optional[int] = None) -> tp.Optional[int]:
+        steps_with_timesteps = self.get_steps_with_timestep(t, q)
+        return steps_with_timesteps[0] if len(steps_with_timesteps) > 0 else None
+
+    def _build_pattern_sequence_scatter_indexes(self, timesteps: int, n_q: int, keep_only_valid_steps: bool,
+                                                device: tp.Union[torch.device, str] = 'cpu'):
+        """Build scatter indexes corresponding to the pattern, up to the provided sequence_steps.
+
+        Args:
+            timesteps (int): Maximum number of timesteps steps to consider.
+            keep_only_valid_steps (bool): Restrict the pattern layout to match only valid steps.
+            device (Union[torch.device, str]): Device for created tensors.
+        Returns:
+            indexes (torch.Tensor): Indexes corresponding to the sequence, of shape [K, S].
+            mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes, of shape [K, S].
+        """
+        assert n_q == self.n_q, f"invalid number of codebooks for the sequence and the pattern: {n_q} != {self.n_q}"
+        assert timesteps <= self.timesteps, "invalid number of timesteps used to build the sequence from the pattern"
+        # use the proper layout based on whether we limit ourselves to valid steps only or not,
+        # note that using the valid_layout will result in a truncated sequence up to the valid steps
+        ref_layout = self.valid_layout if keep_only_valid_steps else self.layout
+        # single item indexing being super slow with pytorch vs. numpy, so we use numpy here
+        indexes = torch.zeros(n_q, len(ref_layout), dtype=torch.long).numpy()
+        mask = torch.zeros(n_q, len(ref_layout), dtype=torch.bool).numpy()
+        # fill indexes with last sequence step value that will correspond to our special token
+        # the last value is n_q * timesteps as we have flattened z and append special token as the last token
+        # which will correspond to the index: n_q * timesteps
+        indexes[:] = n_q * timesteps
+        # iterate over the pattern and fill scattered indexes and mask
+        for s, sequence_coords in enumerate(ref_layout):
+            for coords in sequence_coords:
+                if coords.t < timesteps:
+                    indexes[coords.q, s] = coords.t + coords.q * timesteps
+                    mask[coords.q, s] = 1
+        indexes = torch.from_numpy(indexes).to(device)
+        mask = torch.from_numpy(mask).to(device)
+        return indexes, mask
+
+    def build_pattern_sequence(self, z: torch.Tensor, special_token: int, keep_only_valid_steps: bool = False):
+        """Build sequence corresponding to the pattern from the input tensor z.
+        The sequence is built using up to sequence_steps if specified, and non-pattern
+        coordinates are filled with the special token.
+
+        Args:
+            z (torch.Tensor): Input tensor of multi-codebooks sequence, of shape [B, K, T].
+            special_token (int): Special token used to fill non-pattern coordinates in the new sequence.
+            keep_only_valid_steps (bool): Build a sequence from the pattern up to valid (= fully defined) steps.
+                Steps that are beyond valid steps will be replaced by the special_token in that case.
+        Returns:
+            values (torch.Tensor): Interleaved sequence matching the pattern, of shape [B, K, S] with S
+                corresponding either to the sequence_steps if provided, otherwise to the length of the pattern.
+            indexes (torch.Tensor): Indexes corresponding to the interleaved sequence, of shape [K, S].
+            mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes of shape [K, S].
+        """
+        B, K, T = z.shape
+        indexes, mask = self._build_pattern_sequence_scatter_indexes(
+            T, K, keep_only_valid_steps=keep_only_valid_steps, device=str(z.device)
+        )
+        z = z.view(B, -1)
+        # we append the special token as the last index of our flattened z tensor
+        z = torch.cat([z, torch.zeros_like(z[:, :1]) + special_token], dim=1)
+        values = z[:, indexes.view(-1)]
+        values = values.view(B, K, indexes.shape[-1])
+        return values, indexes, mask
+
+    def _build_reverted_sequence_scatter_indexes(self, sequence_steps: int, n_q: int,
+                                                 keep_only_valid_steps: bool = False,
+                                                 is_model_output: bool = False,
+                                                 device: tp.Union[torch.device, str] = 'cpu'):
+        """Builds scatter indexes required to retrieve the original multi-codebook sequence
+        from interleaving pattern.
+
+        Args:
+            sequence_steps (int): Sequence steps.
+            n_q (int): Number of codebooks.
+            keep_only_valid_steps (bool): Build a sequence from the pattern up to valid (= fully defined) steps.
+                Steps that are beyond valid steps will be replaced by the special_token in that case.
+            is_model_output (bool): Whether to keep the sequence item corresponding to initial special token or not.
+            device (Union[torch.device, str]): Device for created tensors.
+        Returns:
+            torch.Tensor: Indexes for reconstructing the output, of shape [K, T].
+            mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes of shape [K, T].
+        """
+        ref_layout = self.valid_layout if keep_only_valid_steps else self.layout
+        # TODO(jade): Do we want to further truncate to only valid timesteps here as well?
+        timesteps = self.timesteps
+        assert n_q == self.n_q, f"invalid number of codebooks for the sequence and the pattern: {n_q} != {self.n_q}"
+        assert sequence_steps <= len(ref_layout), \
+            f"sequence to revert is longer than the defined pattern: {sequence_steps} > {len(ref_layout)}"
+
+        # ensure we take the appropriate indexes to keep the model output from the first special token as well
+        if is_model_output:
+            ref_layout = ref_layout[1:]
+
+        # single item indexing being super slow with pytorch vs. numpy, so we use numpy here
+        indexes = torch.zeros(n_q, timesteps, dtype=torch.long).numpy()
+        mask = torch.zeros(n_q, timesteps, dtype=torch.bool).numpy()
+        # fill indexes with last sequence step value that will correspond to our special token
+        indexes[:] = n_q * sequence_steps
+        for s, sequence_codes in enumerate(ref_layout):
+            if s < sequence_steps:
+                for code in sequence_codes:
+                    if code.t < timesteps:
+                        indexes[code.q, code.t] = s + code.q * sequence_steps
+                        mask[code.q, code.t] = 1
+        indexes = torch.from_numpy(indexes).to(device)
+        mask = torch.from_numpy(mask).to(device)
+        return indexes, mask
+
+    def revert_pattern_sequence(self, s: torch.Tensor, special_token: int, keep_only_valid_steps: bool = False):
+        """Revert a sequence built from the pattern back to the original multi-codebook sequence without interleaving.
+        The sequence is reverted using up to timesteps if specified, and non-pattern coordinates
+        are filled with the special token.
+
+        Args:
+            s (torch.Tensor): Interleaved sequence tensor obtained from the pattern, of shape [B, K, S].
+            special_token (int or float): Special token used to fill non-pattern coordinates in the new sequence.
+        Returns:
+            values (torch.Tensor): Interleaved sequence matching the pattern, of shape [B, K, T] with T
+                corresponding either to the timesteps if provided, or the total timesteps in pattern otherwise.
+            indexes (torch.Tensor): Indexes corresponding to the interleaved sequence, of shape [K, T].
+            mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes of shape [K, T].
+        """
+        B, K, S = s.shape
+        indexes, mask = self._build_reverted_sequence_scatter_indexes(
+            S, K, keep_only_valid_steps, is_model_output=False, device=str(s.device)
+        )
+        s = s.view(B, -1)
+        # we append the special token as the last index of our flattened z tensor
+        s = torch.cat([s, torch.zeros_like(s[:, :1]) + special_token], dim=1)
+        values = s[:, indexes.view(-1)]
+        values = values.view(B, K, indexes.shape[-1])
+        return values, indexes, mask
+
+    def revert_pattern_logits(self, logits: torch.Tensor, special_token: float, keep_only_valid_steps: bool = False):
+        """Revert model logits obtained on a sequence built from the pattern
+        back to a tensor matching the original sequence.
+
+        This method is similar to ``revert_pattern_sequence`` with the following specificities:
+        1. It is designed to work with the extra cardinality dimension
+        2. We return the logits for the first sequence item that matches the special_token and
+        which matching target in the original sequence is the first item of the sequence,
+        while we skip the last logits as there is no matching target
+        """
+        B, card, K, S = logits.shape
+        indexes, mask = self._build_reverted_sequence_scatter_indexes(
+            S, K, keep_only_valid_steps, is_model_output=True, device=logits.device
+        )
+        logits = logits.reshape(B, card, -1)
+        # we append the special token as the last index of our flattened z tensor
+        logits = torch.cat([logits, torch.zeros_like(logits[:, :, :1]) + special_token], dim=-1)  # [B, card, K x S]
+        values = logits[:, :, indexes.view(-1)]
+        values = values.view(B, card, K, indexes.shape[-1])
+        return values, indexes, mask
+
+
+class CodebooksPatternProvider(ABC):
+    """Abstraction around providing pattern for interleaving codebooks.
+
+    The CodebooksPatternProvider abstraction allows implementing various strategies to
+    define interleaving pattern of sequences composed of multiple codebooks. For a given
+    number of codebooks `n_q`, the pattern provider can generate a specified pattern
+    corresponding to a sequence of `T` timesteps with `n_q` parallel codebooks. This pattern
+    can be used to construct a new sequence from the original codes respecting the specified
+    pattern. The pattern is defined as a list of list of code coordinates, code coordinate
+    being a tuple with the original timestep and codebook to build the new sequence.
+    Note that all patterns must start with an empty list that is then used to insert a first
+    sequence step of special tokens in the newly generated sequence.
+
+    Args:
+        n_q (int): number of codebooks.
+        cached (bool): if True, patterns for a given length are cached. In general
+            that should be true for efficiency reasons to avoid synchronization points.
+    """
+    def __init__(self, n_q: int, cached: bool = True):
+        assert n_q > 0
+        self.n_q = n_q
+        self.get_pattern = lru_cache(100)(self.get_pattern)  # type: ignore
+
+    @abstractmethod
+    def get_pattern(self, timesteps: int) -> Pattern:
+        """Builds pattern with specific interleaving between codebooks.
+
+        Args:
+            timesteps (int): Total number of timesteps.
+        """
+        raise NotImplementedError()
+
+
+class DelayedPatternProvider(CodebooksPatternProvider):
+    """Provider for delayed pattern across delayed codebooks.
+    Codebooks are delayed in the sequence and sequence steps will contain codebooks
+    from different timesteps.
+
+    Example:
+        Taking timesteps=4 and n_q=3, delays=None, the multi-codebook sequence:
+        [[1, 2, 3, 4],
+        [1, 2, 3, 4],
+        [1, 2, 3, 4]]
+        The resulting sequence obtained from the returned pattern is:
+        [[S, 1, 2, 3, 4],
+        [S, S, 1, 2, 3],
+        [S, S, S, 1, 2]]
+        (with S being a special token)
+
+    Args:
+        n_q (int): Number of codebooks.
+        delays (Optional[List[int]]): Delay for each of the codebooks.
+            If delays not defined, each codebook is delayed by 1 compared to the previous one.
+        flatten_first (int): Flatten the first N timesteps.
+        empty_initial (int): Prepend with N empty list of coordinates.
+    """
+    def __init__(self, n_q: int, delays: tp.Optional[tp.List[int]] = None,
+                 flatten_first: int = 0, empty_initial: int = 0):
+        super().__init__(n_q)
+        if delays is None:
+            delays = list(range(n_q))
+        self.delays = delays
+        self.flatten_first = flatten_first
+        self.empty_initial = empty_initial
+        assert len(self.delays) == self.n_q
+        assert sorted(self.delays) == self.delays
+
+    def get_pattern(self, timesteps: int) -> Pattern:
+        out: PatternLayout = [[]]
+        max_delay = max(self.delays)
+        if self.empty_initial:
+            out += [[] for _ in range(self.empty_initial)]
+        if self.flatten_first:
+            for t in range(min(timesteps, self.flatten_first)):
+                for q in range(self.n_q):
+                    out.append([LayoutCoord(t, q)])
+        for t in range(self.flatten_first, timesteps + max_delay):
+            v = []
+            for q, delay in enumerate(self.delays):
+                t_for_q = t - delay
+                if t_for_q >= self.flatten_first:
+                    v.append(LayoutCoord(t_for_q, q))
+            out.append(v)
+        return Pattern(out, n_q=self.n_q, timesteps=timesteps)
+
+
+class ParallelPatternProvider(DelayedPatternProvider):
+    """Provider for parallel pattern across codebooks.
+    This pattern provider is a special case of the delayed pattern with actually no delay,
+    hence delays=repeat(0, n_q).
+
+    Args:
+        n_q (int): Number of codebooks.
+    """
+    def __init__(self, n_q: int):
+        super().__init__(n_q, [0] * n_q)
+
+
+class UnrolledPatternProvider(CodebooksPatternProvider):
+    """Provider for unrolling codebooks pattern.
+    This pattern provider makes it possible to represent the codebooks either fully or only partially flattened,
+    while also specifying a given delay between the flattened codebook representations, allowing the
+    codebooks to be unrolled in the sequence.
+
+    Example:
+        1. Flattening of the codebooks.
+        By default, the pattern provider will fully flatten the codebooks such as flattening=range(n_q),
+        taking n_q = 3 and timesteps = 4:
+        [[1, 2, 3, 4],
+         [1, 2, 3, 4],
+         [1, 2, 3, 4]]
+        will result into:
+        [[S, S, 1, S, S, 2, S, S, 3, S, S, 4],
+         [S, 1, S, S, 2, S, S, 3, S, S, 4, S],
+         [1, S, S, 2, S, S, 3, S, S, 4, S, S]]
+        2. Partial flattening of the codebooks. The ``flattening`` parameter specifies the inner step
+        for each of the codebooks, defining which codebooks to flatten (or keep in parallel), for example
+        taking n_q = 3, timesteps = 4 and flattening = [0, 1, 1]:
+        [[1, 2, 3, 4],
+         [1, 2, 3, 4],
+         [1, 2, 3, 4]]
+        will result into:
+        [[S, 1, S, S, 2, S, S, 3, S, S, 4, S],
+         [S, 1, S, S, 2, S, S, 3, S, S, 4, S],
+         [1, S, S, 2, S, S, 3, S, S, 4, S, S]]
+        3. Flattening with delay. The ``delay`` parameter further unrolls the sequence of codebooks
+        by specifying a delay per codebook. Note that the delays of codebooks flattened to the
+        same inner timestep should be consistent. For example, taking n_q = 3, timesteps = 4, flattening = [0, 1, 1]
+        and delays = [0, 3, 3]:
+        [[1, 2, 3, 4],
+         [1, 2, 3, 4],
+         [1, 2, 3, 4]]
+        will result into:
+        [[S, S, S, 1, S, 2, S, 3, S, 4],
+         [S, S, S, 1, S, 2, S, 3, S, 4],
+         [1, 2, 3, S, 4, S, 5, S, 6, S]]
+
+    Args:
+        n_q (int): Number of codebooks.
+        flattening (Optional[List[int]]): Flattening schema over the codebooks. If not defined,
+            the codebooks will be flattened to 1 codebook per step, meaning that the sequence will
+            have n_q extra steps for each timestep.
+        delays (Optional[List[int]]): Delay for each of the codebooks. If not defined,
+            no delay is added and therefore will default to [0] * ``n_q``.
+            Note that two codebooks that will be flattened to the same inner step
+            should have the same delay, otherwise the pattern is considered as invalid.
+    """
+    FlattenedCodebook = namedtuple('FlattenedCodebook', ['codebooks', 'delay'])
+
+    def __init__(self, n_q: int, flattening: tp.Optional[tp.List[int]] = None,
+                 delays: tp.Optional[tp.List[int]] = None):
+        super().__init__(n_q)
+        if flattening is None:
+            flattening = list(range(n_q))
+        if delays is None:
+            delays = [0] * n_q
+        assert len(flattening) == n_q
+        assert len(delays) == n_q
+        assert sorted(flattening) == flattening
+        assert sorted(delays) == delays
+        self._flattened_codebooks = self._build_flattened_codebooks(delays, flattening)
+        self.max_delay = max(delays)
+
+    def _build_flattened_codebooks(self, delays: tp.List[int], flattening: tp.List[int]):
+        """Build a flattened codebooks representation as a dictionary of inner step
+        and the actual codebook indices corresponding to the flattened codebook. For convenience, we
+        also store the delay associated to the flattened codebook to avoid maintaining an extra mapping.
+        """
+        flattened_codebooks: dict = {}
+        for q, (inner_step, delay) in enumerate(zip(flattening, delays)):
+            if inner_step not in flattened_codebooks:
+                flat_codebook = UnrolledPatternProvider.FlattenedCodebook(codebooks=[q], delay=delay)
+            else:
+                flat_codebook = flattened_codebooks[inner_step]
+                assert flat_codebook.delay == delay, (
+                    "Delay and flattening between codebooks is inconsistent: ",
+                    "two codebooks flattened to the same position should have the same delay."
+                )
+                flat_codebook.codebooks.append(q)
+            flattened_codebooks[inner_step] = flat_codebook
+        return flattened_codebooks
+
+    @property
+    def _num_inner_steps(self):
+        """Number of inner steps to unroll between timesteps in order to flatten the codebooks.
+        """
+        return max([inner_step for inner_step in self._flattened_codebooks.keys()]) + 1
+
+    def num_virtual_steps(self, timesteps: int) -> int:
+        return timesteps * self._num_inner_steps + 1
+
+    def get_pattern(self, timesteps: int) -> Pattern:
+        """Builds pattern for delay across codebooks.
+
+        Args:
+            timesteps (int): Total number of timesteps.
+        """
+        # the PatternLayout is built as a tuple of sequence position and list of coordinates
+        # so that it can be reordered properly given the required delay between codebooks of given timesteps
+        indexed_out: list = [(-1, [])]
+        max_timesteps = timesteps + self.max_delay
+        for t in range(max_timesteps):
+            # for each timestep, we unroll the flattened codebooks,
+            # emitting the sequence step with the corresponding delay
+            for step in range(self._num_inner_steps):
+                if step in self._flattened_codebooks:
+                    # we have codebooks at this virtual step to emit
+                    step_codebooks = self._flattened_codebooks[step]
+                    t_for_q = t + step_codebooks.delay
+                    coords = [LayoutCoord(t, q) for q in step_codebooks.codebooks]
+                    if t_for_q < max_timesteps and t < max_timesteps:
+                        indexed_out.append((t_for_q, coords))
+                else:
+                    # there is no codebook in this virtual step so we emit an empty list
+                    indexed_out.append((t, []))
+        out = [coords for _, coords in sorted(indexed_out)]
+        return Pattern(out, n_q=self.n_q, timesteps=timesteps)
+
+
+class VALLEPattern(CodebooksPatternProvider):
+    """Almost VALL-E style pattern. We futher allow some delays for the
+    codebooks other than the first one.
+
+    Args:
+        n_q (int): Number of codebooks.
+        delays (Optional[List[int]]): Delay for each of the codebooks other than the first one.
+            If delays are not defined, no delay is applied (all delays default to 0).
+    """
+    def __init__(self, n_q: int, delays: tp.Optional[tp.List[int]] = None):
+        super().__init__(n_q)
+        if delays is None:
+            delays = [0] * (n_q - 1)
+        self.delays = delays
+        assert len(self.delays) == self.n_q - 1
+        assert sorted(self.delays) == self.delays
+
+    def get_pattern(self, timesteps: int) -> Pattern:
+        out: PatternLayout = [[]]
+        for t in range(timesteps):
+            out.append([LayoutCoord(t, 0)])
+        max_delay = max(self.delays)
+        for t in range(timesteps + max_delay):
+            v = []
+            for q, delay in enumerate(self.delays):
+                t_for_q = t - delay
+                if t_for_q >= 0:
+                    v.append(LayoutCoord(t_for_q, q + 1))
+            out.append(v)
+        return Pattern(out, n_q=self.n_q, timesteps=timesteps)
+
+
+class MusicLMPattern(CodebooksPatternProvider):
+    """Almost MusicLM style pattern. This is equivalent to full flattening
+    but in a different order.
+
+    Args:
+        n_q (int): Number of codebooks.
+        group_by (int): Number of codebooks to group together.
+    """
+    def __init__(self, n_q: int, group_by: int = 2):
+        super().__init__(n_q)
+        self.group_by = group_by
+
+    def get_pattern(self, timesteps: int) -> Pattern:
+        out: PatternLayout = [[]]
+        for offset in range(0, self.n_q, self.group_by):
+            for t in range(timesteps):
+                for q in range(offset, offset + self.group_by):
+                    out.append([LayoutCoord(t, q)])
+        return Pattern(out, n_q=self.n_q, timesteps=timesteps)
+
+
+
+
+
+
+
+
+
+
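+# Illustrative end-to-end sketch (assumed import path ``audiocraft.modules.codebooks_patterns``,
+# matching this page): build an interleaved sequence with the delayed pattern and revert it
+# back to the original [B, K, T] layout.
+import torch
+from audiocraft.modules.codebooks_patterns import DelayedPatternProvider
+
+B, K, T = 2, 3, 4
+provider = DelayedPatternProvider(n_q=K)              # default delays = [0, 1, 2]
+pattern = provider.get_pattern(T)
+
+codes = torch.arange(B * K * T).reshape(B, K, T)      # dummy codebook indices, shape [B, K, T]
+special = B * K * T                                   # any id outside the real codebook range
+seq, idx, mask = pattern.build_pattern_sequence(codes, special_token=special)
+assert seq.shape == (B, K, T + pattern.max_delay + 1)  # S = T + max_delay + 1 sequence steps
+
+rev, _, rev_mask = pattern.revert_pattern_sequence(seq, special_token=special)
+assert rev.shape == (B, K, T)
+assert bool(rev_mask.all()) and torch.equal(rev, codes)  # round-trip recovers the original codes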

Classes

+
+
+class CodebooksPatternProvider +(n_q: int, cached: bool = True) +
+
+

Abstraction around providing pattern for interleaving codebooks.

+

The CodebooksPatternProvider abstraction allows implementing various strategies to +define interleaving pattern of sequences composed of multiple codebooks. For a given +number of codebooks n_q, the pattern provider can generate a specified pattern +corresponding to a sequence of T timesteps with n_q parallel codebooks. This pattern +can be used to construct a new sequence from the original codes respecting the specified +pattern. The pattern is defined as a list of list of code coordinates, code coordinate +being a tuple with the original timestep and codebook to build the new sequence. +Note that all patterns must start with an empty list that is then used to insert a first +sequence step of special tokens in the newly generated sequence.

+

Args

+
+
n_q : int
+
number of codebooks.
+
cached : bool
+
if True, patterns for a given length are cached. In general +that should be true for efficiency reasons to avoid synchronization points.
+
+
+ +Expand source code + +
class CodebooksPatternProvider(ABC):
+    """Abstraction around providing pattern for interleaving codebooks.
+
+    The CodebooksPatternProvider abstraction allows implementing various strategies to
+    define interleaving pattern of sequences composed of multiple codebooks. For a given
+    number of codebooks `n_q`, the pattern provider can generate a specified pattern
+    corresponding to a sequence of `T` timesteps with `n_q` parallel codebooks. This pattern
+    can be used to construct a new sequence from the original codes respecting the specified
+    pattern. The pattern is defined as a list of list of code coordinates, code coordinate
+    being a tuple with the original timestep and codebook to build the new sequence.
+    Note that all patterns must start with an empty list that is then used to insert a first
+    sequence step of special tokens in the newly generated sequence.
+
+    Args:
+        n_q (int): number of codebooks.
+        cached (bool): if True, patterns for a given length are cached. In general
+            that should be true for efficiency reasons to avoid synchronization points.
+    """
+    def __init__(self, n_q: int, cached: bool = True):
+        assert n_q > 0
+        self.n_q = n_q
+        self.get_pattern = lru_cache(100)(self.get_pattern)  # type: ignore
+
+    @abstractmethod
+    def get_pattern(self, timesteps: int) -> Pattern:
+        """Builds pattern with specific interleaving between codebooks.
+
+        Args:
+            timesteps (int): Total number of timesteps.
+        """
+        raise NotImplementedError()
+
+
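+# A minimal custom provider sketch (hypothetical, for illustration): one sequence step per
+# timestep carrying every codebook at once, i.e. a parallel-style layout. It reuses the
+# ``Pattern``, ``PatternLayout`` and ``LayoutCoord`` definitions from this module.
+class AllAtOncePatternProvider(CodebooksPatternProvider):
+    def get_pattern(self, timesteps: int) -> Pattern:
+        layout: PatternLayout = [[]]   # every pattern starts with an empty special-token step
+        for t in range(timesteps):
+            layout.append([LayoutCoord(t, q) for q in range(self.n_q)])
+        return Pattern(layout, n_q=self.n_q, timesteps=timesteps)
+
+pat = AllAtOncePatternProvider(n_q=4).get_pattern(10)   # len(pat.layout) == 11, max_delay == 0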

Ancestors

+
    +
  • abc.ABC
  • +
+

Subclasses

+ +

Methods

+
+
+def get_pattern(self, timesteps: int) ‑> Pattern +
+
+

Builds pattern with specific interleaving between codebooks.

+

Args

+
+
timesteps : int
+
Total number of timesteps.
+
+
+ +Expand source code + +
@abstractmethod
+def get_pattern(self, timesteps: int) -> Pattern:
+    """Builds pattern with specific interleaving between codebooks.
+
+    Args:
+        timesteps (int): Total number of timesteps.
+    """
+    raise NotImplementedError()
+
+
+
+
+
+class DelayedPatternProvider +(n_q: int, delays: Optional[List[int]] = None, flatten_first: int = 0, empty_initial: int = 0) +
+
+

Provider for delayed pattern across delayed codebooks. +Codebooks are delayed in the sequence and sequence steps will contain codebooks +from different timesteps.

+

Example

+

Taking timesteps=4 and n_q=3, delays=None, the multi-codebook sequence: +[[1, 2, 3, 4], +[1, 2, 3, 4], +[1, 2, 3, 4]] +The resulting sequence obtained from the returned pattern is: +[[S, 1, 2, 3, 4], +[S, S, 1, 2, 3], +[S, S, S, 1, 2]] +(with S being a special token)

+

Args

+
+
n_q : int
+
Number of codebooks.
+
delays : Optional[List[int]]
+
Delay for each of the codebooks. +If delays not defined, each codebook is delayed by 1 compared to the previous one.
+
flatten_first : int
+
Flatten the first N timesteps.
+
empty_initial : int
+
Prepend with N empty list of coordinates.
+
+
+ +Expand source code + +
class DelayedPatternProvider(CodebooksPatternProvider):
+    """Provider for delayed pattern across delayed codebooks.
+    Codebooks are delayed in the sequence and sequence steps will contain codebooks
+    from different timesteps.
+
+    Example:
+        Taking timesteps=4 and n_q=3, delays=None, the multi-codebook sequence:
+        [[1, 2, 3, 4],
+        [1, 2, 3, 4],
+        [1, 2, 3, 4]]
+        The resulting sequence obtained from the returned pattern is:
+        [[S, 1, 2, 3, 4],
+        [S, S, 1, 2, 3],
+        [S, S, S, 1, 2]]
+        (with S being a special token)
+
+    Args:
+        n_q (int): Number of codebooks.
+        delays (Optional[List[int]]): Delay for each of the codebooks.
+            If delays not defined, each codebook is delayed by 1 compared to the previous one.
+        flatten_first (int): Flatten the first N timesteps.
+        empty_initial (int): Prepend with N empty list of coordinates.
+    """
+    def __init__(self, n_q: int, delays: tp.Optional[tp.List[int]] = None,
+                 flatten_first: int = 0, empty_initial: int = 0):
+        super().__init__(n_q)
+        if delays is None:
+            delays = list(range(n_q))
+        self.delays = delays
+        self.flatten_first = flatten_first
+        self.empty_initial = empty_initial
+        assert len(self.delays) == self.n_q
+        assert sorted(self.delays) == self.delays
+
+    def get_pattern(self, timesteps: int) -> Pattern:
+        out: PatternLayout = [[]]
+        max_delay = max(self.delays)
+        if self.empty_initial:
+            out += [[] for _ in range(self.empty_initial)]
+        if self.flatten_first:
+            for t in range(min(timesteps, self.flatten_first)):
+                for q in range(self.n_q):
+                    out.append([LayoutCoord(t, q)])
+        for t in range(self.flatten_first, timesteps + max_delay):
+            v = []
+            for q, delay in enumerate(self.delays):
+                t_for_q = t - delay
+                if t_for_q >= self.flatten_first:
+                    v.append(LayoutCoord(t_for_q, q))
+            out.append(v)
+        return Pattern(out, n_q=self.n_q, timesteps=timesteps)
+
+
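+# Peek at the layout produced for the docstring example above (n_q=3, timesteps=4, default
+# delays): step 0 is the empty special-token slot, and codebook q lags by q steps.
+p = DelayedPatternProvider(n_q=3).get_pattern(4)
+print(p.layout[0])   # []
+print(p.layout[1])   # [LayoutCoord(t=0, q=0)]
+print(p.layout[2])   # [LayoutCoord(t=1, q=0), LayoutCoord(t=0, q=1)]
+print(p.layout[3])   # [LayoutCoord(t=2, q=0), LayoutCoord(t=1, q=1), LayoutCoord(t=0, q=2)]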

Ancestors

+ +

Subclasses

+ +

Inherited members

+ +
+
+class LayoutCoord +(t, q) +
+
+

LayoutCoord(t, q)

+

Ancestors

+
    +
  • builtins.tuple
  • +
+

Instance variables

+
+
var q
+
+

Alias for field number 1

+
+
var t
+
+

Alias for field number 0

+
+
+
+
+class MusicLMPattern +(n_q: int, group_by: int = 2) +
+
+

Almost MusicLM style pattern. This is equivalent to full flattening +but in a different order.

+

Args

+
+
n_q : int
+
Number of codebooks.
+
group_by : int
+
Number of codebooks to group together.
+
+
+ +Expand source code + +
class MusicLMPattern(CodebooksPatternProvider):
+    """Almost MusicLM style pattern. This is equivalent to full flattening
+    but in a different order.
+
+    Args:
+        n_q (int): Number of codebooks.
+        group_by (int): Number of codebooks to group together.
+    """
+    def __init__(self, n_q: int, group_by: int = 2):
+        super().__init__(n_q)
+        self.group_by = group_by
+
+    def get_pattern(self, timesteps: int) -> Pattern:
+        out: PatternLayout = [[]]
+        for offset in range(0, self.n_q, self.group_by):
+            for t in range(timesteps):
+                for q in range(offset, offset + self.group_by):
+                    out.append([LayoutCoord(t, q)])
+        return Pattern(out, n_q=self.n_q, timesteps=timesteps)
+
+
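+# Grouping-order sketch for group_by=2, n_q=4, timesteps=2: every step holds a single codebook,
+# and all timesteps of one group are emitted before moving on to the next group.
+mlm = MusicLMPattern(n_q=4, group_by=2).get_pattern(2)
+order = [(c.t, c.q) for step in mlm.layout[1:] for c in step]
+assert order == [(0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (0, 3), (1, 2), (1, 3)]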

Ancestors

+ +

Inherited members

+ +
+
+class ParallelPatternProvider +(n_q: int) +
+
+

Provider for parallel pattern across codebooks. +This pattern provider is a special case of the delayed pattern with actually no delay, +hence delays=repeat(0, n_q).

+

Args

+
+
n_q : int
+
Number of codebooks.
+
+
+ +Expand source code + +
class ParallelPatternProvider(DelayedPatternProvider):
+    """Provider for parallel pattern across codebooks.
+    This pattern provider is a special case of the delayed pattern with actually no delay,
+    hence delays=repeat(0, n_q).
+
+    Args:
+        n_q (int): Number of codebooks.
+    """
+    def __init__(self, n_q: int):
+        super().__init__(n_q, [0] * n_q)
+
+

Ancestors

+ +

Inherited members

+ +
+
+class Pattern +(layout: List[List[LayoutCoord]], timesteps: int, n_q: int) +
+
+

Base implementation of a pattern over a sequence with multiple codebooks.

+

The codebook pattern consists of a layout, defining for each sequence step +the list of coordinates of each codebook timestep in the resulting interleaved sequence. +The first item of the pattern is always an empty list in order to properly insert a special token +to start with. For convenience, we also keep track of n_q the number of codebooks used for the pattern +and timesteps the number of timesteps corresponding to the original sequence.

+

The pattern provides convenient methods to build and revert interleaved sequences from it: +build_pattern_sequence maps a given dense input tensor of a multi-codebook sequence from [B, K, T] +to the interleaved sequence of shape [B, K, S] applying the pattern, with B being the batch size, +K being the number of codebooks, T the number of original timesteps and S the number of sequence steps +for the output sequence. The unfilled positions are replaced with a special token and the built sequence +is returned along with a mask indicating valid tokens. +revert_pattern_sequence maps back an interleaved sequence of shape [B, K, S] to the original alignment +of codebooks across timesteps to an output tensor of shape [B, K, T], using again a special token and a mask +to fill and specify invalid positions if needed. +See the dedicated methods for more details.

+
+ +Expand source code + +
class Pattern:
+    """Base implementation of a pattern over a sequence with multiple codebooks.
+
+    The codebook pattern consists of a layout, defining for each sequence step
+    the list of coordinates of each codebook timestep in the resulting interleaved sequence.
+    The first item of the pattern is always an empty list in order to properly insert a special token
+    to start with. For convenience, we also keep track of ``n_q`` the number of codebooks used for the pattern
+    and ``timesteps`` the number of timesteps corresponding to the original sequence.
+
+    The pattern provides convenient methods to build and revert interleaved sequences from it:
+    ``build_pattern_sequence`` maps a given dense input tensor of a multi-codebook sequence from [B, K, T]
+        to the interleaved sequence of shape [B, K, S] applying the pattern, with B being the batch size,
+        K being the number of codebooks, T the number of original timesteps and S the number of sequence steps
+        for the output sequence. The unfilled positions are replaced with a special token and the built sequence
+        is returned along with a mask indicating valid tokens.
+    ``revert_pattern_sequence`` maps back an interleaved sequence of shape [B, K, S] to the original alignment
+        of codebooks across timesteps to an output tensor of shape [B, K, T], using again a special token and a mask
+        to fill and specify invalid positions if needed.
+    See the dedicated methods for more details.
+    """
+    # Pattern layout, for each sequence step, we have a list of coordinates
+    # corresponding to the original codebook timestep and position.
+    # The first list is always an empty list in order to properly insert
+    # a special token to start with.
+    layout: PatternLayout
+    timesteps: int
+    n_q: int
+
+    def __post_init__(self):
+        assert len(self.layout) > 0
+        assert self.layout[0] == []
+        self._validate_layout()
+        self._build_reverted_sequence_scatter_indexes = lru_cache(100)(self._build_reverted_sequence_scatter_indexes)
+        self._build_pattern_sequence_scatter_indexes = lru_cache(100)(self._build_pattern_sequence_scatter_indexes)
+        logger.info("New pattern, time steps: %d, sequence steps: %d", self.timesteps, len(self.layout))
+
+    def _validate_layout(self):
+        """Runs checks on the layout to ensure a valid pattern is defined.
+        A pattern is considered invalid if:
+            - Multiple timesteps for the same codebook are defined in the same sequence step
+            - The timesteps for a given codebook are not in ascending order as we advance in the sequence
+              (this would mean that we have future timesteps before past timesteps).
+        """
+        q_timesteps = {q: 0 for q in range(self.n_q)}
+        for s, seq_coords in enumerate(self.layout):
+            if len(seq_coords) > 0:
+                qs = set()
+                for coord in seq_coords:
+                    qs.add(coord.q)
+                    last_q_timestep = q_timesteps[coord.q]
+                    assert coord.t >= last_q_timestep, \
+                        f"Past timesteps are found in the sequence for codebook = {coord.q} at step {s}"
+                    q_timesteps[coord.q] = coord.t
+                # each sequence step contains at max 1 coordinate per codebook
+                assert len(qs) == len(seq_coords), \
+                    f"Multiple entries for a same codebook are found at step {s}"
+
+    @property
+    def num_sequence_steps(self):
+        return len(self.layout) - 1
+
+    @property
+    def max_delay(self):
+        max_t_in_seq_coords = 0
+        for seq_coords in self.layout[1:]:
+            for coords in seq_coords:
+                max_t_in_seq_coords = max(max_t_in_seq_coords, coords.t + 1)
+        return max_t_in_seq_coords - self.timesteps
+
+    @property
+    def valid_layout(self):
+        valid_step = len(self.layout) - self.max_delay
+        return self.layout[:valid_step]
+
+    def get_sequence_coords_with_timestep(self, t: int, q: tp.Optional[int] = None):
+        """Get codebook coordinates in the layout that corresponds to the specified timestep t
+        and optionally to the codebook q. Coordinates are returned as a tuple with the sequence step
+        and the actual codebook coordinates.
+        """
+        assert t <= self.timesteps, "provided timesteps is greater than the pattern's number of timesteps"
+        if q is not None:
+            assert q <= self.n_q, "provided number of codebooks is greater than the pattern's number of codebooks"
+        coords = []
+        for s, seq_codes in enumerate(self.layout):
+            for code in seq_codes:
+                if code.t == t and (q is None or code.q == q):
+                    coords.append((s, code))
+        return coords
+
+    def get_steps_with_timestep(self, t: int, q: tp.Optional[int] = None) -> tp.List[int]:
+        return [step for step, coords in self.get_sequence_coords_with_timestep(t, q)]
+
+    def get_first_step_with_timesteps(self, t: int, q: tp.Optional[int] = None) -> tp.Optional[int]:
+        steps_with_timesteps = self.get_steps_with_timestep(t, q)
+        return steps_with_timesteps[0] if len(steps_with_timesteps) > 0 else None
+
+    def _build_pattern_sequence_scatter_indexes(self, timesteps: int, n_q: int, keep_only_valid_steps: bool,
+                                                device: tp.Union[torch.device, str] = 'cpu'):
+        """Build scatter indexes corresponding to the pattern, up to the provided sequence_steps.
+
+        Args:
+            timesteps (int): Maximum number of timesteps steps to consider.
+            keep_only_valid_steps (bool): Restrict the pattern layout to match only valid steps.
+            device (Union[torch.device, str]): Device for created tensors.
+        Returns:
+            indexes (torch.Tensor): Indexes corresponding to the sequence, of shape [K, S].
+            mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes, of shape [K, S].
+        """
+        assert n_q == self.n_q, f"invalid number of codebooks for the sequence and the pattern: {n_q} != {self.n_q}"
+        assert timesteps <= self.timesteps, "invalid number of timesteps used to build the sequence from the pattern"
+        # use the proper layout based on whether we limit ourselves to valid steps only or not,
+        # note that using the valid_layout will result in a truncated sequence up to the valid steps
+        ref_layout = self.valid_layout if keep_only_valid_steps else self.layout
+        # single item indexing being super slow with pytorch vs. numpy, so we use numpy here
+        indexes = torch.zeros(n_q, len(ref_layout), dtype=torch.long).numpy()
+        mask = torch.zeros(n_q, len(ref_layout), dtype=torch.bool).numpy()
+        # fill indexes with last sequence step value that will correspond to our special token
+        # the last value is n_q * timesteps as we have flattened z and append special token as the last token
+        # which will correspond to the index: n_q * timesteps
+        indexes[:] = n_q * timesteps
+        # iterate over the pattern and fill scattered indexes and mask
+        for s, sequence_coords in enumerate(ref_layout):
+            for coords in sequence_coords:
+                if coords.t < timesteps:
+                    indexes[coords.q, s] = coords.t + coords.q * timesteps
+                    mask[coords.q, s] = 1
+        indexes = torch.from_numpy(indexes).to(device)
+        mask = torch.from_numpy(mask).to(device)
+        return indexes, mask
+
+    def build_pattern_sequence(self, z: torch.Tensor, special_token: int, keep_only_valid_steps: bool = False):
+        """Build sequence corresponding to the pattern from the input tensor z.
+        The sequence is built using up to sequence_steps if specified, and non-pattern
+        coordinates are filled with the special token.
+
+        Args:
+            z (torch.Tensor): Input tensor of multi-codebooks sequence, of shape [B, K, T].
+            special_token (int): Special token used to fill non-pattern coordinates in the new sequence.
+            keep_only_valid_steps (bool): Build a sequence from the pattern up to valid (= fully defined) steps.
+                Steps that are beyond valid steps will be replaced by the special_token in that case.
+        Returns:
+            values (torch.Tensor): Interleaved sequence matching the pattern, of shape [B, K, S] with S
+                corresponding either to the sequence_steps if provided, otherwise to the length of the pattern.
+            indexes (torch.Tensor): Indexes corresponding to the interleaved sequence, of shape [K, S].
+            mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes of shape [K, S].
+        """
+        B, K, T = z.shape
+        indexes, mask = self._build_pattern_sequence_scatter_indexes(
+            T, K, keep_only_valid_steps=keep_only_valid_steps, device=str(z.device)
+        )
+        z = z.view(B, -1)
+        # we append the special token as the last index of our flattened z tensor
+        z = torch.cat([z, torch.zeros_like(z[:, :1]) + special_token], dim=1)
+        values = z[:, indexes.view(-1)]
+        values = values.view(B, K, indexes.shape[-1])
+        return values, indexes, mask
+
+    def _build_reverted_sequence_scatter_indexes(self, sequence_steps: int, n_q: int,
+                                                 keep_only_valid_steps: bool = False,
+                                                 is_model_output: bool = False,
+                                                 device: tp.Union[torch.device, str] = 'cpu'):
+        """Builds scatter indexes required to retrieve the original multi-codebook sequence
+        from interleaving pattern.
+
+        Args:
+            sequence_steps (int): Sequence steps.
+            n_q (int): Number of codebooks.
+            keep_only_valid_steps (bool): Build a sequence from the pattern up to valid (= fully defined) steps.
+                Steps that are beyond valid steps will be replaced by the special_token in that case.
+            is_model_output (bool): Whether to keep the sequence item corresponding to initial special token or not.
+            device (Union[torch.device, str]): Device for created tensors.
+        Returns:
+            torch.Tensor: Indexes for reconstructing the output, of shape [K, T].
+            mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes of shape [K, T].
+        """
+        ref_layout = self.valid_layout if keep_only_valid_steps else self.layout
+        # TODO(jade): Do we want to further truncate to only valid timesteps here as well?
+        timesteps = self.timesteps
+        assert n_q == self.n_q, f"invalid number of codebooks for the sequence and the pattern: {n_q} != {self.n_q}"
+        assert sequence_steps <= len(ref_layout), \
+            f"sequence to revert is longer than the defined pattern: {sequence_steps} > {len(ref_layout)}"
+
+        # ensure we take the appropriate indexes to keep the model output from the first special token as well
+        if is_model_output:
+            ref_layout = ref_layout[1:]
+
+        # single item indexing being super slow with pytorch vs. numpy, so we use numpy here
+        indexes = torch.zeros(n_q, timesteps, dtype=torch.long).numpy()
+        mask = torch.zeros(n_q, timesteps, dtype=torch.bool).numpy()
+        # fill indexes with last sequence step value that will correspond to our special token
+        indexes[:] = n_q * sequence_steps
+        for s, sequence_codes in enumerate(ref_layout):
+            if s < sequence_steps:
+                for code in sequence_codes:
+                    if code.t < timesteps:
+                        indexes[code.q, code.t] = s + code.q * sequence_steps
+                        mask[code.q, code.t] = 1
+        indexes = torch.from_numpy(indexes).to(device)
+        mask = torch.from_numpy(mask).to(device)
+        return indexes, mask
+
+    def revert_pattern_sequence(self, s: torch.Tensor, special_token: int, keep_only_valid_steps: bool = False):
+        """Revert a sequence built from the pattern back to the original multi-codebook sequence without interleaving.
+        The sequence is reverted using up to timesteps if specified, and non-pattern coordinates
+        are filled with the special token.
+
+        Args:
+            s (torch.Tensor): Interleaved sequence tensor obtained from the pattern, of shape [B, K, S].
+            special_token (int or float): Special token used to fill non-pattern coordinates in the new sequence.
+        Returns:
+            values (torch.Tensor): Interleaved sequence matching the pattern, of shape [B, K, T] with T
+                corresponding either to the timesteps if provided, or the total timesteps in pattern otherwise.
+            indexes (torch.Tensor): Indexes corresponding to the interleaved sequence, of shape [K, T].
+            mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes of shape [K, T].
+        """
+        B, K, S = s.shape
+        indexes, mask = self._build_reverted_sequence_scatter_indexes(
+            S, K, keep_only_valid_steps, is_model_output=False, device=str(s.device)
+        )
+        s = s.view(B, -1)
+        # we append the special token as the last index of our flattened s tensor
+        s = torch.cat([s, torch.zeros_like(s[:, :1]) + special_token], dim=1)
+        values = s[:, indexes.view(-1)]
+        values = values.view(B, K, indexes.shape[-1])
+        return values, indexes, mask
+
+    def revert_pattern_logits(self, logits: torch.Tensor, special_token: float, keep_only_valid_steps: bool = False):
+        """Revert model logits obtained on a sequence built from the pattern
+        back to a tensor matching the original sequence.
+
+        This method is similar to ``revert_pattern_sequence`` with the following specificities:
+        1. It is designed to work with the extra cardinality dimension
+        2. We return the logits for the first sequence item that matches the special_token and
+        whose matching target in the original sequence is the first item of the sequence,
+        while we skip the last logits as there is no matching target
+        """
+        B, card, K, S = logits.shape
+        indexes, mask = self._build_reverted_sequence_scatter_indexes(
+            S, K, keep_only_valid_steps, is_model_output=True, device=logits.device
+        )
+        logits = logits.reshape(B, card, -1)
+        # we append the special token as the last index of our flattened logits tensor
+        logits = torch.cat([logits, torch.zeros_like(logits[:, :, :1]) + special_token], dim=-1)  # [B, card, K x S]
+        values = logits[:, :, indexes.view(-1)]
+        values = values.view(B, card, K, indexes.shape[-1])
+        return values, indexes, mask
+
+

Class variables

+
+
var layout : List[List[LayoutCoord]]
+
+
+
+
var n_q : int
+
+
+
+
var timesteps : int
+
+
+
+
+

Instance variables

+
+
var max_delay
+
+
+
+ +Expand source code + +
@property
+def max_delay(self):
+    max_t_in_seq_coords = 0
+    for seq_coords in self.layout[1:]:
+        for coords in seq_coords:
+            max_t_in_seq_coords = max(max_t_in_seq_coords, coords.t + 1)
+    return max_t_in_seq_coords - self.timesteps
+
+
+
var num_sequence_steps
+
+
+
+ +Expand source code + +
@property
+def num_sequence_steps(self):
+    return len(self.layout) - 1
+
+
+
var valid_layout
+
+
+
+ +Expand source code + +
@property
+def valid_layout(self):
+    valid_step = len(self.layout) - self.max_delay
+    return self.layout[:valid_step]
+
+
+
+

Methods

+
+
+def build_pattern_sequence(self, z: torch.Tensor, special_token: int, keep_only_valid_steps: bool = False) +
+
+

Build sequence corresponding to the pattern from the input tensor z. +The sequence is built using up to sequence_steps if specified, and non-pattern +coordinates are filled with the special token.

+

Args

+
+
z : torch.Tensor
+
Input tensor of multi-codebooks sequence, of shape [B, K, T].
+
special_token : int
+
Special token used to fill non-pattern coordinates in the new sequence.
+
keep_only_valid_steps : bool
+
Build a sequence from the pattern up to valid (= fully defined) steps. +Steps that are beyond valid steps will be replaced by the special_token in that case.
+
+

Returns

+

values (torch.Tensor): Interleaved sequence matching the pattern, of shape [B, K, S] with S +corresponding either to the sequence_steps if provided, otherwise to the length of the pattern. +indexes (torch.Tensor): Indexes corresponding to the interleaved sequence, of shape [K, S]. +mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes of shape [K, S].

+
+ +Expand source code + +
def build_pattern_sequence(self, z: torch.Tensor, special_token: int, keep_only_valid_steps: bool = False):
+    """Build sequence corresponding to the pattern from the input tensor z.
+    The sequence is built using up to sequence_steps if specified, and non-pattern
+    coordinates are filled with the special token.
+
+    Args:
+        z (torch.Tensor): Input tensor of multi-codebooks sequence, of shape [B, K, T].
+        special_token (int): Special token used to fill non-pattern coordinates in the new sequence.
+        keep_only_valid_steps (bool): Build a sequence from the pattern up to valid (= fully defined) steps.
+            Steps that are beyond valid steps will be replaced by the special_token in that case.
+    Returns:
+        values (torch.Tensor): Interleaved sequence matching the pattern, of shape [B, K, S] with S
+            corresponding either to the sequence_steps if provided, otherwise to the length of the pattern.
+        indexes (torch.Tensor): Indexes corresponding to the interleaved sequence, of shape [K, S].
+        mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes of shape [K, S].
+    """
+    B, K, T = z.shape
+    indexes, mask = self._build_pattern_sequence_scatter_indexes(
+        T, K, keep_only_valid_steps=keep_only_valid_steps, device=str(z.device)
+    )
+    z = z.view(B, -1)
+    # we append the special token as the last index of our flattened z tensor
+    z = torch.cat([z, torch.zeros_like(z[:, :1]) + special_token], dim=1)
+    values = z[:, indexes.view(-1)]
+    values = values.view(B, K, indexes.shape[-1])
+    return values, indexes, mask
+
+
+
+def get_first_step_with_timesteps(self, t: int, q: Optional[int] = None) ‑> Optional[int] +
+
+
+
+ +Expand source code + +
def get_first_step_with_timesteps(self, t: int, q: tp.Optional[int] = None) -> tp.Optional[int]:
+    steps_with_timesteps = self.get_steps_with_timestep(t, q)
+    return steps_with_timesteps[0] if len(steps_with_timesteps) > 0 else None
+
+
+
+def get_sequence_coords_with_timestep(self, t: int, q: Optional[int] = None) +
+
+

Get codebook coordinates in the layout that corresponds to the specified timestep t +and optionally to the codebook q. Coordinates are returned as a tuple with the sequence step +and the actual codebook coordinates.

+
+ +Expand source code + +
def get_sequence_coords_with_timestep(self, t: int, q: tp.Optional[int] = None):
+    """Get codebook coordinates in the layout that corresponds to the specified timestep t
+    and optionally to the codebook q. Coordinates are returned as a tuple with the sequence step
+    and the actual codebook coordinates.
+    """
+    assert t <= self.timesteps, "provided timesteps is greater than the pattern's number of timesteps"
+    if q is not None:
+        assert q <= self.n_q, "provided number of codebooks is greater than the pattern's number of codebooks"
+    coords = []
+    for s, seq_codes in enumerate(self.layout):
+        for code in seq_codes:
+            if code.t == t and (q is None or code.q == q):
+                coords.append((s, code))
+    return coords
+
+
+
+def get_steps_with_timestep(self, t: int, q: Optional[int] = None) ‑> List[int] +
+
+
+
+ +Expand source code + +
def get_steps_with_timestep(self, t: int, q: tp.Optional[int] = None) -> tp.List[int]:
+    return [step for step, coords in self.get_sequence_coords_with_timestep(t, q)]
+
+
+
+def revert_pattern_logits(self, logits: torch.Tensor, special_token: float, keep_only_valid_steps: bool = False) +
+
+

Revert model logits obtained on a sequence built from the pattern +back to a tensor matching the original sequence.

+

This method is similar to revert_pattern_sequence with the following specificities:
1. It is designed to work with the extra cardinality dimension.
2. We return the logits for the first sequence item that matches the special_token and whose matching target in the original sequence is the first item of the sequence, while we skip the last logits as there is no matching target.

+
+ +Expand source code + +
def revert_pattern_logits(self, logits: torch.Tensor, special_token: float, keep_only_valid_steps: bool = False):
+    """Revert model logits obtained on a sequence built from the pattern
+    back to a tensor matching the original sequence.
+
+    This method is similar to ``revert_pattern_sequence`` with the following specificities:
+    1. It is designed to work with the extra cardinality dimension
+    2. We return the logits for the first sequence item that matches the special_token and
+    whose matching target in the original sequence is the first item of the sequence,
+    while we skip the last logits as there is no matching target
+    """
+    B, card, K, S = logits.shape
+    indexes, mask = self._build_reverted_sequence_scatter_indexes(
+        S, K, keep_only_valid_steps, is_model_output=True, device=logits.device
+    )
+    logits = logits.reshape(B, card, -1)
+    # we append the special token as the last index of our flattened logits tensor
+    logits = torch.cat([logits, torch.zeros_like(logits[:, :, :1]) + special_token], dim=-1)  # [B, card, K x S]
+    values = logits[:, :, indexes.view(-1)]
+    values = values.view(B, card, K, indexes.shape[-1])
+    return values, indexes, mask
+
+
+
+def revert_pattern_sequence(self, s: torch.Tensor, special_token: int, keep_only_valid_steps: bool = False) +
+
+

Revert a sequence built from the pattern back to the original multi-codebook sequence without interleaving. +The sequence is reverted using up to timesteps if specified, and non-pattern coordinates +are filled with the special token.

+

Args

+
+
s : torch.Tensor
+
Interleaved sequence tensor obtained from the pattern, of shape [B, K, S].
+
special_token : int or float
+
Special token used to fill non-pattern coordinates in the new sequence.
+
+

Returns

+

values (torch.Tensor): Interleaved sequence matching the pattern, of shape [B, K, T] with T +corresponding either to the timesteps if provided, or the total timesteps in pattern otherwise. +indexes (torch.Tensor): Indexes corresponding to the interleaved sequence, of shape [K, T]. +mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes of shape [K, T].

+
+ +Expand source code + +
def revert_pattern_sequence(self, s: torch.Tensor, special_token: int, keep_only_valid_steps: bool = False):
+    """Revert a sequence built from the pattern back to the original multi-codebook sequence without interleaving.
+    The sequence is reverted using up to timesteps if specified, and non-pattern coordinates
+    are filled with the special token.
+
+    Args:
+        s (torch.Tensor): Interleaved sequence tensor obtained from the pattern, of shape [B, K, S].
+        special_token (int or float): Special token used to fill non-pattern coordinates in the new sequence.
+    Returns:
+        values (torch.Tensor): Interleaved sequence matching the pattern, of shape [B, K, T] with T
+            corresponding either to the timesteps if provided, or the total timesteps in pattern otherwise.
+        indexes (torch.Tensor): Indexes corresponding to the interleaved sequence, of shape [K, T].
+        mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes of shape [K, T].
+    """
+    B, K, S = s.shape
+    indexes, mask = self._build_reverted_sequence_scatter_indexes(
+        S, K, keep_only_valid_steps, is_model_output=False, device=str(s.device)
+    )
+    s = s.view(B, -1)
+    # we append the special token as the last index of our flattened s tensor
+    s = torch.cat([s, torch.zeros_like(s[:, :1]) + special_token], dim=1)
+    values = s[:, indexes.view(-1)]
+    values = values.view(B, K, indexes.shape[-1])
+    return values, indexes, mask
+
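+For reference, a minimal sketch of the build/revert round trip (an illustrative example, not part of the
+module; it assumes one of the pattern providers documented on this page, e.g. UnrolledPatternProvider, and
+that the module is importable as audiocraft.modules.codebooks_patterns):
+
+import torch
+from audiocraft.modules.codebooks_patterns import UnrolledPatternProvider
+
+B, K, T = 2, 3, 4                      # batch size, codebooks, timesteps
+codes = torch.randint(0, 1024, (B, K, T))
+special_token = 1024                   # any id outside the codebook vocabulary
+
+provider = UnrolledPatternProvider(n_q=K)
+pattern = provider.get_pattern(T)
+
+# interleave according to the pattern, then revert back to [B, K, T]
+seq, _, _ = pattern.build_pattern_sequence(codes, special_token)
+rec, _, rec_mask = pattern.revert_pattern_sequence(seq, special_token)
+expected = torch.where(rec_mask, codes, torch.full_like(codes, special_token))
+assert torch.equal(rec, expected)      # positions not covered by the pattern hold the special token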
+
+
+
+
+class UnrolledPatternProvider +(n_q: int, flattening: Optional[List[int]] = None, delays: Optional[List[int]] = None) +
+
+

Provider for unrolling codebooks pattern. This pattern provider allows the codebooks to be represented either fully or only partially flattened, while also specifying a delay between the flattened codebook representations, allowing to unroll the codebooks in the sequence.

+

Example

+
1. Flattening of the codebooks. By default, the pattern provider will fully flatten the codebooks,
   i.e. flattening=range(n_q). Taking n_q = 3 and timesteps = 4:
   [[1, 2, 3, 4],
    [1, 2, 3, 4],
    [1, 2, 3, 4]]
   will result in:
   [[S, S, 1, S, S, 2, S, S, 3, S, S, 4],
    [S, 1, S, S, 2, S, S, 3, S, S, 4, S],
    [1, S, S, 2, S, S, 3, S, S, 4, S, S]]
2. Partial flattening of the codebooks. The flattening parameter allows to specify the inner step
   for each of the codebooks, defining which codebooks to flatten (or keep in parallel). For example,
   taking n_q = 3, timesteps = 4 and flattening = [0, 1, 1]:
   [[1, 2, 3, 4],
    [1, 2, 3, 4],
    [1, 2, 3, 4]]
   will result in:
   [[S, 1, S, S, 2, S, S, 3, S, S, 4, S],
    [S, 1, S, S, 2, S, S, 3, S, S, 4, S],
    [1, S, S, 2, S, S, 3, S, S, 4, S, S]]
3. Flattening with delay. The delay parameter allows to further unroll the sequence of codebooks,
   specifying the delay per codebook. Note that the delay between codebooks flattened to the
   same inner timestep should be coherent. For example, taking n_q = 3, timesteps = 4,
   flattening = [0, 1, 1] and delays = [0, 3, 3]:
   [[1, 2, 3, 4],
    [1, 2, 3, 4],
    [1, 2, 3, 4]]
   will result in:
   [[S, S, S, 1, S, 2, S, 3, S, 4],
    [S, S, S, 1, S, 2, S, 3, S, 4],
    [1, 2, 3, S, 4, S, 5, S, 6, S]]
+

Args

+
+
n_q : int
+
Number of codebooks.
+
flattening : Optional[List[int]]
+
Flattening schema over the codebooks. If not defined, +the codebooks will be flattened to 1 codebook per step, meaning that the sequence will +have n_q extra steps for each timestep.
+
delays : Optional[List[int]]
+
Delay for each of the codebooks. If not defined, +no delay is added and therefore will default to [0] * n_q. +Note that two codebooks that will be flattened to the same inner step +should have the same delay, otherwise the pattern is considered as invalid.
+
+
+ +Expand source code + +
class UnrolledPatternProvider(CodebooksPatternProvider):
+    """Provider for unrolling codebooks pattern.
+    This pattern provider allows the codebooks to be represented either fully or only partially flattened,
+    while also specifying a given delay between the flattened codebook representations, allowing to
+    unroll the codebooks in the sequence.
+
+    Example:
+        1. Flattening of the codebooks.
+        By default, the pattern provider will fully flatten the codebooks such as flattening=range(n_q),
+        taking n_q = 3 and timesteps = 4:
+        [[1, 2, 3, 4],
+         [1, 2, 3, 4],
+         [1, 2, 3, 4]]
+        will result in:
+        [[S, S, 1, S, S, 2, S, S, 3, S, S, 4],
+         [S, 1, S, S, 2, S, S, 3, S, S, 4, S],
+         [1, S, S, 2, S, S, 3, S, S, 4, S, S]]
+        2. Partial flattening of the codebooks. The ``flattening`` parameter allows to specify the inner step
+        for each of the codebook, allowing to define which codebook to flatten (or keep in parallel), for example
+        taking n_q = 3, timesteps = 4 and flattening = [0, 1, 1]:
+        [[1, 2, 3, 4],
+         [1, 2, 3, 4],
+         [1, 2, 3, 4]]
+        will result in:
+        [[S, 1, S, S, 2, S, S, 3, S, S, 4, S],
+         [S, 1, S, S, 2, S, S, 3, S, S, 4, S],
+         [1, S, S, 2, S, S, 3, S, S, 4, S, S]]
+        3. Flattening with delay. The ``delay`` parameter allows to further unroll the sequence of codebooks
+        allowing to specify the delay per codebook. Note that the delay between codebooks flattened to the
+        same inner timestep should be coherent. For example, taking n_q = 3, timesteps = 4, flattening = [0, 1, 1]
+        and delays = [0, 3, 3]:
+        [[1, 2, 3, 4],
+         [1, 2, 3, 4],
+         [1, 2, 3, 4]]
+        will result in:
+        [[S, S, S, 1, S, 2, S, 3, S, 4],
+         [S, S, S, 1, S, 2, S, 3, S, 4],
+         [1, 2, 3, S, 4, S, 5, S, 6, S]]
+
+    Args:
+        n_q (int): Number of codebooks.
+        flattening (Optional[List[int]]): Flattening schema over the codebooks. If not defined,
+            the codebooks will be flattened to 1 codebook per step, meaning that the sequence will
+            have n_q extra steps for each timestep.
+        delays (Optional[List[int]]): Delay for each of the codebooks. If not defined,
+            no delay is added and therefore will default to [0] * ``n_q``.
+            Note that two codebooks that will be flattened to the same inner step
+            should have the same delay, otherwise the pattern is considered as invalid.
+    """
+    FlattenedCodebook = namedtuple('FlattenedCodebook', ['codebooks', 'delay'])
+
+    def __init__(self, n_q: int, flattening: tp.Optional[tp.List[int]] = None,
+                 delays: tp.Optional[tp.List[int]] = None):
+        super().__init__(n_q)
+        if flattening is None:
+            flattening = list(range(n_q))
+        if delays is None:
+            delays = [0] * n_q
+        assert len(flattening) == n_q
+        assert len(delays) == n_q
+        assert sorted(flattening) == flattening
+        assert sorted(delays) == delays
+        self._flattened_codebooks = self._build_flattened_codebooks(delays, flattening)
+        self.max_delay = max(delays)
+
+    def _build_flattened_codebooks(self, delays: tp.List[int], flattening: tp.List[int]):
+        """Build a flattened codebooks representation as a dictionary of inner step
+        and the actual codebook indices corresponding to the flattened codebook. For convenience, we
+        also store the delay associated to the flattened codebook to avoid maintaining an extra mapping.
+        """
+        flattened_codebooks: dict = {}
+        for q, (inner_step, delay) in enumerate(zip(flattening, delays)):
+            if inner_step not in flattened_codebooks:
+                flat_codebook = UnrolledPatternProvider.FlattenedCodebook(codebooks=[q], delay=delay)
+            else:
+                flat_codebook = flattened_codebooks[inner_step]
+                assert flat_codebook.delay == delay, (
+                    "Delay and flattening between codebooks is inconsistent: ",
+                    "two codebooks flattened to the same position should have the same delay."
+                )
+                flat_codebook.codebooks.append(q)
+            flattened_codebooks[inner_step] = flat_codebook
+        return flattened_codebooks
+
+    @property
+    def _num_inner_steps(self):
+        """Number of inner steps to unroll between timesteps in order to flatten the codebooks.
+        """
+        return max([inner_step for inner_step in self._flattened_codebooks.keys()]) + 1
+
+    def num_virtual_steps(self, timesteps: int) -> int:
+        return timesteps * self._num_inner_steps + 1
+
+    def get_pattern(self, timesteps: int) -> Pattern:
+        """Builds pattern for delay across codebooks.
+
+        Args:
+            timesteps (int): Total number of timesteps.
+        """
+        # the PatternLayout is built as a tuple of sequence position and list of coordinates
+        # so that it can be reordered properly given the required delay between codebooks of given timesteps
+        indexed_out: list = [(-1, [])]
+        max_timesteps = timesteps + self.max_delay
+        for t in range(max_timesteps):
+            # for each timestep, we unroll the flattened codebooks,
+            # emitting the sequence step with the corresponding delay
+            for step in range(self._num_inner_steps):
+                if step in self._flattened_codebooks:
+                    # we have codebooks at this virtual step to emit
+                    step_codebooks = self._flattened_codebooks[step]
+                    t_for_q = t + step_codebooks.delay
+                    coords = [LayoutCoord(t, q) for q in step_codebooks.codebooks]
+                    if t_for_q < max_timesteps and t < max_timesteps:
+                        indexed_out.append((t_for_q, coords))
+                else:
+                    # there is no codebook in this virtual step so we emit an empty list
+                    indexed_out.append((t, []))
+        out = [coords for _, coords in sorted(indexed_out)]
+        return Pattern(out, n_q=self.n_q, timesteps=timesteps)
+
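+A small illustrative check of the behaviour described above (an editor's sketch, not part of the module,
+reusing the imports from the earlier example):
+
+provider = UnrolledPatternProvider(n_q=3, flattening=[0, 1, 1], delays=[0, 3, 3])
+pattern = provider.get_pattern(timesteps=4)
+assert pattern.max_delay == 3
+# codebook 0 at timestep 0 appears in the sequence before the delayed codebooks 1 and 2
+assert pattern.get_first_step_with_timesteps(0, q=0) < pattern.get_first_step_with_timesteps(0, q=1)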
+

Ancestors

+ +

Class variables

+
+
var FlattenedCodebook
+
+
+
+
+

Methods

+
+
+def get_pattern(self, timesteps: int) ‑> Pattern +
+
+

Builds pattern for delay across codebooks.

+

Args

+
+
timesteps : int
+
Total number of timesteps.
+
+
+ +Expand source code + +
def get_pattern(self, timesteps: int) -> Pattern:
+    """Builds pattern for delay across codebooks.
+
+    Args:
+        timesteps (int): Total number of timesteps.
+    """
+    # the PatternLayout is built as a tuple of sequence position and list of coordinates
+    # so that it can be reordered properly given the required delay between codebooks of given timesteps
+    indexed_out: list = [(-1, [])]
+    max_timesteps = timesteps + self.max_delay
+    for t in range(max_timesteps):
+        # for each timestep, we unroll the flattened codebooks,
+        # emitting the sequence step with the corresponding delay
+        for step in range(self._num_inner_steps):
+            if step in self._flattened_codebooks:
+                # we have codebooks at this virtual step to emit
+                step_codebooks = self._flattened_codebooks[step]
+                t_for_q = t + step_codebooks.delay
+                coords = [LayoutCoord(t, q) for q in step_codebooks.codebooks]
+                if t_for_q < max_timesteps and t < max_timesteps:
+                    indexed_out.append((t_for_q, coords))
+            else:
+                # there is no codebook in this virtual step so we emit an empty list
+                indexed_out.append((t, []))
+    out = [coords for _, coords in sorted(indexed_out)]
+    return Pattern(out, n_q=self.n_q, timesteps=timesteps)
+
+
+
+def num_virtual_steps(self, timesteps: int) ‑> int +
+
+
+
+ +Expand source code + +
def num_virtual_steps(self, timesteps: int) -> int:
+    return timesteps * self._num_inner_steps + 1
+
+
+
+
+
+class VALLEPattern +(n_q: int, delays: Optional[List[int]] = None) +
+
+

Almost VALL-E style pattern. We further allow some delays for the codebooks other than the first one.

+

Args

+
+
n_q : int
+
Number of codebooks.
+
delays : Optional[List[int]]
+
Delay for each of the codebooks. If delays is not defined, it defaults to [0] * (n_q - 1), i.e. no extra delay is applied to the codebooks after the first one.
+
+
+ +Expand source code + +
class VALLEPattern(CodebooksPatternProvider):
+    """Almost VALL-E style pattern. We further allow some delays for the
+    codebooks other than the first one.
+
+    Args:
+        n_q (int): Number of codebooks.
+        delays (Optional[List[int]]): Delay for each of the codebooks.
+            If delays is not defined, it defaults to [0] * (n_q - 1), i.e. no extra delay is applied.
+    """
+    def __init__(self, n_q: int, delays: tp.Optional[tp.List[int]] = None):
+        super().__init__(n_q)
+        if delays is None:
+            delays = [0] * (n_q - 1)
+        self.delays = delays
+        assert len(self.delays) == self.n_q - 1
+        assert sorted(self.delays) == self.delays
+
+    def get_pattern(self, timesteps: int) -> Pattern:
+        out: PatternLayout = [[]]
+        for t in range(timesteps):
+            out.append([LayoutCoord(t, 0)])
+        max_delay = max(self.delays)
+        for t in range(timesteps + max_delay):
+            v = []
+            for q, delay in enumerate(self.delays):
+                t_for_q = t - delay
+                if t_for_q >= 0:
+                    v.append(LayoutCoord(t_for_q, q + 1))
+            out.append(v)
+        return Pattern(out, n_q=self.n_q, timesteps=timesteps)
+
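+A small illustrative example of the resulting layout (an editor's sketch, not part of the module):
+
+provider = VALLEPattern(n_q=4)
+pattern = provider.get_pattern(timesteps=3)
+# step 0 is empty, steps 1..3 hold codebook 0 for t = 0..2,
+# steps 4..6 hold codebooks 1..3 in parallel for t = 0..2
+assert pattern.get_first_step_with_timesteps(0, q=0) == 1
+assert pattern.get_first_step_with_timesteps(0, q=1) == 4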
+

Ancestors

+ +

Inherited members

+ +
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/modules/conditioners.html b/docs/audiocraft/modules/conditioners.html new file mode 100644 index 00000000..049caf6c --- /dev/null +++ b/docs/audiocraft/modules/conditioners.html @@ -0,0 +1,3573 @@ + + + + + + +audiocraft.modules.conditioners API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.modules.conditioners

+
+
+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import defaultdict
+from copy import deepcopy
+from dataclasses import dataclass, field
+from itertools import chain
+import logging
+import math
+import random
+import re
+import typing as tp
+import warnings
+
+from einops import rearrange
+from num2words import num2words
+import spacy
+from transformers import T5EncoderModel, T5Tokenizer  # type: ignore
+import torchaudio
+import torch
+from torch import nn
+from torch import Tensor
+import torch.nn.functional as F
+from torch.nn.utils.rnn import pad_sequence
+
+from .streaming import StreamingModule
+from .transformer import create_sin_embedding
+from ..data.audio_dataset import SegmentInfo
+from ..utils.autocast import TorchAutocast
+from ..utils.utils import hash_trick, length_to_mask, collate
+
+
+logger = logging.getLogger(__name__)
+TextCondition = tp.Optional[str]  # a text condition can be a string or None (if doesn't exist)
+ConditionType = tp.Tuple[Tensor, Tensor]  # condition, mask
+
+
+class WavCondition(tp.NamedTuple):
+    wav: Tensor
+    length: Tensor
+    path: tp.List[tp.Optional[str]] = []
+
+
+def nullify_condition(condition: ConditionType, dim: int = 1):
+    """This function transforms an input condition to a null condition.
+    This is done by converting the condition to a single zero vector, similarly
+    to how it is done inside WhiteSpaceTokenizer and NoopTokenizer.
+
+    Args:
+        condition (ConditionType): a tuple of condition and mask (tp.Tuple[Tensor, Tensor])
+        dim (int): the dimension that will be truncated (should be the time dimension)
+        WARNING!: dim should not be the batch dimension!
+    Returns:
+        ConditionType: a tuple of null condition and mask
+    """
+    assert dim != 0, "dim cannot be the batch dimension!"
+    assert type(condition) == tuple and \
+        type(condition[0]) == Tensor and \
+        type(condition[1]) == Tensor, "'nullify_condition' got an unexpected input type!"
+    cond, mask = condition
+    B = cond.shape[0]
+    last_dim = cond.dim() - 1
+    out = cond.transpose(dim, last_dim)
+    out = 0. * out[..., :1]
+    out = out.transpose(dim, last_dim)
+    mask = torch.zeros((B, 1), device=out.device).int()
+    assert cond.dim() == out.dim()
+    return out, mask
+
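+# Illustrative example (editor's sketch, not part of the original module): nullifying a
+# [B, T, D] text embedding along its time dimension (dim=1) yields a single zero step:
+#
+#   cond = torch.randn(2, 8, 16)
+#   mask = torch.ones(2, 8, dtype=torch.int)
+#   null_cond, null_mask = nullify_condition((cond, mask), dim=1)
+#   # null_cond.shape == (2, 1, 16) and is all zeros; null_mask.shape == (2, 1) and is all zeros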
+
+def nullify_wav(wav: Tensor) -> WavCondition:
+    """Create a nullified WavCondition from a wav tensor with appropriate shape.
+
+    Args:
+        wav (Tensor): tensor of shape [B, T]
+    Returns:
+        WavCondition: wav condition with nullified wav.
+    """
+    null_wav, _ = nullify_condition((wav, torch.zeros_like(wav)), dim=wav.dim() - 1)
+    return WavCondition(
+        wav=null_wav,
+        length=torch.tensor([0] * wav.shape[0], device=wav.device),
+        path=['null_wav'] * wav.shape[0]
+    )
+
+
+@dataclass
+class ConditioningAttributes:
+    text: tp.Dict[str, tp.Optional[str]] = field(default_factory=dict)
+    wav: tp.Dict[str, WavCondition] = field(default_factory=dict)
+
+    def __getitem__(self, item):
+        return getattr(self, item)
+
+    @property
+    def text_attributes(self):
+        return self.text.keys()
+
+    @property
+    def wav_attributes(self):
+        return self.wav.keys()
+
+    @property
+    def attributes(self):
+        return {"text": self.text_attributes, "wav": self.wav_attributes}
+
+    def to_flat_dict(self):
+        return {
+            **{f"text.{k}": v for k, v in self.text.items()},
+            **{f"wav.{k}": v for k, v in self.wav.items()},
+        }
+
+    @classmethod
+    def from_flat_dict(cls, x):
+        out = cls()
+        for k, v in x.items():
+            kind, att = k.split(".")
+            out[kind][att] = v
+        return out
+
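+# Illustrative round trip (editor's sketch, not part of the original module):
+#
+#   attrs = ConditioningAttributes(text={"genre": "Rock", "description": "A calm piano piece"})
+#   flat = attrs.to_flat_dict()   # {"text.genre": "Rock", "text.description": "A calm piano piece"}
+#   assert ConditioningAttributes.from_flat_dict(flat) == attrs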
+
+class SegmentWithAttributes(SegmentInfo):
+    """Base class for all dataclasses that are used for conditioning.
+    All child classes should implement `to_condition_attributes` that converts
+    the existing attributes to a dataclass of type ConditioningAttributes.
+    """
+    def to_condition_attributes(self) -> ConditioningAttributes:
+        raise NotImplementedError()
+
+
+class Tokenizer:
+    """Base class for all tokenizers
+    (in case we want to introduce more advanced tokenizers in the future).
+    """
+    def __call__(self, texts: tp.List[tp.Optional[str]]) -> tp.Tuple[Tensor, Tensor]:
+        raise NotImplementedError()
+
+
+class WhiteSpaceTokenizer(Tokenizer):
+    """This tokenizer should be used for natural language descriptions.
+    For example:
+    ["he didn't, know he's going home.", 'shorter sentence'] =>
+    [[78, 62, 31,  4, 78, 25, 19, 34],
+    [59, 77,  0,  0,  0,  0,  0,  0]]
+    """
+    PUNCTUATIONS = "?:!.,;"
+
+    def __init__(self, n_bins: int, pad_idx: int = 0, language: str = "en_core_web_sm",
+                 lemma: bool = True, stopwords: bool = True) -> None:
+        self.n_bins = n_bins
+        self.pad_idx = pad_idx
+        self.lemma = lemma
+        self.stopwords = stopwords
+        try:
+            self.nlp = spacy.load(language)
+        except IOError:
+            spacy.cli.download(language)  # type: ignore
+            self.nlp = spacy.load(language)
+
+    @tp.no_type_check
+    def __call__(
+        self,
+        texts: tp.List[tp.Optional[str]],
+        return_text: bool = False
+    ) -> tp.Tuple[Tensor, Tensor]:
+        """Take a list of strings and convert them to a tensor of indices.
+
+        Args:
+            texts (tp.List[str]): List of strings.
+            return_text (bool, optional): Whether to return text as additional tuple item. Defaults to False.
+        Returns:
+            tp.Tuple[Tensor, Tensor]:
+                - Indices of words in the LUT.
+                - And a mask indicating where the padding tokens are
+        """
+        output, lengths = [], []
+        texts = deepcopy(texts)
+        for i, text in enumerate(texts):
+            # if current sample doesn't have a certain attribute, replace with pad token
+            if text is None:
+                output.append(Tensor([self.pad_idx]))
+                lengths.append(0)
+                continue
+
+            # convert numbers to words
+            text = re.sub(r"(\d+)", lambda x: num2words(int(x.group(0))), text)  # type: ignore
+            # normalize text
+            text = self.nlp(text)  # type: ignore
+            # remove stopwords
+            if self.stopwords:
+                text = [w for w in text if not w.is_stop]  # type: ignore
+            # remove punctuations
+            text = [w for w in text if w.text not in self.PUNCTUATIONS]  # type: ignore
+            # lemmatize if needed
+            text = [getattr(t, "lemma_" if self.lemma else "text") for t in text]  # type: ignore
+
+            texts[i] = " ".join(text)
+            lengths.append(len(text))
+            # convert to tensor
+            tokens = Tensor([hash_trick(w, self.n_bins) for w in text])
+            output.append(tokens)
+
+        mask = length_to_mask(torch.IntTensor(lengths)).int()
+        padded_output = pad_sequence(output, padding_value=self.pad_idx).int().t()
+        if return_text:
+            return padded_output, mask, texts  # type: ignore
+        return padded_output, mask
+
+
+class NoopTokenizer(Tokenizer):
+    """This tokenizer should be used for global conditioners such as: artist, genre, key, etc.
+    The difference between this and WhiteSpaceTokenizer is that NoopTokenizer does not split
+    strings, so "Jeff Buckley" will get its own index, whereas WhiteSpaceTokenizer will
+    split it into ["Jeff", "Buckley"] and return an index per word.
+
+    For example:
+    ["Queen", "ABBA", "Jeff Buckley"] => [43, 55, 101]
+    ["Metal", "Rock", "Classical"] => [0, 223, 51]
+    """
+    def __init__(self, n_bins: int, pad_idx: int = 0):
+        self.n_bins = n_bins
+        self.pad_idx = pad_idx
+
+    def __call__(self, texts: tp.List[tp.Optional[str]]) -> tp.Tuple[Tensor, Tensor]:
+        output, lengths = [], []
+        for text in texts:
+            # if current sample doesn't have a certain attribute, replace with pad token
+            if text is None:
+                output.append(self.pad_idx)
+                lengths.append(0)
+            else:
+                output.append(hash_trick(text, self.n_bins))
+                lengths.append(1)
+
+        tokens = torch.LongTensor(output).unsqueeze(1)
+        mask = length_to_mask(torch.IntTensor(lengths)).int()
+        return tokens, mask
+
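+# Illustrative example (editor's sketch, not part of the original module):
+#
+#   tokenizer = NoopTokenizer(n_bins=512)
+#   tokens, mask = tokenizer(["Rock", None, "Jazz"])
+#   # tokens has shape [3, 1]; mask has shape [3, 1] with a 0 marking the missing (None) entry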
+
+class BaseConditioner(nn.Module):
+    """Base model for all conditioner modules. We allow the output dim to be different
+    than the hidden dim for two reasons: 1) keep our LUTs small when the vocab is large;
+    2) make all condition dims consistent.
+
+    Args:
+        dim (int): Hidden dim of the model (text-encoder/LUT).
+        output_dim (int): Output dim of the conditioner.
+    """
+    def __init__(self, dim, output_dim):
+        super().__init__()
+        self.dim = dim
+        self.output_dim = output_dim
+        self.output_proj = nn.Linear(dim, output_dim)
+
+    def tokenize(self, *args, **kwargs) -> tp.Any:
+        """Should be any part of the processing that will lead to a synchronization
+        point, e.g. BPE tokenization with transfer to the GPU.
+
+        The returned value will be saved and returned later when calling forward().
+        """
+        raise NotImplementedError()
+
+    def forward(self, inputs: tp.Any) -> ConditionType:
+        """Gets input that should be used as conditioning (e.g., genre, description or a waveform).
+        Outputs a ConditionType, after the input data was embedded as a dense vector.
+
+        Returns:
+            ConditionType:
+                - A tensor of size [B, T, D] where B is the batch size, T is the length of the
+                  output embedding and D is the dimension of the embedding.
+                - And a mask indicating where the padding tokens are.
+        """
+        raise NotImplementedError()
+
+
+class TextConditioner(BaseConditioner):
+    ...
+
+
+class LUTConditioner(TextConditioner):
+    """Lookup table TextConditioner.
+
+    Args:
+        n_bins (int): Number of bins.
+        dim (int): Hidden dim of the model (text-encoder/LUT).
+        output_dim (int): Output dim of the conditioner.
+        tokenizer (str): Name of the tokenizer.
+        pad_idx (int, optional): Index for padding token. Defaults to 0.
+    """
+    def __init__(self, n_bins: int, dim: int, output_dim: int, tokenizer: str, pad_idx: int = 0):
+        super().__init__(dim, output_dim)
+        self.embed = nn.Embedding(n_bins, dim)
+        self.tokenizer: Tokenizer
+        if tokenizer == "whitespace":
+            self.tokenizer = WhiteSpaceTokenizer(n_bins, pad_idx=pad_idx)
+        elif tokenizer == "noop":
+            self.tokenizer = NoopTokenizer(n_bins, pad_idx=pad_idx)
+        else:
+            raise ValueError(f"unrecognized tokenizer `{tokenizer}`.")
+
+    def tokenize(self, x: tp.List[tp.Optional[str]]) -> tp.Tuple[torch.Tensor, torch.Tensor]:
+        device = self.embed.weight.device
+        tokens, mask = self.tokenizer(x)
+        tokens, mask = tokens.to(device), mask.to(device)
+        return tokens, mask
+
+    def forward(self, inputs: tp.Tuple[torch.Tensor, torch.Tensor]) -> ConditionType:
+        tokens, mask = inputs
+        embeds = self.embed(tokens)
+        embeds = self.output_proj(embeds)
+        embeds = (embeds * mask.unsqueeze(-1))
+        return embeds, mask
+
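+# Illustrative usage (editor's sketch, not part of the original module):
+#
+#   conditioner = LUTConditioner(n_bins=1024, dim=64, output_dim=128, tokenizer="noop")
+#   embeds, mask = conditioner(conditioner.tokenize(["Rock", "Jazz", None]))
+#   # embeds: [3, 1, 128]; mask: [3, 1]; the embedding of the missing entry is zeroed out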
+
+class T5Conditioner(TextConditioner):
+    """T5-based TextConditioner.
+
+    Args:
+        name (str): Name of the T5 model.
+        output_dim (int): Output dim of the conditioner.
+        finetune (bool): Whether to fine-tune T5 at train time.
+        device (str): Device for T5 Conditioner.
+        autocast_dtype (tp.Optional[str], optional): Autocast dtype.
+        word_dropout (float, optional): Word dropout probability.
+        normalize_text (bool, optional): Whether to apply text normalization.
+    """
+    MODELS = ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b",
+              "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large",
+              "google/flan-t5-xl", "google/flan-t5-xxl"]
+    MODELS_DIMS = {
+        "t5-small": 512,
+        "t5-base": 768,
+        "t5-large": 1024,
+        "t5-3b": 1024,
+        "t5-11b": 1024,
+        "google/flan-t5-small": 512,
+        "google/flan-t5-base": 768,
+        "google/flan-t5-large": 1024,
+        "google/flan-t5-3b": 1024,
+        "google/flan-t5-11b": 1024,
+    }
+
+    def __init__(self, name: str, output_dim: int, finetune: bool, device: str,
+                 autocast_dtype: tp.Optional[str] = 'float32', word_dropout: float = 0.,
+                 normalize_text: bool = False):
+        assert name in self.MODELS, f"unrecognized t5 model name (should be in {self.MODELS})"
+        super().__init__(self.MODELS_DIMS[name], output_dim)
+        self.device = device
+        self.name = name
+        self.finetune = finetune
+        self.word_dropout = word_dropout
+
+        if autocast_dtype is None or self.device == 'cpu':
+            self.autocast = TorchAutocast(enabled=False)
+            if self.device != 'cpu':
+                logger.warning("T5 has no autocast, this might lead to NaN")
+        else:
+            dtype = getattr(torch, autocast_dtype)
+            assert isinstance(dtype, torch.dtype)
+            logger.info(f"T5 will be evaluated with autocast as {autocast_dtype}")
+            self.autocast = TorchAutocast(enabled=True, device_type=self.device, dtype=dtype)
+        # Let's disable logging temporarily because T5 will vomit some errors otherwise.
+        # thanks https://gist.github.com/simon-weber/7853144
+        previous_level = logging.root.manager.disable
+        logging.disable(logging.ERROR)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            try:
+                self.t5_tokenizer = T5Tokenizer.from_pretrained(name)
+                t5 = T5EncoderModel.from_pretrained(name).train(mode=finetune)
+            finally:
+                logging.disable(previous_level)
+        if finetune:
+            self.t5 = t5
+        else:
+            # this makes sure that the t5 model is not part
+            # of the saved checkpoint
+            self.__dict__["t5"] = t5.to(device)
+
+        self.normalize_text = normalize_text
+        if normalize_text:
+            self.text_normalizer = WhiteSpaceTokenizer(1, lemma=True, stopwords=True)
+
+    def tokenize(self, x: tp.List[tp.Optional[str]]) -> tp.Dict[str, torch.Tensor]:
+        # if current sample doesn't have a certain attribute, replace with empty string
+        entries: tp.List[str] = [xi if xi is not None else "" for xi in x]
+        if self.normalize_text:
+            _, _, entries = self.text_normalizer(entries, return_text=True)
+        if self.word_dropout > 0. and self.training:
+            new_entries = []
+            for entry in entries:
+                words = [word for word in entry.split(" ") if random.random() >= self.word_dropout]
+                new_entries.append(" ".join(words))
+            entries = new_entries
+
+        empty_idx = torch.LongTensor([i for i, xi in enumerate(entries) if xi == ""])
+
+        inputs = self.t5_tokenizer(entries, return_tensors="pt", padding=True).to(self.device)
+        mask = inputs["attention_mask"]
+        mask[empty_idx, :] = 0  # zero out indices where the input is non-existent
+        return inputs
+
+    def forward(self, inputs: tp.Dict[str, torch.Tensor]) -> ConditionType:
+        mask = inputs["attention_mask"]
+        with torch.set_grad_enabled(self.finetune), self.autocast:
+            embeds = self.t5(**inputs).last_hidden_state
+        embeds = self.output_proj(embeds.to(self.output_proj.weight))
+        embeds = (embeds * mask.unsqueeze(-1))
+        return embeds, mask
+
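+# Illustrative usage (editor's sketch, not part of the original module; downloads the t5-small weights):
+#
+#   t5_cond = T5Conditioner(name="t5-small", output_dim=128, finetune=False, device="cpu")
+#   embeds, mask = t5_cond(t5_cond.tokenize(["a calm piano piece", ""]))
+#   # embeds: [2, T_tokens, 128]; the mask row of the empty description is all zeros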
+
+class WaveformConditioner(BaseConditioner):
+    """Base class for all conditioners that take a waveform as input.
+    Classes that inherit must implement `_get_wav_embedding` that outputs
+    a continuous tensor, and `_downsampling_factor` that returns the down-sampling
+    factor of the embedding model.
+
+    Args:
+        dim (int): The internal representation dimension.
+        output_dim (int): Output dimension.
+        device (tp.Union[torch.device, str]): Device.
+    """
+    def __init__(self, dim: int, output_dim: int, device: tp.Union[torch.device, str]):
+        super().__init__(dim, output_dim)
+        self.device = device
+
+    def tokenize(self, wav_length: WavCondition) -> WavCondition:
+        wav, length, path = wav_length
+        assert length is not None
+        return WavCondition(wav.to(self.device), length.to(self.device), path)
+
+    def _get_wav_embedding(self, wav: Tensor) -> Tensor:
+        """Gets as input a wav and returns a dense vector of conditions."""
+        raise NotImplementedError()
+
+    def _downsampling_factor(self):
+        """Returns the downsampling factor of the embedding model."""
+        raise NotImplementedError()
+
+    def forward(self, inputs: WavCondition) -> ConditionType:
+        """
+        Args:
+            inputs (WavCondition): Tuple of (waveform, lengths, path).
+        Returns:
+            ConditionType: Dense vector representing the conditioning along with its mask.
+        """
+        wav, lengths, path = inputs
+        with torch.no_grad():
+            embeds = self._get_wav_embedding(wav)
+        embeds = embeds.to(self.output_proj.weight)
+        embeds = self.output_proj(embeds)
+
+        if lengths is not None:
+            lengths = lengths / self._downsampling_factor()
+            mask = length_to_mask(lengths, max_len=embeds.shape[1]).int()  # type: ignore
+        else:
+            mask = torch.ones_like(embeds)
+        embeds = (embeds * mask.unsqueeze(2).to(self.device))
+
+        return embeds, mask
+
+
+class ChromaStemConditioner(WaveformConditioner):
+    """Chroma conditioner that uses DEMUCS to first filter out drums and bass. This follows the
+    insight that drums and bass often dominate the chroma, leading to a chroma representation that
+    carries little information about the melody.
+
+    Args:
+        output_dim (int): Output dimension for the conditioner.
+        sample_rate (int): Sample rate for the chroma extractor.
+        n_chroma (int): Number of chroma for the chroma extractor.
+        radix2_exp (int): Radix2 exponent for the chroma extractor.
+        duration (float): Duration used during training. This is later used for correct padding
+            in case we are using chroma as prefix.
+        match_len_on_eval (bool, optional): If True then all chromas are padded to the training
+            duration. Defaults to True.
+        eval_wavs (str, optional): Path to a json egg with waveforms; these waveforms are used as
+            conditions during eval (for cases where we don't want to leak test conditions like MusicCaps).
+            Defaults to None.
+        n_eval_wavs (int, optional): Limits the number of waveforms used for conditioning. Defaults to 0.
+        device (tp.Union[torch.device, str], optional): Device for the conditioner.
+        **kwargs: Additional parameters for the chroma extractor.
+    """
+    def __init__(self, output_dim: int, sample_rate: int, n_chroma: int, radix2_exp: int,
+                 duration: float, match_len_on_eval: bool = True, eval_wavs: tp.Optional[str] = None,
+                 n_eval_wavs: int = 0, device: tp.Union[torch.device, str] = "cpu", **kwargs):
+        from demucs import pretrained
+        super().__init__(dim=n_chroma, output_dim=output_dim, device=device)
+        self.autocast = TorchAutocast(enabled=device != "cpu", device_type=self.device, dtype=torch.float32)
+        self.sample_rate = sample_rate
+        self.match_len_on_eval = match_len_on_eval
+        self.duration = duration
+        self.__dict__["demucs"] = pretrained.get_model('htdemucs').to(device)
+        self.stem2idx = {'drums': 0, 'bass': 1, 'other': 2, 'vocal': 3}
+        self.stem_idx = torch.LongTensor([self.stem2idx['vocal'], self.stem2idx['other']]).to(device)
+        self.chroma = ChromaExtractor(sample_rate=sample_rate, n_chroma=n_chroma, radix2_exp=radix2_exp,
+                                      device=device, **kwargs)
+        self.chroma_len = self._get_chroma_len()
+
+    def _downsampling_factor(self):
+        return self.chroma.winhop
+
+    def _get_chroma_len(self):
+        """Get length of chroma during training"""
+        dummy_wav = torch.zeros((1, self.sample_rate * self.duration), device=self.device)
+        dummy_chr = self.chroma(dummy_wav)
+        return dummy_chr.shape[1]
+
+    @torch.no_grad()
+    def _get_filtered_wav(self, wav):
+        from demucs.apply import apply_model
+        from demucs.audio import convert_audio
+        with self.autocast:
+            wav = convert_audio(wav, self.sample_rate, self.demucs.samplerate, self.demucs.audio_channels)
+            stems = apply_model(self.demucs, wav, device=self.device)
+            stems = stems[:, self.stem_idx]  # extract stem
+            stems = stems.sum(1)  # merge extracted stems
+            stems = stems.mean(1, keepdim=True)  # mono
+            stems = convert_audio(stems, self.demucs.samplerate, self.sample_rate, 1)
+            return stems
+
+    @torch.no_grad()
+    def _get_wav_embedding(self, wav):
+        # avoid 0-size tensors when we are working with null conds
+        if wav.shape[-1] == 1:
+            return self.chroma(wav)
+        stems = self._get_filtered_wav(wav)
+        chroma = self.chroma(stems)
+
+        if self.match_len_on_eval:
+            b, t, c = chroma.shape
+            if t > self.chroma_len:
+                chroma = chroma[:, :self.chroma_len]
+                logger.debug(f'chroma was truncated! ({t} -> {chroma.shape[1]})')
+            elif t < self.chroma_len:
+                # chroma = F.pad(chroma, (0, 0, 0, self.chroma_len - t))
+                n_repeat = int(math.ceil(self.chroma_len / t))
+                chroma = chroma.repeat(1, n_repeat, 1)
+                chroma = chroma[:, :self.chroma_len]
+                logger.debug(f'chroma was repeated to match the target length! ({t} -> {chroma.shape[1]})')
+        return chroma
+
+
+class ChromaExtractor(nn.Module):
+    """Chroma extraction class, handles chroma extraction and quantization.
+
+    Args:
+        sample_rate (int): Sample rate.
+        n_chroma (int): Number of chroma to consider.
+        radix2_exp (int): Radix2 exponent.
+        nfft (tp.Optional[int], optional): Number of FFT.
+        winlen (tp.Optional[int], optional): Window length.
+        winhop (tp.Optional[int], optional): Window hop size.
+        argmax (bool, optional): Whether to use argmax. Defaults to False.
+        norm (float, optional): Norm for chroma normalization. Defaults to inf.
+        device (tp.Union[torch.device, str], optional): Device to use. Defaults to cpu.
+    """
+    def __init__(self, sample_rate: int, n_chroma: int = 12, radix2_exp: int = 12,
+                 nfft: tp.Optional[int] = None, winlen: tp.Optional[int] = None, winhop: tp.Optional[int] = None,
+                 argmax: bool = False, norm: float = torch.inf, device: tp.Union[torch.device, str] = "cpu"):
+        super().__init__()
+        from librosa import filters
+        self.device = device
+        self.autocast = TorchAutocast(enabled=device != "cpu", device_type=self.device, dtype=torch.float32)
+        self.winlen = winlen or 2 ** radix2_exp
+        self.nfft = nfft or self.winlen
+        self.winhop = winhop or (self.winlen // 4)
+        self.sr = sample_rate
+        self.n_chroma = n_chroma
+        self.norm = norm
+        self.argmax = argmax
+        self.window = torch.hann_window(self.winlen).to(device)
+        self.fbanks = torch.from_numpy(filters.chroma(sr=sample_rate, n_fft=self.nfft, tuning=0,
+                                                      n_chroma=self.n_chroma)).to(device)
+        self.spec = torchaudio.transforms.Spectrogram(n_fft=self.nfft, win_length=self.winlen,
+                                                      hop_length=self.winhop, power=2, center=True,
+                                                      pad=0, normalized=True).to(device)
+
+    def forward(self, wav):
+        with self.autocast:
+            T = wav.shape[-1]
+            # in case we are getting a wav that was dropped out (nullified)
+            # make sure wav length is no less than nfft
+            if T < self.nfft:
+                pad = self.nfft - T
+                r = 0 if pad % 2 == 0 else 1
+                wav = F.pad(wav, (pad // 2, pad // 2 + r), 'constant', 0)
+                assert wav.shape[-1] == self.nfft, f'expected len {self.nfft} but got {wav.shape[-1]}'
+            spec = self.spec(wav).squeeze(1)
+            raw_chroma = torch.einsum("cf,...ft->...ct", self.fbanks, spec)
+            norm_chroma = torch.nn.functional.normalize(raw_chroma, p=self.norm, dim=-2, eps=1e-6)
+            norm_chroma = rearrange(norm_chroma, "b d t -> b t d")
+
+            if self.argmax:
+                idx = norm_chroma.argmax(-1, keepdims=True)
+                norm_chroma[:] = 0
+                norm_chroma.scatter_(dim=-1, index=idx, value=1)
+
+            return norm_chroma
+
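+# Illustrative usage (editor's sketch, not part of the original module):
+#
+#   extractor = ChromaExtractor(sample_rate=32000, n_chroma=12, radix2_exp=12)
+#   chroma = extractor(torch.randn(2, 1, 32000))   # one second of random mono audio
+#   # chroma: [2, T_frames, 12], normalized along the chroma dimension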
+
+def dropout_condition(sample: ConditioningAttributes, condition_type: str, condition: str):
+    """Utility function for nullifying an attribute inside a ConditioningAttributes object.
+    If the condition is of type "wav", then nullify it using "nullify_condition".
+    If the condition is of any other type, set its value to None.
+    Works in-place.
+    """
+    if condition_type not in ["text", "wav"]:
+        raise ValueError(
+            "dropout_condition got an unexpected condition type!"
+            f" expected 'wav' or 'text' but got '{condition_type}'"
+        )
+
+    if condition not in getattr(sample, condition_type):
+        raise ValueError(
+            "dropout_condition received an unexpected condition!"
+            f" expected wav={sample.wav.keys()} and text={sample.text.keys()}"
+            f" but got '{condition}' of type '{condition_type}'!"
+        )
+
+    if condition_type == "wav":
+        wav, length, path = sample.wav[condition]
+        sample.wav[condition] = nullify_wav(wav)
+    else:
+        sample.text[condition] = None
+
+    return sample
+
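+# Illustrative example (editor's sketch, not part of the original module):
+#
+#   attrs = ConditioningAttributes(text={"genre": "Rock", "description": "A rock song"})
+#   dropout_condition(attrs, "text", "genre")
+#   assert attrs.text == {"genre": None, "description": "A rock song"}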
+
+class DropoutModule(nn.Module):
+    """Base class for all dropout modules."""
+    def __init__(self, seed: int = 1234):
+        super().__init__()
+        self.rng = torch.Generator()
+        self.rng.manual_seed(seed)
+
+
+class AttributeDropout(DropoutModule):
+    """Applies dropout with a given probability per attribute. This is different from the behavior of
+    ClassifierFreeGuidanceDropout as this allows for attributes to be dropped out separately. For example,
+    "artist" can be dropped while "genre" remains. This is in contrast to ClassifierFreeGuidanceDropout
+    where if "artist" is dropped "genre" must also be dropped.
+
+    Args:
+        p (tp.Dict[str, float]): A dict mapping between attributes and dropout probability. For example:
+            ...
+            "genre": 0.1,
+            "artist": 0.5,
+            "wav": 0.25,
+            ...
+        active_on_eval (bool, optional): Whether the dropout is active at eval. Default to False.
+        seed (int, optional): Random seed.
+    """
+    def __init__(self, p: tp.Dict[str, tp.Dict[str, float]], active_on_eval: bool = False, seed: int = 1234):
+        super().__init__(seed=seed)
+        self.active_on_eval = active_on_eval
+        # construct a dict that returns the value from p, otherwise 0
+        self.p = {}
+        for condition_type, probs in p.items():
+            self.p[condition_type] = defaultdict(lambda: 0, probs)
+
+    def forward(self, samples: tp.List[ConditioningAttributes]) -> tp.List[ConditioningAttributes]:
+        """
+        Args:
+            samples (tp.List[ConditioningAttributes]): List of conditions.
+        Returns:
+            tp.List[ConditioningAttributes]: List of conditions after certain attributes were set to None.
+        """
+        if not self.training and not self.active_on_eval:
+            return samples
+
+        samples = deepcopy(samples)
+
+        for condition_type, ps in self.p.items():  # for condition types [text, wav]
+            for condition, p in ps.items():  # for attributes of each type (e.g., [artist, genre])
+                if torch.rand(1, generator=self.rng).item() < p:
+                    for sample in samples:
+                        dropout_condition(sample, condition_type, condition)
+
+        return samples
+
+    def __repr__(self):
+        return f"AttributeDropout({dict(self.p)})"
+
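+# Illustrative usage (editor's sketch, not part of the original module):
+#
+#   att_dropout = AttributeDropout(p={"text": {"artist": 0.5, "genre": 0.1}})
+#   att_dropout.train()   # attribute dropout is only applied in training mode
+#   dropped = att_dropout([ConditioningAttributes(text={"artist": "Queen", "genre": "Rock"})])
+#   # "artist" and "genre" are dropped independently, with probabilities 0.5 and 0.1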
+
+class ClassifierFreeGuidanceDropout(DropoutModule):
+    """Applies Classifier Free Guidance dropout, meaning all attributes
+    are dropped with the same probability.
+
+    Args:
+        p (float): Probability to apply condition dropout during training.
+        seed (int): Random seed.
+    """
+    def __init__(self, p: float, seed: int = 1234):
+        super().__init__(seed=seed)
+        self.p = p
+
+    def forward(self, samples: tp.List[ConditioningAttributes]) -> tp.List[ConditioningAttributes]:
+        """
+        Args:
+            samples (tp.List[ConditioningAttributes]): List of conditions.
+        Returns:
+            tp.List[ConditioningAttributes]: List of conditions after all attributes were set to None.
+        """
+        if not self.training:
+            return samples
+
+        # decide on which attributes to drop in a batched fashion
+        drop = torch.rand(1, generator=self.rng).item() < self.p
+        if not drop:
+            return samples
+
+        # nullify conditions of all attributes
+        samples = deepcopy(samples)
+
+        for condition_type in ["wav", "text"]:
+            for sample in samples:
+                for condition in sample.attributes[condition_type]:
+                    dropout_condition(sample, condition_type, condition)
+
+        return samples
+
+    def __repr__(self):
+        return f"ClassifierFreeGuidanceDropout(p={self.p})"
+
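+# Illustrative usage (editor's sketch, not part of the original module):
+#
+#   cfg_dropout = ClassifierFreeGuidanceDropout(p=0.3)
+#   cfg_dropout.train()   # dropout is only applied in training mode
+#   dropped = cfg_dropout([ConditioningAttributes(text={"artist": "Queen", "genre": "Rock"})])
+#   # with probability 0.3 every attribute of every sample is set to None, otherwise samples are returned unchanged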
+
+class ConditioningProvider(nn.Module):
+    """Main class to provide conditions given all the supported conditioners.
+
+    Args:
+        conditioners (dict): Dictionary of conditioners.
+        merge_text_conditions_p (float, optional): Probability to merge all text sources
+            into a single text condition. Defaults to 0.
+        drop_desc_p (float, optional): Probability to drop the original description
+            when merging all text sources into a single text condition. Defaults to 0.
+        device (tp.Union[torch.device, str], optional): Device for conditioners and output condition types.
+    """
+    def __init__(
+        self,
+        conditioners: tp.Dict[str, BaseConditioner],
+        merge_text_conditions_p: float = 0,
+        drop_desc_p: float = 0,
+        device: tp.Union[torch.device, str] = "cpu",
+    ):
+        super().__init__()
+        self.device = device
+        self.merge_text_conditions_p = merge_text_conditions_p
+        self.drop_desc_p = drop_desc_p
+        self.conditioners = nn.ModuleDict(conditioners)
+
+    @property
+    def text_conditions(self):
+        return [k for k, v in self.conditioners.items() if isinstance(v, TextConditioner)]
+
+    @property
+    def wav_conditions(self):
+        return [k for k, v in self.conditioners.items() if isinstance(v, WaveformConditioner)]
+
+    @property
+    def has_wav_condition(self):
+        return len(self.wav_conditions) > 0
+
+    def tokenize(self, inputs: tp.List[ConditioningAttributes]) -> tp.Dict[str, tp.Any]:
+        """Match attributes/wavs with existing conditioners in self, and compute tokenize them accordingly.
+        This should be called before starting any real GPU work to avoid synchronization points.
+        This will return a dict matching conditioner names to their arbitrary tokenized representations.
+
+        Args:
+            inputs (list[ConditioningAttributes]): List of ConditioningAttributes objects containing
+                text and wav conditions.
+        """
+        assert all([type(x) == ConditioningAttributes for x in inputs]), \
+            "got unexpected types input for conditioner! should be tp.List[ConditioningAttributes]" \
+            f" but types were {set([type(x) for x in inputs])}"
+
+        output = {}
+        text = self._collate_text(inputs)
+        wavs = self._collate_wavs(inputs)
+
+        assert set(text.keys() | wavs.keys()).issubset(set(self.conditioners.keys())), \
+            f"got an unexpected attribute! Expected {self.conditioners.keys()}, got {text.keys(), wavs.keys()}"
+
+        for attribute, batch in chain(text.items(), wavs.items()):
+            output[attribute] = self.conditioners[attribute].tokenize(batch)
+        return output
+
+    def forward(self, tokenized: tp.Dict[str, tp.Any]) -> tp.Dict[str, ConditionType]:
+        """Compute pairs of `(embedding, mask)` using the configured conditioners
+        and the tokenized representations. The output is for example:
+
+            {
+                "genre": (torch.Tensor([B, 1, D_genre]), torch.Tensor([B, 1])),
+                "description": (torch.Tensor([B, T_desc, D_desc]), torch.Tensor([B, T_desc])),
+                ...
+            }
+
+        Args:
+            tokenized (dict): Dict of tokenized representations as returned by `tokenize()`.
+        """
+        output = {}
+        for attribute, inputs in tokenized.items():
+            condition, mask = self.conditioners[attribute](inputs)
+            output[attribute] = (condition, mask)
+        return output
+
+    def _collate_text(self, samples: tp.List[ConditioningAttributes]) -> tp.Dict[str, tp.List[tp.Optional[str]]]:
+        """Given a list of ConditioningAttributes objects, compile a dictionary where the keys
+        are the attributes and the values are the aggregated input per attribute.
+        For example:
+        Input:
+        [
+            ConditioningAttributes(text={"genre": "Rock", "description": "A rock song with a guitar solo"}, wav=...),
+            ConditioningAttributes(text={"genre": "Hip-hop", "description": "A hip-hop verse"}, wav=...),
+        ]
+        Output:
+        {
+            "genre": ["Rock", "Hip-hop"],
+            "description": ["A rock song with a guitar solo", "A hip-hop verse"]
+        }
+        """
+        batch_per_attribute: tp.Dict[str, tp.List[tp.Optional[str]]] = defaultdict(list)
+
+        def _merge_conds(cond, merge_text_conditions_p=0, drop_desc_p=0):
+            def is_valid(k, v):
+                k_valid = k in ['key', 'bpm', 'genre', 'moods', 'instrument']
+                v_valid = v is not None and isinstance(v, (int, float, str, list))
+                return k_valid and v_valid
+
+            def process_value(v):
+                if isinstance(v, (int, float, str)):
+                    return v
+                if isinstance(v, list):
+                    return ", ".join(v)
+                else:
+                    raise RuntimeError(f"unknown type for text value! ({type(v), v})")
+
+            desc = cond.text['description']
+            meta_data = ""
+            if random.uniform(0, 1) < merge_text_conditions_p:
+                meta_pairs = [f'{k}: {process_value(v)}' for k, v in cond.text.items() if is_valid(k, v)]
+                random.shuffle(meta_pairs)
+                meta_data = ". ".join(meta_pairs)
+                desc = desc if not random.uniform(0, 1) < drop_desc_p else None
+
+            if desc is None:
+                desc = meta_data if len(meta_data) > 1 else None
+            else:
+                desc = desc.rstrip('.') + ". " + meta_data
+            cond.text['description'] = desc.strip() if desc else None
+
+        if self.training and self.merge_text_conditions_p:
+            for sample in samples:
+                _merge_conds(sample, self.merge_text_conditions_p, self.drop_desc_p)
+
+        texts = [x.text for x in samples]
+        for text in texts:
+            for condition in self.text_conditions:
+                batch_per_attribute[condition].append(text[condition])
+
+        return batch_per_attribute
+
+    def _collate_wavs(self, samples: tp.List[ConditioningAttributes]):
+        """Generate a dict where the keys are attributes by which we fetch similar wavs,
+        and the values are Tensors of wavs according to said attributes.
+
+        *Note*: by the time the samples reach this function, each sample should have some waveform
+        inside the "wav" attribute. It should be either:
+        1. A real waveform
+        2. A null waveform due to the sample having no similar waveforms (nullified by the dataset)
+        3. A null waveform due to it being dropped in a dropout module (nullified by dropout)
+
+        Args:
+            samples (tp.List[ConditioningAttributes]): List of ConditioningAttributes samples.
+        Returns:
+            dict: A dictionary mapping an attribute name to wavs.
+        """
+        wavs = defaultdict(list)
+        lens = defaultdict(list)
+        paths = defaultdict(list)
+        out = {}
+
+        for sample in samples:
+            for attribute in self.wav_conditions:
+                wav, length, path = sample.wav[attribute]
+                wavs[attribute].append(wav.flatten())
+                lens[attribute].append(length)
+                paths[attribute].append(path)
+
+        # stack all wavs to a single tensor
+        for attribute in self.wav_conditions:
+            stacked_wav, _ = collate(wavs[attribute], dim=0)
+            out[attribute] = WavCondition(stacked_wav.unsqueeze(1),
+                                          torch.cat(lens['self_wav']), paths[attribute])  # type: ignore
+
+        return out
+
+
+class ConditionFuser(StreamingModule):
+    """Condition fuser handles the logic to combine the different conditions
+    with the actual model input.
+
+    Args:
+        fuse2cond (tp.Dict[str, tp.List[str]]): A dictionary that maps each fusing method to
+            the list of conditions fused with that method. For example:
+            {
+                "prepend": ["description"],
+                "sum": ["genre", "bpm"],
+                "cross": ["description"],
+            }
+        cross_attention_pos_emb (bool, optional): Use positional embeddings in cross attention.
+        cross_attention_pos_emb_scale (int): Scale for positional embeddings in cross attention if used.
+    """
+    FUSING_METHODS = ["sum", "prepend", "cross", "input_interpolate"]
+
+    def __init__(self, fuse2cond: tp.Dict[str, tp.List[str]], cross_attention_pos_emb: bool = False,
+                 cross_attention_pos_emb_scale: float = 1.0):
+        super().__init__()
+        assert all(
+            [k in self.FUSING_METHODS for k in fuse2cond.keys()]
+        ), f"got invalid fuse method, allowed methods: {self.FUSING_MEHTODS}"
+        self.cross_attention_pos_emb = cross_attention_pos_emb
+        self.cross_attention_pos_emb_scale = cross_attention_pos_emb_scale
+        self.fuse2cond: tp.Dict[str, tp.List[str]] = fuse2cond
+        self.cond2fuse: tp.Dict[str, str] = {}
+        for fuse_method, conditions in fuse2cond.items():
+            for condition in conditions:
+                self.cond2fuse[condition] = fuse_method
+
+    def forward(
+        self,
+        input: Tensor,
+        conditions: tp.Dict[str, ConditionType]
+    ) -> tp.Tuple[Tensor, tp.Optional[Tensor]]:
+        """Fuse the conditions to the provided model input.
+
+        Args:
+            input (Tensor): Transformer input.
+            conditions (tp.Dict[str, ConditionType]): Dict of conditions.
+        Returns:
+            tp.Tuple[Tensor, Tensor]: The first tensor is the transformer input
+                after the conditions have been fused. The second output tensor is the tensor
+                used for cross-attention or None if no cross attention inputs exist.
+        """
+        B, T, _ = input.shape
+
+        if 'offsets' in self._streaming_state:
+            first_step = False
+            offsets = self._streaming_state['offsets']
+        else:
+            first_step = True
+            offsets = torch.zeros(input.shape[0], dtype=torch.long, device=input.device)
+
+        assert set(conditions.keys()).issubset(set(self.cond2fuse.keys())), \
+            f"given conditions contain unknown attributes for fuser, " \
+            f"expected {self.cond2fuse.keys()}, got {conditions.keys()}"
+        cross_attention_output = None
+        for cond_type, (cond, cond_mask) in conditions.items():
+            op = self.cond2fuse[cond_type]
+            if op == "sum":
+                input += cond
+            elif op == "input_interpolate":
+                cond = rearrange(cond, "b t d -> b d t")
+                cond = F.interpolate(cond, size=input.shape[1])
+                input += rearrange(cond, "b d t -> b t d")
+            elif op == "prepend":
+                if first_step:
+                    input = torch.cat([cond, input], dim=1)
+            elif op == "cross":
+                if cross_attention_output is not None:
+                    cross_attention_output = torch.cat([cross_attention_output, cond], dim=1)
+                else:
+                    cross_attention_output = cond
+            else:
+                raise ValueError(f"unknown op ({op})")
+
+        if self.cross_attention_pos_emb and cross_attention_output is not None:
+            positions = torch.arange(
+                cross_attention_output.shape[1],
+                device=cross_attention_output.device
+            ).view(1, -1, 1)
+            pos_emb = create_sin_embedding(positions, cross_attention_output.shape[-1])
+            cross_attention_output = cross_attention_output + self.cross_attention_pos_emb_scale * pos_emb
+
+        if self._is_streaming:
+            self._streaming_state['offsets'] = offsets + T
+
+        return input, cross_attention_output
+
+
+
+
+
+
+
+

Functions

+
+
+def dropout_condition(sample: ConditioningAttributes, condition_type: str, condition: str) +
+
+

Utility function for nullifying an attribute inside a ConditioningAttributes object. +If the condition is of type "wav", then nullify it using "nullify_condition". +If the condition is of any other type, set its value to None. +Works in-place.

+
+ +Expand source code + +
def dropout_condition(sample: ConditioningAttributes, condition_type: str, condition: str):
+    """Utility function for nullifying an attribute inside an ConditioningAttributes object.
+    If the condition is of type "wav", then nullify it using "nullify_condition".
+    If the condition is of any other type, set its value to None.
+    Works in-place.
+    """
+    if condition_type not in ["text", "wav"]:
+        raise ValueError(
+            "dropout_condition got an unexpected condition type!"
+            f" expected 'wav' or 'text' but got '{condition_type}'"
+        )
+
+    if condition not in getattr(sample, condition_type):
+        raise ValueError(
+            "dropout_condition received an unexpected condition!"
+            f" expected wav={sample.wav.keys()} and text={sample.text.keys()}"
+            f"but got '{condition}' of type '{condition_type}'!"
+        )
+
+    if condition_type == "wav":
+        wav, length, path = sample.wav[condition]
+        sample.wav[condition] = nullify_wav(wav)
+    else:
+        sample.text[condition] = None
+
+    return sample
+
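For illustration, a minimal usage sketch (the attribute names, waveform shape and path below are made up):

import torch
from audiocraft.modules.conditioners import (
    ConditioningAttributes, WavCondition, dropout_condition)

sample = ConditioningAttributes(
    text={"genre": "Rock", "description": "A rock song"},
    wav={"self_wav": WavCondition(torch.randn(1, 16000),          # [B, T] waveform
                                  torch.tensor([16000]), ["some/path.wav"])},
)
dropout_condition(sample, "text", "genre")    # sets sample.text["genre"] to None
dropout_condition(sample, "wav", "self_wav")  # replaces the wav with a nullified WavCondition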
+
+
+def nullify_condition(condition: Tuple[torch.Tensor, torch.Tensor], dim: int = 1) +
+
+

This function transforms an input condition to a null condition. +This is done by converting it to a single zero vector, similarly +to how it is done inside WhiteSpaceTokenizer and NoopTokenizer.

+

Args

+
+
condition : ConditionType
+
a tuple of condition and mask (tp.Tuple[Tensor, Tensor])
+
dim : int
+
the dimension that will be truncated (should be the time dimension)
+
+

WARNING!: dim should not be the batch dimension!

+

Returns

+
+
ConditionType
+
a tuple of null condition and mask
+
+
+ +Expand source code + +
def nullify_condition(condition: ConditionType, dim: int = 1):
+    """This function transforms an input condition to a null condition.
+    This is done by converting it to a single zero vector, similarly
+    to how it is done inside WhiteSpaceTokenizer and NoopTokenizer.
+
+    Args:
+        condition (ConditionType): a tuple of condition and mask (tp.Tuple[Tensor, Tensor])
+        dim (int): the dimension that will be truncated (should be the time dimension)
+        WARNING!: dim should not be the batch dimension!
+    Returns:
+        ConditionType: a tuple of null condition and mask
+    """
+    assert dim != 0, "dim cannot be the batch dimension!"
+    assert type(condition) == tuple and \
+        type(condition[0]) == Tensor and \
+        type(condition[1]) == Tensor, "'nullify_condition' got an unexpected input type!"
+    cond, mask = condition
+    B = cond.shape[0]
+    last_dim = cond.dim() - 1
+    out = cond.transpose(dim, last_dim)
+    out = 0. * out[..., :1]
+    out = out.transpose(dim, last_dim)
+    mask = torch.zeros((B, 1), device=out.device).int()
+    assert cond.dim() == out.dim()
+    return out, mask
+
+
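A small sanity-check sketch of the shapes involved (values are arbitrary): a [B, T, D] condition is collapsed to a single all-zero step along the time dimension, together with an all-zero mask.

import torch
from audiocraft.modules.conditioners import nullify_condition

cond = torch.randn(2, 8, 64)               # [B, T, D] condition embedding
mask = torch.ones(2, 8, dtype=torch.int)   # [B, T] padding mask
null_cond, null_mask = nullify_condition((cond, mask), dim=1)
print(null_cond.shape, null_mask.shape)    # torch.Size([2, 1, 64]) torch.Size([2, 1])
print(null_cond.abs().sum().item(), null_mask.sum().item())  # 0.0 0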
+
+def nullify_wav(wav: torch.Tensor) ‑> WavCondition +
+
+

Create a nullified WavCondition from a wav tensor with appropriate shape.

+

Args

+
+
wav : Tensor
+
tensor of shape [B, T]
+
+

Returns

+
+
WavCondition
+
wav condition with nullified wav.
+
+
+ +Expand source code + +
def nullify_wav(wav: Tensor) -> WavCondition:
+    """Create a nullified WavCondition from a wav tensor with appropriate shape.
+
+    Args:
+        wav (Tensor): tensor of shape [B, T]
+    Returns:
+        WavCondition: wav condition with nullified wav.
+    """
+    null_wav, _ = nullify_condition((wav, torch.zeros_like(wav)), dim=wav.dim() - 1)
+    return WavCondition(
+        wav=null_wav,
+        length=torch.tensor([0] * wav.shape[0], device=wav.device),
+        path=['null_wav'] * wav.shape[0]
+    )
+
+
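A quick sketch of the resulting WavCondition (batch size and length are arbitrary):

import torch
from audiocraft.modules.conditioners import nullify_wav

wav = torch.randn(4, 32000)   # [B, T] batch of waveforms
null = nullify_wav(wav)
print(null.wav.shape)         # torch.Size([4, 1]), one zero sample per batch item
print(null.length.tolist())   # [0, 0, 0, 0]
print(null.path)              # ['null_wav', 'null_wav', 'null_wav', 'null_wav']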
+
+
+
+

Classes

+
+
+class AttributeDropout +(p: Dict[str, Dict[str, float]], active_on_eval: bool = False, seed: int = 1234) +
+
+

Applies dropout with a given probability per attribute. This is different from the behavior of +ClassifierFreeGuidanceDropout as this allows for attributes to be dropped out separately. For example, +"artist" can be dropped while "genre" remains. This is in contrast to ClassifierFreeGuidanceDropout +where if "artist" is dropped "genre" must also be dropped.

+

Args

+
+
p : tp.Dict[str, tp.Dict[str, float]]
+
A dict mapping, for each condition type, attribute names to their dropout probability. For example: +… +"genre": 0.1, +"artist": 0.5, +"wav": 0.25, +…
+
active_on_eval : bool, optional
+
Whether the dropout is active at eval. Defaults to False.
+
seed : int, optional
+
Random seed.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class AttributeDropout(DropoutModule):
+    """Applies dropout with a given probability per attribute. This is different from the behavior of
+    ClassifierFreeGuidanceDropout as this allows for attributes to be dropped out separately. For example,
+    "artist" can be dropped while "genre" remains. This is in contrast to ClassifierFreeGuidanceDropout
+    where if "artist" is dropped "genre" must also be dropped.
+
+    Args:
+        p (tp.Dict[str, tp.Dict[str, float]]): A dict mapping, for each condition type, attribute
+            names to their dropout probability. For example:
+            ...
+            "genre": 0.1,
+            "artist": 0.5,
+            "wav": 0.25,
+            ...
+        active_on_eval (bool, optional): Whether the dropout is active at eval. Defaults to False.
+        seed (int, optional): Random seed.
+    """
+    def __init__(self, p: tp.Dict[str, tp.Dict[str, float]], active_on_eval: bool = False, seed: int = 1234):
+        super().__init__(seed=seed)
+        self.active_on_eval = active_on_eval
+        # construct dict that returns the values from p, otherwise 0
+        self.p = {}
+        for condition_type, probs in p.items():
+            self.p[condition_type] = defaultdict(lambda: 0, probs)
+
+    def forward(self, samples: tp.List[ConditioningAttributes]) -> tp.List[ConditioningAttributes]:
+        """
+        Args:
+            samples (tp.List[ConditioningAttributes]): List of conditions.
+        Returns:
+            tp.List[ConditioningAttributes]: List of conditions after certain attributes were set to None.
+        """
+        if not self.training and not self.active_on_eval:
+            return samples
+
+        samples = deepcopy(samples)
+
+        for condition_type, ps in self.p.items():  # for condition types [text, wav]
+            for condition, p in ps.items():  # for attributes of each type (e.g., [artist, genre])
+                if torch.rand(1, generator=self.rng).item() < p:
+                    for sample in samples:
+                        dropout_condition(sample, condition_type, condition)
+
+        return samples
+
+    def __repr__(self):
+        return f"AttributeDropout({dict(self.p)})"
+
+

Ancestors

+ +

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def forward(self, samples: List[ConditioningAttributes]) ‑> List[ConditioningAttributes] +
+
+

Args

+
+
samples : tp.List[ConditioningAttributes]
+
List of conditions.
+
+

Returns

+
+
tp.List[ConditioningAttributes]
+
List of conditions after certain attributes were set to None.
+
+
+ +Expand source code + +
def forward(self, samples: tp.List[ConditioningAttributes]) -> tp.List[ConditioningAttributes]:
+    """
+    Args:
+        samples (tp.List[ConditioningAttributes]): List of conditions.
+    Returns:
+        tp.List[ConditioningAttributes]: List of conditions after certain attributes were set to None.
+    """
+    if not self.training and not self.active_on_eval:
+        return samples
+
+    samples = deepcopy(samples)
+
+    for condition_type, ps in self.p.items():  # for condition types [text, wav]
+        for condition, p in ps.items():  # for attributes of each type (e.g., [artist, genre])
+            if torch.rand(1, generator=self.rng).item() < p:
+                for sample in samples:
+                    dropout_condition(sample, condition_type, condition)
+
+    return samples
+
+
+
+
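A usage sketch for AttributeDropout (the probabilities and attribute values are purely illustrative): "artist" and "genre" can be dropped independently, and attributes missing from `p` are never dropped.

import torch
from audiocraft.modules.conditioners import AttributeDropout, ConditioningAttributes

dropout = AttributeDropout(p={"text": {"artist": 0.5, "genre": 0.1}})
dropout.train()  # a no-op in eval mode unless active_on_eval=True

batch = [ConditioningAttributes(
    text={"artist": "Someone", "genre": "Rock", "description": "A rock song"})]
batch = dropout(batch)
print(batch[0].text)  # e.g. {'artist': None, 'genre': 'Rock', 'description': 'A rock song'}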
+
+class BaseConditioner +(dim, output_dim) +
+
+

Base model for all conditioner modules. We allow the output dim to be different +than the hidden dim for two reasons: 1) keep our LUTs small when the vocab is large; +2) make all condition dims consistent.

+

Args

+
+
dim : int
+
Hidden dim of the model (text-encoder/LUT).
+
output_dim : int
+
Output dim of the conditioner.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class BaseConditioner(nn.Module):
+    """Base model for all conditioner modules. We allow the output dim to be different
+    than the hidden dim for two reasons: 1) keep our LUTs small when the vocab is large;
+    2) make all condition dims consistent.
+
+    Args:
+        dim (int): Hidden dim of the model (text-encoder/LUT).
+        output_dim (int): Output dim of the conditioner.
+    """
+    def __init__(self, dim, output_dim):
+        super().__init__()
+        self.dim = dim
+        self.output_dim = output_dim
+        self.output_proj = nn.Linear(dim, output_dim)
+
+    def tokenize(self, *args, **kwargs) -> tp.Any:
+        """Should be any part of the processing that will lead to a synchronization
+        point, e.g. BPE tokenization with transfer to the GPU.
+
+        The returned value will be saved and returned later when calling forward().
+        """
+        raise NotImplementedError()
+
+    def forward(self, inputs: tp.Any) -> ConditionType:
+        """Gets input that should be used as conditioning (e.g, genre, description or a waveform).
+        Outputs a ConditionType, after the input data was embedded as a dense vector.
+
+        Returns:
+            ConditionType:
+                - A tensor of size [B, T, D] where B is the batch size, T is the length of the
+                  output embedding and D is the dimension of the embedding.
+                - And a mask indicating where the padding tokens are.
+        """
+        raise NotImplementedError()
+
+

Ancestors

+
    +
  • torch.nn.modules.module.Module
  • +
+

Subclasses

+ +

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def forward(self, inputs: Any) ‑> Tuple[torch.Tensor, torch.Tensor] +
+
+

Gets input that should be used as conditioning (e.g., genre, description or a waveform). +Outputs a ConditionType, after the input data was embedded as a dense vector.

+

Returns

+

ConditionType: +- A tensor of size [B, T, D] where B is the batch size, T is the length of the +output embedding and D is the dimension of the embedding. +- And a mask indicating where the padding tokens are.

+
+ +Expand source code + +
def forward(self, inputs: tp.Any) -> ConditionType:
+    """Gets input that should be used as conditioning (e.g, genre, description or a waveform).
+    Outputs a ConditionType, after the input data was embedded as a dense vector.
+
+    Returns:
+        ConditionType:
+            - A tensor of size [B, T, D] where B is the batch size, T is the length of the
+              output embedding and D is the dimension of the embedding.
+            - And a mask indicating where the padding tokens are.
+    """
+    raise NotImplementedError()
+
+
+
+def tokenize(self, *args, **kwargs) ‑> Any +
+
+

Should be any part of the processing that will lead to a synchronization +point, e.g. BPE tokenization with transfer to the GPU.

+

The returned value will be saved and returned later when calling forward().

+
+ +Expand source code + +
def tokenize(self, *args, **kwargs) -> tp.Any:
+    """Should be any part of the processing that will lead to a synchronization
+    point, e.g. BPE tokenization with transfer to the GPU.
+
+    The returned value will be saved and returned later when calling forward().
+    """
+    raise NotImplementedError()
+
+
+
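To make the tokenize/forward split concrete, here is a hypothetical minimal subclass (ConstantConditioner is not part of the library; it simply returns a learned constant embedding for any input):

import typing as tp
import torch
from torch import nn
from audiocraft.modules.conditioners import BaseConditioner

class ConstantConditioner(BaseConditioner):
    """Hypothetical conditioner returning a learned constant embedding."""
    def __init__(self, dim: int, output_dim: int):
        super().__init__(dim, output_dim)
        self.embed = nn.Parameter(torch.zeros(1, 1, dim))

    def tokenize(self, batch: tp.List[tp.Optional[str]]) -> int:
        # cheap CPU-side work: here we only need the batch size
        return len(batch)

    def forward(self, batch_size: int):
        emb = self.output_proj(self.embed.expand(batch_size, -1, -1))  # [B, 1, output_dim]
        mask = torch.ones(batch_size, 1, device=emb.device)            # [B, 1]
        return emb, mask

cond = ConstantConditioner(dim=16, output_dim=32)
emb, mask = cond(cond.tokenize(["rock", None, "jazz"]))
print(emb.shape, mask.shape)  # torch.Size([3, 1, 32]) torch.Size([3, 1])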
+
+
+class ChromaExtractor +(sample_rate: int, n_chroma: int = 12, radix2_exp: int = 12, nfft: Optional[int] = None, winlen: Optional[int] = None, winhop: Optional[int] = None, argmax: bool = False, norm: float = inf, device: Union[torch.device, str] = 'cpu') +
+
+

Chroma extraction class, handles chroma extraction and quantization.

+

Args

+
+
sample_rate : int
+
Sample rate.
+
n_chroma : int
+
Number of chroma to consider.
+
radix2_exp : int
+
Radix2 exponent.
+
nfft : tp.Optional[int], optional
+
Number of FFT.
+
winlen : tp.Optional[int], optional
+
Window length.
+
winhop : tp.Optional[int], optional
+
Window hop size.
+
argmax : bool, optional
+
Whether to use argmax. Defaults to False.
+
norm : float, optional
+
Norm for chroma normalization. Defaults to inf.
+
device : tp.Union[torch.device, str], optional
+
Device to use. Defaults to cpu.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class ChromaExtractor(nn.Module):
+    """Chroma extraction class, handles chroma extraction and quantization.
+
+    Args:
+        sample_rate (int): Sample rate.
+        n_chroma (int): Number of chroma to consider.
+        radix2_exp (int): Radix2 exponent.
+        nfft (tp.Optional[int], optional): Number of FFT.
+        winlen (tp.Optional[int], optional): Window length.
+        winhop (tp.Optional[int], optional): Window hop size.
+        argmax (bool, optional): Whether to use argmax. Defaults to False.
+        norm (float, optional): Norm for chroma normalization. Defaults to inf.
+        device (tp.Union[torch.device, str], optional): Device to use. Defaults to cpu.
+    """
+    def __init__(self, sample_rate: int, n_chroma: int = 12, radix2_exp: int = 12,
+                 nfft: tp.Optional[int] = None, winlen: tp.Optional[int] = None, winhop: tp.Optional[int] = None,
+                 argmax: bool = False, norm: float = torch.inf, device: tp.Union[torch.device, str] = "cpu"):
+        super().__init__()
+        from librosa import filters
+        self.device = device
+        self.autocast = TorchAutocast(enabled=device != "cpu", device_type=self.device, dtype=torch.float32)
+        self.winlen = winlen or 2 ** radix2_exp
+        self.nfft = nfft or self.winlen
+        self.winhop = winhop or (self.winlen // 4)
+        self.sr = sample_rate
+        self.n_chroma = n_chroma
+        self.norm = norm
+        self.argmax = argmax
+        self.window = torch.hann_window(self.winlen).to(device)
+        self.fbanks = torch.from_numpy(filters.chroma(sr=sample_rate, n_fft=self.nfft, tuning=0,
+                                                      n_chroma=self.n_chroma)).to(device)
+        self.spec = torchaudio.transforms.Spectrogram(n_fft=self.nfft, win_length=self.winlen,
+                                                      hop_length=self.winhop, power=2, center=True,
+                                                      pad=0, normalized=True).to(device)
+
+    def forward(self, wav):
+        with self.autocast:
+            T = wav.shape[-1]
+            # in case we are getting a wav that was dropped out (nullified)
+            # make sure wav length is no less than nfft
+            if T < self.nfft:
+                pad = self.nfft - T
+                r = 0 if pad % 2 == 0 else 1
+                wav = F.pad(wav, (pad // 2, pad // 2 + r), 'constant', 0)
+                assert wav.shape[-1] == self.nfft, f'expected len {self.nfft} but got {wav.shape[-1]}'
+            spec = self.spec(wav).squeeze(1)
+            raw_chroma = torch.einsum("cf,...ft->...ct", self.fbanks, spec)
+            norm_chroma = torch.nn.functional.normalize(raw_chroma, p=self.norm, dim=-2, eps=1e-6)
+            norm_chroma = rearrange(norm_chroma, "b d t -> b t d")
+
+            if self.argmax:
+                idx = norm_chroma.argmax(-1, keepdims=True)
+                norm_chroma[:] = 0
+                norm_chroma.scatter_(dim=-1, index=idx, value=1)
+
+            return norm_chroma
+
+

Ancestors

+
    +
  • torch.nn.modules.module.Module
  • +
+

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def forward(self, wav) ‑> Callable[..., Any] +
+
+

Defines the computation performed at every call.

+

Should be overridden by all subclasses.

+
+

Note

+

Although the recipe for forward pass needs to be defined within +this function, one should call the :class:Module instance afterwards +instead of this since the former takes care of running the +registered hooks while the latter silently ignores them.

+
+
+ +Expand source code + +
def forward(self, wav):
+    with self.autocast:
+        T = wav.shape[-1]
+        # in case we are getting a wav that was dropped out (nullified)
+        # make sure wav length is no less than nfft
+        if T < self.nfft:
+            pad = self.nfft - T
+            r = 0 if pad % 2 == 0 else 1
+            wav = F.pad(wav, (pad // 2, pad // 2 + r), 'constant', 0)
+            assert wav.shape[-1] == self.nfft, f'expected len {self.nfft} but got {wav.shape[-1]}'
+        spec = self.spec(wav).squeeze(1)
+        raw_chroma = torch.einsum("cf,...ft->...ct", self.fbanks, spec)
+        norm_chroma = torch.nn.functional.normalize(raw_chroma, p=self.norm, dim=-2, eps=1e-6)
+        norm_chroma = rearrange(norm_chroma, "b d t -> b t d")
+
+        if self.argmax:
+            idx = norm_chroma.argmax(-1, keepdims=True)
+            norm_chroma[:] = 0
+            norm_chroma.scatter_(dim=-1, index=idx, value=1)
+
+        return norm_chroma
+
+
+
+
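A minimal sketch of extracting chroma features (requires librosa for the filter bank; the sample rate, duration and argmax choice below are arbitrary):

import torch
from audiocraft.modules.conditioners import ChromaExtractor

chroma = ChromaExtractor(sample_rate=32000, n_chroma=12, radix2_exp=12, argmax=True)
wav = torch.randn(2, 1, 32000 * 2)  # [B, C, T], two seconds of (random) mono audio
feats = chroma(wav)
print(feats.shape)  # [B, n_frames, n_chroma]; one-hot per frame because argmax=True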
+
+class ChromaStemConditioner +(output_dim: int, sample_rate: int, n_chroma: int, radix2_exp: int, duration: float, match_len_on_eval: bool = True, eval_wavs: Optional[str] = None, n_eval_wavs: int = 0, device: Union[torch.device, str] = 'cpu', **kwargs) +
+
+

Chroma conditioner that uses DEMUCS to first filter out drums and bass. This follows +the insight that drums and bass often dominate the chroma, leaving the chroma with little +information about the melody.

+

Args

+
+
output_dim : int
+
Output dimension for the conditioner.
+
sample_rate : int
+
Sample rate for the chroma extractor.
+
n_chroma : int
+
Number of chroma for the chroma extractor.
+
radix2_exp : int
+
Radix2 exponent for the chroma extractor.
+
duration : float
+
Duration used during training. This is later used for correct padding +in case we are using chroma as prefix.
+
match_len_on_eval : bool, optional
+
If True then all chromas are padded to the training +duration. Defaults to True.
+
eval_wavs : str, optional
+
Path to a json egg with waveforms; these waveforms are used as +conditions during eval (for cases where we don't want to leak test conditions like MusicCaps). +Defaults to None.
+
n_eval_wavs : int, optional
+
Limits the number of waveforms used for conditioning. Defaults to 0.
+
device : tp.Union[torch.device, str], optional
+
Device for the conditioner.
+
**kwargs
+
Additional parameters for the chroma extractor.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class ChromaStemConditioner(WaveformConditioner):
+    """Chroma conditioner that uses DEMUCS to first filter out drums and bass. The is followed by
+    the insight the drums and bass often dominate the chroma, leading to the chroma not containing the
+    information about melody.
+
+    Args:
+        output_dim (int): Output dimension for the conditioner.
+        sample_rate (int): Sample rate for the chroma extractor.
+        n_chroma (int): Number of chroma for the chroma extractor.
+        radix2_exp (int): Radix2 exponent for the chroma extractor.
+        duration (float): Duration used during training. This is later used for correct padding
+            in case we are using chroma as prefix.
+        match_len_on_eval (bool, optional): If True then all chromas are padded to the training
+            duration. Defaults to True.
+        eval_wavs (str, optional): Path to a json egg with waveforms; these waveforms are used as
+            conditions during eval (for cases where we don't want to leak test conditions like MusicCaps).
+            Defaults to None.
+        n_eval_wavs (int, optional): Limits the number of waveforms used for conditioning. Defaults to 0.
+        device (tp.Union[torch.device, str], optional): Device for the conditioner.
+        **kwargs: Additional parameters for the chroma extractor.
+    """
+    def __init__(self, output_dim: int, sample_rate: int, n_chroma: int, radix2_exp: int,
+                 duration: float, match_len_on_eval: bool = True, eval_wavs: tp.Optional[str] = None,
+                 n_eval_wavs: int = 0, device: tp.Union[torch.device, str] = "cpu", **kwargs):
+        from demucs import pretrained
+        super().__init__(dim=n_chroma, output_dim=output_dim, device=device)
+        self.autocast = TorchAutocast(enabled=device != "cpu", device_type=self.device, dtype=torch.float32)
+        self.sample_rate = sample_rate
+        self.match_len_on_eval = match_len_on_eval
+        self.duration = duration
+        self.__dict__["demucs"] = pretrained.get_model('htdemucs').to(device)
+        self.stem2idx = {'drums': 0, 'bass': 1, 'other': 2, 'vocal': 3}
+        self.stem_idx = torch.LongTensor([self.stem2idx['vocal'], self.stem2idx['other']]).to(device)
+        self.chroma = ChromaExtractor(sample_rate=sample_rate, n_chroma=n_chroma, radix2_exp=radix2_exp,
+                                      device=device, **kwargs)
+        self.chroma_len = self._get_chroma_len()
+
+    def _downsampling_factor(self):
+        return self.chroma.winhop
+
+    def _get_chroma_len(self):
+        """Get length of chroma during training"""
+        dummy_wav = torch.zeros((1, self.sample_rate * self.duration), device=self.device)
+        dummy_chr = self.chroma(dummy_wav)
+        return dummy_chr.shape[1]
+
+    @torch.no_grad()
+    def _get_filtered_wav(self, wav):
+        from demucs.apply import apply_model
+        from demucs.audio import convert_audio
+        with self.autocast:
+            wav = convert_audio(wav, self.sample_rate, self.demucs.samplerate, self.demucs.audio_channels)
+            stems = apply_model(self.demucs, wav, device=self.device)
+            stems = stems[:, self.stem_idx]  # extract stem
+            stems = stems.sum(1)  # merge extracted stems
+            stems = stems.mean(1, keepdim=True)  # mono
+            stems = convert_audio(stems, self.demucs.samplerate, self.sample_rate, 1)
+            return stems
+
+    @torch.no_grad()
+    def _get_wav_embedding(self, wav):
+        # avoid 0-size tensors when we are working with null conds
+        if wav.shape[-1] == 1:
+            return self.chroma(wav)
+        stems = self._get_filtered_wav(wav)
+        chroma = self.chroma(stems)
+
+        if self.match_len_on_eval:
+            b, t, c = chroma.shape
+            if t > self.chroma_len:
+                chroma = chroma[:, :self.chroma_len]
+                logger.debug(f'chroma was truncated! ({t} -> {chroma.shape[1]})')
+            elif t < self.chroma_len:
+                # chroma = F.pad(chroma, (0, 0, 0, self.chroma_len - t))
+                n_repeat = int(math.ceil(self.chroma_len / t))
+                chroma = chroma.repeat(1, n_repeat, 1)
+                chroma = chroma[:, :self.chroma_len]
+                logger.debug(f'chroma was zero-padded! ({t} -> {chroma.shape[1]})')
+        return chroma
+
+
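A rough usage sketch, with all parameters illustrative. Note that constructing the conditioner loads the pretrained 'htdemucs' model through the demucs package (downloading it on first use) and also needs librosa; the private helper is called here only to show the shape of the melody features.

import torch
from audiocraft.modules.conditioners import ChromaStemConditioner

conditioner = ChromaStemConditioner(output_dim=512, sample_rate=32000, n_chroma=12,
                                    radix2_exp=12, duration=30)
wav = torch.randn(1, 1, 32000 * 5)  # 5 seconds of (random) audio
with torch.no_grad():
    chroma = conditioner._get_wav_embedding(wav)  # [B, frames, n_chroma]
print(chroma.shape)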

Ancestors

+ +

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Inherited members

+ +
+
+class ClassifierFreeGuidanceDropout +(p: float, seed: int = 1234) +
+
+

Applies Classifier Free Guidance dropout, meaning all attributes +are dropped with the same probability.

+

Args

+
+
p : float
+
Probability to apply condition dropout during training.
+
seed : int
+
Random seed.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class ClassifierFreeGuidanceDropout(DropoutModule):
+    """Applies Classifier Free Guidance dropout, meaning all attributes
+    are dropped with the same probability.
+
+    Args:
+        p (float): Probability to apply condition dropout during training.
+        seed (int): Random seed.
+    """
+    def __init__(self, p: float, seed: int = 1234):
+        super().__init__(seed=seed)
+        self.p = p
+
+    def forward(self, samples: tp.List[ConditioningAttributes]) -> tp.List[ConditioningAttributes]:
+        """
+        Args:
+            samples (tp.List[ConditioningAttributes]): List of conditions.
+        Returns:
+            tp.List[ConditioningAttributes]: List of conditions after all attributes were set to None.
+        """
+        if not self.training:
+            return samples
+
+        # decide on which attributes to drop in a batched fashion
+        drop = torch.rand(1, generator=self.rng).item() < self.p
+        if not drop:
+            return samples
+
+        # nullify conditions of all attributes
+        samples = deepcopy(samples)
+
+        for condition_type in ["wav", "text"]:
+            for sample in samples:
+                for condition in sample.attributes[condition_type]:
+                    dropout_condition(sample, condition_type, condition)
+
+        return samples
+
+    def __repr__(self):
+        return f"ClassifierFreeGuidanceDropout(p={self.p})"
+
+

Ancestors

+ +

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def forward(self, samples: List[ConditioningAttributes]) ‑> List[ConditioningAttributes] +
+
+

Args

+
+
samples : tp.List[ConditioningAttributes]
+
List of conditions.
+
+

Returns

+
+
tp.List[ConditioningAttributes]
+
List of conditions after all attributes were set to None.
+
+
+ +Expand source code + +
def forward(self, samples: tp.List[ConditioningAttributes]) -> tp.List[ConditioningAttributes]:
+    """
+    Args:
+        samples (tp.List[ConditioningAttributes]): List of conditions.
+    Returns:
+        tp.List[ConditioningAttributes]: List of conditions after all attributes were set to None.
+    """
+    if not self.training:
+        return samples
+
+    # decide on which attributes to drop in a batched fashion
+    drop = torch.rand(1, generator=self.rng).item() < self.p
+    if not drop:
+        return samples
+
+    # nullify conditions of all attributes
+    samples = deepcopy(samples)
+
+    for condition_type in ["wav", "text"]:
+        for sample in samples:
+            for condition in sample.attributes[condition_type]:
+                dropout_condition(sample, condition_type, condition)
+
+    return samples
+
+
+
+
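A short sketch (attribute values are made up): with p=1.0 every attribute is nullified, which is how an unconditional batch for classifier-free guidance can be produced.

from audiocraft.modules.conditioners import (
    ClassifierFreeGuidanceDropout, ConditioningAttributes)

cfg_dropout = ClassifierFreeGuidanceDropout(p=1.0)
cfg_dropout.train()  # the dropout only applies in training mode

batch = [ConditioningAttributes(text={"description": "A calm piano piece"})]
null_batch = cfg_dropout(batch)
print(null_batch[0].text)  # {'description': None}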
+
+class ConditionFuser +(fuse2cond: Dict[str, List[str]], cross_attention_pos_emb: bool = False, cross_attention_pos_emb_scale: float = 1.0) +
+
+

Condition fuser handles the logic to combine the different conditions +with the actual model input.

+

Args

+
+
fuse2cond : tp.Dict[str, tp.List[str]]
+
A dictionary that maps each fusing method to +the list of conditions fused with that method. For example: +{ +"prepend": ["description"], +"sum": ["genre", "bpm"], +"cross": ["description"], +}
+
cross_attention_pos_emb : bool, optional
+
Use positional embeddings in cross attention.
+
cross_attention_pos_emb_scale : int
+
Scale for positional embeddings in cross attention if used.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class ConditionFuser(StreamingModule):
+    """Condition fuser handles the logic to combine the different conditions
+    with the actual model input.
+
+    Args:
+        fuse2cond (tp.Dict[str, tp.List[str]]): A dictionary that maps each fusing method to
+            the list of conditions fused with that method. For example:
+            {
+                "prepend": ["description"],
+                "sum": ["genre", "bpm"],
+                "cross": ["description"],
+            }
+        cross_attention_pos_emb (bool, optional): Use positional embeddings in cross attention.
+        cross_attention_pos_emb_scale (int): Scale for positional embeddings in cross attention if used.
+    """
+    FUSING_METHODS = ["sum", "prepend", "cross", "input_interpolate"]
+
+    def __init__(self, fuse2cond: tp.Dict[str, tp.List[str]], cross_attention_pos_emb: bool = False,
+                 cross_attention_pos_emb_scale: float = 1.0):
+        super().__init__()
+        assert all(
+            [k in self.FUSING_METHODS for k in fuse2cond.keys()]
+        ), f"got invalid fuse method, allowed methods: {self.FUSING_MEHTODS}"
+        self.cross_attention_pos_emb = cross_attention_pos_emb
+        self.cross_attention_pos_emb_scale = cross_attention_pos_emb_scale
+        self.fuse2cond: tp.Dict[str, tp.List[str]] = fuse2cond
+        self.cond2fuse: tp.Dict[str, str] = {}
+        for fuse_method, conditions in fuse2cond.items():
+            for condition in conditions:
+                self.cond2fuse[condition] = fuse_method
+
+    def forward(
+        self,
+        input: Tensor,
+        conditions: tp.Dict[str, ConditionType]
+    ) -> tp.Tuple[Tensor, tp.Optional[Tensor]]:
+        """Fuse the conditions to the provided model input.
+
+        Args:
+            input (Tensor): Transformer input.
+            conditions (tp.Dict[str, ConditionType]): Dict of conditions.
+        Returns:
+            tp.Tuple[Tensor, Tensor]: The first tensor is the transformer input
+                after the conditions have been fused. The second output tensor is the tensor
+                used for cross-attention or None if no cross attention inputs exist.
+        """
+        B, T, _ = input.shape
+
+        if 'offsets' in self._streaming_state:
+            first_step = False
+            offsets = self._streaming_state['offsets']
+        else:
+            first_step = True
+            offsets = torch.zeros(input.shape[0], dtype=torch.long, device=input.device)
+
+        assert set(conditions.keys()).issubset(set(self.cond2fuse.keys())), \
+            f"given conditions contain unknown attributes for fuser, " \
+            f"expected {self.cond2fuse.keys()}, got {conditions.keys()}"
+        cross_attention_output = None
+        for cond_type, (cond, cond_mask) in conditions.items():
+            op = self.cond2fuse[cond_type]
+            if op == "sum":
+                input += cond
+            elif op == "input_interpolate":
+                cond = rearrange(cond, "b t d -> b d t")
+                cond = F.interpolate(cond, size=input.shape[1])
+                input += rearrange(cond, "b d t -> b t d")
+            elif op == "prepend":
+                if first_step:
+                    input = torch.cat([cond, input], dim=1)
+            elif op == "cross":
+                if cross_attention_output is not None:
+                    cross_attention_output = torch.cat([cross_attention_output, cond], dim=1)
+                else:
+                    cross_attention_output = cond
+            else:
+                raise ValueError(f"unknown op ({op})")
+
+        if self.cross_attention_pos_emb and cross_attention_output is not None:
+            positions = torch.arange(
+                cross_attention_output.shape[1],
+                device=cross_attention_output.device
+            ).view(1, -1, 1)
+            pos_emb = create_sin_embedding(positions, cross_attention_output.shape[-1])
+            cross_attention_output = cross_attention_output + self.cross_attention_pos_emb_scale * pos_emb
+
+        if self._is_streaming:
+            self._streaming_state['offsets'] = offsets + T
+
+        return input, cross_attention_output
+
+

Ancestors

+ +

Class variables

+
+
var FUSING_METHODS
+
+
+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def forward(self, input: torch.Tensor, conditions: Dict[str, Tuple[torch.Tensor, torch.Tensor]]) ‑> Tuple[torch.Tensor, Optional[torch.Tensor]] +
+
+

Fuse the conditions to the provided model input.

+

Args

+
+
input : Tensor
+
Transformer input.
+
conditions : tp.Dict[str, ConditionType]
+
Dict of conditions.
+
+

Returns

+
+
tp.Tuple[Tensor, Tensor]
+
The first tensor is the transformer input +after the conditions have been fused. The second output tensor is the tensor +used for cross-attention or None if no cross attention inputs exist.
+
+
+ +Expand source code + +
def forward(
+    self,
+    input: Tensor,
+    conditions: tp.Dict[str, ConditionType]
+) -> tp.Tuple[Tensor, tp.Optional[Tensor]]:
+    """Fuse the conditions to the provided model input.
+
+    Args:
+        input (Tensor): Transformer input.
+        conditions (tp.Dict[str, ConditionType]): Dict of conditions.
+    Returns:
+        tp.Tuple[Tensor, Tensor]: The first tensor is the transformer input
+            after the conditions have been fused. The second output tensor is the tensor
+            used for cross-attention or None if no cross attention inputs exist.
+    """
+    B, T, _ = input.shape
+
+    if 'offsets' in self._streaming_state:
+        first_step = False
+        offsets = self._streaming_state['offsets']
+    else:
+        first_step = True
+        offsets = torch.zeros(input.shape[0], dtype=torch.long, device=input.device)
+
+    assert set(conditions.keys()).issubset(set(self.cond2fuse.keys())), \
+        f"given conditions contain unknown attributes for fuser, " \
+        f"expected {self.cond2fuse.keys()}, got {conditions.keys()}"
+    cross_attention_output = None
+    for cond_type, (cond, cond_mask) in conditions.items():
+        op = self.cond2fuse[cond_type]
+        if op == "sum":
+            input += cond
+        elif op == "input_interpolate":
+            cond = rearrange(cond, "b t d -> b d t")
+            cond = F.interpolate(cond, size=input.shape[1])
+            input += rearrange(cond, "b d t -> b t d")
+        elif op == "prepend":
+            if first_step:
+                input = torch.cat([cond, input], dim=1)
+        elif op == "cross":
+            if cross_attention_output is not None:
+                cross_attention_output = torch.cat([cross_attention_output, cond], dim=1)
+            else:
+                cross_attention_output = cond
+        else:
+            raise ValueError(f"unknown op ({op})")
+
+    if self.cross_attention_pos_emb and cross_attention_output is not None:
+        positions = torch.arange(
+            cross_attention_output.shape[1],
+            device=cross_attention_output.device
+        ).view(1, -1, 1)
+        pos_emb = create_sin_embedding(positions, cross_attention_output.shape[-1])
+        cross_attention_output = cross_attention_output + self.cross_attention_pos_emb_scale * pos_emb
+
+    if self._is_streaming:
+        self._streaming_state['offsets'] = offsets + T
+
+    return input, cross_attention_output
+
+
+
+
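A sketch of the fusing logic with made-up conditions: "description" goes to cross-attention and "self_wav" is prepended to the transformer input (the mapping itself is illustrative).

import torch
from audiocraft.modules.conditioners import ConditionFuser

fuser = ConditionFuser(fuse2cond={"cross": ["description"], "prepend": ["self_wav"]})
print(fuser.cond2fuse)  # {'description': 'cross', 'self_wav': 'prepend'}

B, T, D = 2, 10, 64
x = torch.randn(B, T, D)
conditions = {
    "description": (torch.randn(B, 5, D), torch.ones(B, 5)),  # used for cross-attention
    "self_wav": (torch.randn(B, 3, D), torch.ones(B, 3)),     # prepended to the sequence
}
out, cross = fuser(x, conditions)
print(out.shape, cross.shape)  # torch.Size([2, 13, 64]) torch.Size([2, 5, 64])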

Inherited members

+ +
+
+class ConditioningAttributes +(text: Dict[str, Optional[str]] = <factory>, wav: Dict[str, WavCondition] = <factory>) +
+
+

ConditioningAttributes(text: Dict[str, Union[str, NoneType]] = <factory>, wav: Dict[str, audiocraft.modules.conditioners.WavCondition] = <factory>)

+
+ +Expand source code + +
@dataclass
+class ConditioningAttributes:
+    text: tp.Dict[str, tp.Optional[str]] = field(default_factory=dict)
+    wav: tp.Dict[str, WavCondition] = field(default_factory=dict)
+
+    def __getitem__(self, item):
+        return getattr(self, item)
+
+    @property
+    def text_attributes(self):
+        return self.text.keys()
+
+    @property
+    def wav_attributes(self):
+        return self.wav.keys()
+
+    @property
+    def attributes(self):
+        return {"text": self.text_attributes, "wav": self.wav_attributes}
+
+    def to_flat_dict(self):
+        return {
+            **{f"text.{k}": v for k, v in self.text.items()},
+            **{f"wav.{k}": v for k, v in self.wav.items()},
+        }
+
+    @classmethod
+    def from_flat_dict(cls, x):
+        out = cls()
+        for k, v in x.items():
+            kind, att = k.split(".")
+            out[kind][att] = v
+        return out
+
+

Class variables

+
+
var text : Dict[str, Optional[str]]
+
+
+
+
var wav : Dict[str, WavCondition]
+
+
+
+
+

Static methods

+
+
+def from_flat_dict(x) +
+
+
+
+ +Expand source code + +
@classmethod
+def from_flat_dict(cls, x):
+    out = cls()
+    for k, v in x.items():
+        kind, att = k.split(".")
+        out[kind][att] = v
+    return out
+
+
+
+

Instance variables

+
+
var attributes
+
+
+
+ +Expand source code + +
@property
+def attributes(self):
+    return {"text": self.text_attributes, "wav": self.wav_attributes}
+
+
+
var text_attributes
+
+
+
+ +Expand source code + +
@property
+def text_attributes(self):
+    return self.text.keys()
+
+
+
var wav_attributes
+
+
+
+ +Expand source code + +
@property
+def wav_attributes(self):
+    return self.wav.keys()
+
+
+
+

Methods

+
+
+def to_flat_dict(self) +
+
+
+
+ +Expand source code + +
def to_flat_dict(self):
+    return {
+        **{f"text.{k}": v for k, v in self.text.items()},
+        **{f"wav.{k}": v for k, v in self.wav.items()},
+    }
+
+
+
+
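A small sketch of the flat-dict round trip (the attribute values are arbitrary):

from audiocraft.modules.conditioners import ConditioningAttributes

attrs = ConditioningAttributes(text={"genre": "Jazz", "description": "A slow jazz ballad"})
flat = attrs.to_flat_dict()
print(flat)  # {'text.genre': 'Jazz', 'text.description': 'A slow jazz ballad'}
restored = ConditioningAttributes.from_flat_dict(flat)
print(restored.text == attrs.text)  # True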
+
+class ConditioningProvider +(conditioners: Dict[str, BaseConditioner], merge_text_conditions_p: float = 0, drop_desc_p: float = 0, device: Union[torch.device, str] = 'cpu') +
+
+

Main class to provide conditions given all the supported conditioners.

+

Args

+
+
conditioners : dict
+
Dictionary of conditioners.
+
merge_text_conditions_p : float, optional
+
Probability to merge all text sources +into a single text condition. Defaults to 0.
+
drop_desc_p : float, optional
+
Probability to drop the original description +when merging all text sources into a single text condition. Defaults to 0.
+
device : tp.Union[torch.device, str], optional
+
Device for conditioners and output condition types.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class ConditioningProvider(nn.Module):
+    """Main class to provide conditions given all the supported conditioners.
+
+    Args:
+        conditioners (dict): Dictionary of conditioners.
+        merge_text_conditions_p (float, optional): Probability to merge all text sources
+            into a single text condition. Defaults to 0.
+        drop_desc_p (float, optional): Probability to drop the original description
+            when merging all text sources into a single text condition. Defaults to 0.
+        device (tp.Union[torch.device, str], optional): Device for conditioners and output condition types.
+    """
+    def __init__(
+        self,
+        conditioners: tp.Dict[str, BaseConditioner],
+        merge_text_conditions_p: float = 0,
+        drop_desc_p: float = 0,
+        device: tp.Union[torch.device, str] = "cpu",
+    ):
+        super().__init__()
+        self.device = device
+        self.merge_text_conditions_p = merge_text_conditions_p
+        self.drop_desc_p = drop_desc_p
+        self.conditioners = nn.ModuleDict(conditioners)
+
+    @property
+    def text_conditions(self):
+        return [k for k, v in self.conditioners.items() if isinstance(v, TextConditioner)]
+
+    @property
+    def wav_conditions(self):
+        return [k for k, v in self.conditioners.items() if isinstance(v, WaveformConditioner)]
+
+    @property
+    def has_wav_condition(self):
+        return len(self.wav_conditions) > 0
+
+    def tokenize(self, inputs: tp.List[ConditioningAttributes]) -> tp.Dict[str, tp.Any]:
+        """Match attributes/wavs with existing conditioners in self, and compute tokenize them accordingly.
+        This should be called before starting any real GPU work to avoid synchronization points.
+        This will return a dict matching conditioner names to their arbitrary tokenized representations.
+
+        Args:
+            inputs (list[ConditioningAttributes]): List of ConditioningAttributes objects containing
+                text and wav conditions.
+        """
+        assert all([type(x) == ConditioningAttributes for x in inputs]), \
+            "got unexpected types input for conditioner! should be tp.List[ConditioningAttributes]" \
+            f" but types were {set([type(x) for x in inputs])}"
+
+        output = {}
+        text = self._collate_text(inputs)
+        wavs = self._collate_wavs(inputs)
+
+        assert set(text.keys() | wavs.keys()).issubset(set(self.conditioners.keys())), \
+            f"got an unexpected attribute! Expected {self.conditioners.keys()}, got {text.keys(), wavs.keys()}"
+
+        for attribute, batch in chain(text.items(), wavs.items()):
+            output[attribute] = self.conditioners[attribute].tokenize(batch)
+        return output
+
+    def forward(self, tokenized: tp.Dict[str, tp.Any]) -> tp.Dict[str, ConditionType]:
+        """Compute pairs of `(embedding, mask)` using the configured conditioners
+        and the tokenized representations. The output is for example:
+
+            {
+                "genre": (torch.Tensor([B, 1, D_genre]), torch.Tensor([B, 1])),
+                "description": (torch.Tensor([B, T_desc, D_desc]), torch.Tensor([B, T_desc])),
+                ...
+            }
+
+        Args:
+            tokenized (dict): Dict of tokenized representations as returned by `tokenize()`.
+        """
+        output = {}
+        for attribute, inputs in tokenized.items():
+            condition, mask = self.conditioners[attribute](inputs)
+            output[attribute] = (condition, mask)
+        return output
+
+    def _collate_text(self, samples: tp.List[ConditioningAttributes]) -> tp.Dict[str, tp.List[tp.Optional[str]]]:
+        """Given a list of ConditioningAttributes objects, compile a dictionary where the keys
+        are the attributes and the values are the aggregated input per attribute.
+        For example:
+        Input:
+        [
+            ConditioningAttributes(text={"genre": "Rock", "description": "A rock song with a guitar solo"}, wav=...),
+            ConditioningAttributes(text={"genre": "Hip-hop", "description": "A hip-hop verse"}, wav=...),
+        ]
+        Output:
+        {
+            "genre": ["Rock", "Hip-hop"],
+            "description": ["A rock song with a guitar solo", "A hip-hop verse"]
+        }
+        """
+        batch_per_attribute: tp.Dict[str, tp.List[tp.Optional[str]]] = defaultdict(list)
+
+        def _merge_conds(cond, merge_text_conditions_p=0, drop_desc_p=0):
+            def is_valid(k, v):
+                k_valid = k in ['key', 'bpm', 'genre', 'moods', 'instrument']
+                v_valid = v is not None and isinstance(v, (int, float, str, list))
+                return k_valid and v_valid
+
+            def process_value(v):
+                if isinstance(v, (int, float, str)):
+                    return v
+                if isinstance(v, list):
+                    return ", ".join(v)
+                else:
+                    raise RuntimeError(f"unknown type for text value! ({type(v), v})")
+
+            desc = cond.text['description']
+            meta_data = ""
+            if random.uniform(0, 1) < merge_text_conditions_p:
+                meta_pairs = [f'{k}: {process_value(v)}' for k, v in cond.text.items() if is_valid(k, v)]
+                random.shuffle(meta_pairs)
+                meta_data = ". ".join(meta_pairs)
+                desc = desc if not random.uniform(0, 1) < drop_desc_p else None
+
+            if desc is None:
+                desc = meta_data if len(meta_data) > 1 else None
+            else:
+                desc = desc.rstrip('.') + ". " + meta_data
+            cond.text['description'] = desc.strip() if desc else None
+
+        if self.training and self.merge_text_conditions_p:
+            for sample in samples:
+                _merge_conds(sample, self.merge_text_conditions_p, self.drop_desc_p)
+
+        texts = [x.text for x in samples]
+        for text in texts:
+            for condition in self.text_conditions:
+                batch_per_attribute[condition].append(text[condition])
+
+        return batch_per_attribute
+
+    def _collate_wavs(self, samples: tp.List[ConditioningAttributes]):
+        """Generate a dict where the keys are attributes by which we fetch similar wavs,
+        and the values are Tensors of wavs according to said attributes.
+
+        *Note*: by the time the samples reach this function, each sample should have some waveform
+        inside the "wav" attribute. It should be either:
+        1. A real waveform
+        2. A null waveform due to the sample having no similar waveforms (nullified by the dataset)
+        3. A null waveform due to it being dropped in a dropout module (nullified by dropout)
+
+        Args:
+            samples (tp.List[ConditioningAttributes]): List of ConditioningAttributes samples.
+        Returns:
+            dict: A dictionary mapping an attribute name to wavs.
+        """
+        wavs = defaultdict(list)
+        lens = defaultdict(list)
+        paths = defaultdict(list)
+        out = {}
+
+        for sample in samples:
+            for attribute in self.wav_conditions:
+                wav, length, path = sample.wav[attribute]
+                wavs[attribute].append(wav.flatten())
+                lens[attribute].append(length)
+                paths[attribute].append(path)
+
+        # stack all wavs to a single tensor
+        for attribute in self.wav_conditions:
+            stacked_wav, _ = collate(wavs[attribute], dim=0)
+            out[attribute] = WavCondition(stacked_wav.unsqueeze(1),
+                                          torch.cat(lens[attribute]), paths[attribute])  # type: ignore
+
+        return out
+
+

Ancestors

+
    +
  • torch.nn.modules.module.Module
  • +
+

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Instance variables

+
+
var has_wav_condition
+
+
+
+ +Expand source code + +
@property
+def has_wav_condition(self):
+    return len(self.wav_conditions) > 0
+
+
+
var text_conditions
+
+
+
+ +Expand source code + +
@property
+def text_conditions(self):
+    return [k for k, v in self.conditioners.items() if isinstance(v, TextConditioner)]
+
+
+
var wav_conditions
+
+
+
+ +Expand source code + +
@property
+def wav_conditions(self):
+    return [k for k, v in self.conditioners.items() if isinstance(v, WaveformConditioner)]
+
+
+
+

Methods

+
+
+def forward(self, tokenized: Dict[str, Any]) ‑> Dict[str, Tuple[torch.Tensor, torch.Tensor]] +
+
+

Compute pairs of (embedding, mask) using the configured conditioners +and the tokenized representations. The output is for example:

+
{
+    "genre": (torch.Tensor([B, 1, D_genre]), torch.Tensor([B, 1])),
+    "description": (torch.Tensor([B, T_desc, D_desc]), torch.Tensor([B, T_desc])),
+    ...
+}
+
+

Args

+
+
tokenized : dict
+
Dict of tokenized representations as returned by tokenize().
+
+
+ +Expand source code + +
def forward(self, tokenized: tp.Dict[str, tp.Any]) -> tp.Dict[str, ConditionType]:
+    """Compute pairs of `(embedding, mask)` using the configured conditioners
+    and the tokenized representations. The output is for example:
+
+        {
+            "genre": (torch.Tensor([B, 1, D_genre]), torch.Tensor([B, 1])),
+            "description": (torch.Tensor([B, T_desc, D_desc]), torch.Tensor([B, T_desc])),
+            ...
+        }
+
+    Args:
+        tokenized (dict): Dict of tokenized representations as returned by `tokenize()`.
+    """
+    output = {}
+    for attribute, inputs in tokenized.items():
+        condition, mask = self.conditioners[attribute](inputs)
+        output[attribute] = (condition, mask)
+    return output
+
+
+
+def tokenize(self, inputs: List[ConditioningAttributes]) ‑> Dict[str, Any] +
+
+

Match attributes/wavs with existing conditioners in self, and tokenize them accordingly. +This should be called before starting any real GPU work to avoid synchronization points. +This will return a dict matching conditioner names to their arbitrary tokenized representations.

+

Args

+
+
inputs : list[ConditioningAttributes]
+
List of ConditioningAttributes objects containing +text and wav conditions.
+
+
+ +Expand source code + +
def tokenize(self, inputs: tp.List[ConditioningAttributes]) -> tp.Dict[str, tp.Any]:
+    """Match attributes/wavs with existing conditioners in self, and tokenize them accordingly.
+    This should be called before starting any real GPU work to avoid synchronization points.
+    This will return a dict matching conditioner names to their arbitrary tokenized representations.
+
+    Args:
+        inputs (list[ConditioningAttributes]): List of ConditioningAttributes objects containing
+            text and wav conditions.
+    """
+    assert all([type(x) == ConditioningAttributes for x in inputs]), \
+        "got unexpected types input for conditioner! should be tp.List[ConditioningAttributes]" \
+        f" but types were {set([type(x) for x in inputs])}"
+
+    output = {}
+    text = self._collate_text(inputs)
+    wavs = self._collate_wavs(inputs)
+
+    assert set(text.keys() | wavs.keys()).issubset(set(self.conditioners.keys())), \
+        f"got an unexpected attribute! Expected {self.conditioners.keys()}, got {text.keys(), wavs.keys()}"
+
+    for attribute, batch in chain(text.items(), wavs.items()):
+        output[attribute] = self.conditioners[attribute].tokenize(batch)
+    return output
+
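The intended calling pattern is to tokenize first (cheap, CPU-side) and only then run the conditioners. A minimal, hedged sketch, assuming `provider` is a ConditioningProvider configured with a single text conditioner named "description" and that ConditioningAttributes can be built directly from a text dict:
+attributes = [
+    ConditioningAttributes(text={"description": "a calm piano piece"}),
+    ConditioningAttributes(text={"description": None}),  # missing condition
+]
+tokenized = provider.tokenize(attributes)   # no GPU synchronization points yet
+conditions = provider(tokenized)            # dict: name -> (embedding, mask)
+embedding, mask = conditions["description"]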
+
+
+
+
+class DropoutModule +(seed: int = 1234) +
+
+

Base class for all dropout modules.

+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class DropoutModule(nn.Module):
+    """Base class for all dropout modules."""
+    def __init__(self, seed: int = 1234):
+        super().__init__()
+        self.rng = torch.Generator()
+        self.rng.manual_seed(seed)
+
+
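Subclasses are expected to draw from the seeded generator so that dropout decisions are reproducible. The sketch below is illustrative only (RandomTextDropout is not part of the library) and assumes samples are ConditioningAttributes objects:
+import torch
+
+class RandomTextDropout(DropoutModule):
+    """Toy subclass: drops every text condition of a sample with probability p."""
+    def __init__(self, p: float, seed: int = 1234):
+        super().__init__(seed=seed)
+        self.p = p
+
+    def forward(self, samples):
+        if not self.training:
+            return samples
+        for sample in samples:
+            # reproducible draw from the module's seeded generator
+            if torch.rand(1, generator=self.rng).item() < self.p:
+                sample.text = {k: None for k in sample.text}
+        return samples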

Ancestors

+
    +
  • torch.nn.modules.module.Module
  • +
+

Subclasses

+ +

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def forward(self, *input: Any) ‑> None +
+
+

Defines the computation performed at every call.

+

Should be overridden by all subclasses.

+
+

Note

+

Although the recipe for forward pass needs to be defined within +this function, one should call the Module instance afterwards +instead of this since the former takes care of running the +registered hooks while the latter silently ignores them.

+
+
+ +Expand source code + +
def _forward_unimplemented(self, *input: Any) -> None:
+    r"""Defines the computation performed at every call.
+
+    Should be overridden by all subclasses.
+
+    .. note::
+        Although the recipe for forward pass needs to be defined within
+        this function, one should call the :class:`Module` instance afterwards
+        instead of this since the former takes care of running the
+        registered hooks while the latter silently ignores them.
+    """
+    raise NotImplementedError(f"Module [{type(self).__name__}] is missing the required \"forward\" function")
+
+
+
+
+
+class LUTConditioner +(n_bins: int, dim: int, output_dim: int, tokenizer: str, pad_idx: int = 0) +
+
+

Lookup table TextConditioner.

+

Args

+
+
n_bins : int
+
Number of bins.
+
dim : int
+
Hidden dim of the model (text-encoder/LUT).
+
output_dim : int
+
Output dim of the conditioner.
+
tokenizer : str
+
Name of the tokenizer.
+
pad_idx : int, optional
+
Index for padding token. Defaults to 0.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class LUTConditioner(TextConditioner):
+    """Lookup table TextConditioner.
+
+    Args:
+        n_bins (int): Number of bins.
+        dim (int): Hidden dim of the model (text-encoder/LUT).
+        output_dim (int): Output dim of the conditioner.
+        tokenizer (str): Name of the tokenizer.
+        pad_idx (int, optional): Index for padding token. Defaults to 0.
+    """
+    def __init__(self, n_bins: int, dim: int, output_dim: int, tokenizer: str, pad_idx: int = 0):
+        super().__init__(dim, output_dim)
+        self.embed = nn.Embedding(n_bins, dim)
+        self.tokenizer: Tokenizer
+        if tokenizer == "whitespace":
+            self.tokenizer = WhiteSpaceTokenizer(n_bins, pad_idx=pad_idx)
+        elif tokenizer == "noop":
+            self.tokenizer = NoopTokenizer(n_bins, pad_idx=pad_idx)
+        else:
+            raise ValueError(f"unrecognized tokenizer `{tokenizer}`.")
+
+    def tokenize(self, x: tp.List[tp.Optional[str]]) -> tp.Tuple[torch.Tensor, torch.Tensor]:
+        device = self.embed.weight.device
+        tokens, mask = self.tokenizer(x)
+        tokens, mask = tokens.to(device), mask.to(device)
+        return tokens, mask
+
+    def forward(self, inputs: tp.Tuple[torch.Tensor, torch.Tensor]) -> ConditionType:
+        tokens, mask = inputs
+        embeds = self.embed(tokens)
+        embeds = self.output_proj(embeds)
+        embeds = (embeds * mask.unsqueeze(-1))
+        return embeds, mask
+
+
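A short usage sketch with illustrative dimensions, using the "noop" tokenizer so each full string hashes to a single bin:
+conditioner = LUTConditioner(n_bins=512, dim=64, output_dim=128, tokenizer="noop")
+tokens, mask = conditioner.tokenize(["rock", None, "jazz"])   # both of shape [3, 1]
+embeds, mask = conditioner((tokens, mask))                    # embeds: [3, 1, 128]
+# the row for the missing (None) entry is zeroed out by the mask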

Ancestors

+ +

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Inherited members

+ +
+
+class NoopTokenizer +(n_bins: int, pad_idx: int = 0) +
+
+

This tokenizer should be used for global conditioners such as: artist, genre, key, etc. +The difference between this and WhiteSpaceTokenizer is that NoopTokenizer does not split +strings, so "Jeff Buckley" will get its own index, whereas WhiteSpaceTokenizer will +split it into ["Jeff", "Buckley"] and return an index per word.

+

For example:
+["Queen", "ABBA", "Jeff Buckley"] => [43, 55, 101]
+["Metal", "Rock", "Classical"] => [0, 223, 51]

+
+ +Expand source code + +
class NoopTokenizer(Tokenizer):
+    """This tokenizer should be used for global conditioners such as: artist, genre, key, etc.
+    The difference between this and WhiteSpaceTokenizer is that NoopTokenizer does not split
+    strings, so "Jeff Buckley" will get its own index, whereas WhiteSpaceTokenizer will
+    split it into ["Jeff", "Buckley"] and return an index per word.
+
+    For example:
+    ["Queen", "ABBA", "Jeff Buckley"] => [43, 55, 101]
+    ["Metal", "Rock", "Classical"] => [0, 223, 51]
+    """
+    def __init__(self, n_bins: int, pad_idx: int = 0):
+        self.n_bins = n_bins
+        self.pad_idx = pad_idx
+
+    def __call__(self, texts: tp.List[tp.Optional[str]]) -> tp.Tuple[Tensor, Tensor]:
+        output, lengths = [], []
+        for text in texts:
+            # if current sample doesn't have a certain attribute, replace with pad token
+            if text is None:
+                output.append(self.pad_idx)
+                lengths.append(0)
+            else:
+                output.append(hash_trick(text, self.n_bins))
+                lengths.append(1)
+
+        tokens = torch.LongTensor(output).unsqueeze(1)
+        mask = length_to_mask(torch.IntTensor(lengths)).int()
+        return tokens, mask
+
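A small illustration of the behaviour described above (the bucket indices depend on the hash and carry no meaning by themselves):
+tok = NoopTokenizer(n_bins=1000)
+tokens, mask = tok(["Jeff Buckley", None, "Queen"])
+# tokens: LongTensor of shape [3, 1], one hashed bucket per full string
+# ("Jeff Buckley" is not split into words); the None entry gets pad_idx and mask 0.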
+

Ancestors

+ +
+
+class SegmentWithAttributes +(meta: AudioMeta, seek_time: float, n_frames: int, total_frames: int, sample_rate: int) +
+
+

Base class for all dataclasses that are used for conditioning. +All child classes should implement to_condition_attributes that converts +the existing attributes to a dataclass of type ConditioningAttributes.

+
+ +Expand source code + +
class SegmentWithAttributes(SegmentInfo):
+    """Base class for all dataclasses that are used for conditioning.
+    All child classes should implement `to_condition_attributes` that converts
+    the existing attributes to a dataclass of type ConditioningAttributes.
+    """
+    def to_condition_attributes(self) -> ConditioningAttributes:
+        raise NotImplementedError()
+
+

Ancestors

+ +

Class variables

+
+
var metaAudioMeta
+
+
+
+
var n_frames : int
+
+
+
+
var sample_rate : int
+
+
+
+
var seek_time : float
+
+
+
+
var total_frames : int
+
+
+
+
+

Methods

+
+
+def to_condition_attributes(self) ‑> ConditioningAttributes +
+
+
+
+ +Expand source code + +
def to_condition_attributes(self) -> ConditioningAttributes:
+    raise NotImplementedError()
+
+
+
+
+
+class T5Conditioner +(name: str, output_dim: int, finetune: bool, device: str, autocast_dtype: Optional[str] = 'float32', word_dropout: float = 0.0, normalize_text: bool = False) +
+
+

T5-based TextConditioner.

+

Args

+
+
name : str
+
Name of the T5 model.
+
output_dim : int
+
Output dim of the conditioner.
+
finetune : bool
+
Whether to fine-tune T5 at train time.
+
device : str
+
Device for T5 Conditioner.
+
autocast_dtype : tp.Optional[str], optional
+
Autocast dtype.
+
word_dropout : float, optional
+
Word dropout probability.
+
normalize_text : bool, optional
+
Whether to apply text normalization.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class T5Conditioner(TextConditioner):
+    """T5-based TextConditioner.
+
+    Args:
+        name (str): Name of the T5 model.
+        output_dim (int): Output dim of the conditioner.
+        finetune (bool): Whether to fine-tune T5 at train time.
+        device (str): Device for T5 Conditioner.
+        autocast_dtype (tp.Optional[str], optional): Autocast dtype.
+        word_dropout (float, optional): Word dropout probability.
+        normalize_text (bool, optional): Whether to apply text normalization.
+    """
+    MODELS = ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b",
+              "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large",
+              "google/flan-t5-xl", "google/flan-t5-xxl"]
+    MODELS_DIMS = {
+        "t5-small": 512,
+        "t5-base": 768,
+        "t5-large": 1024,
+        "t5-3b": 1024,
+        "t5-11b": 1024,
+        "google/flan-t5-small": 512,
+        "google/flan-t5-base": 768,
+        "google/flan-t5-large": 1024,
+        "google/flan-t5-3b": 1024,
+        "google/flan-t5-11b": 1024,
+    }
+
+    def __init__(self, name: str, output_dim: int, finetune: bool, device: str,
+                 autocast_dtype: tp.Optional[str] = 'float32', word_dropout: float = 0.,
+                 normalize_text: bool = False):
+        assert name in self.MODELS, f"unrecognized t5 model name (should be in {self.MODELS})"
+        super().__init__(self.MODELS_DIMS[name], output_dim)
+        self.device = device
+        self.name = name
+        self.finetune = finetune
+        self.word_dropout = word_dropout
+
+        if autocast_dtype is None or self.device == 'cpu':
+            self.autocast = TorchAutocast(enabled=False)
+            if self.device != 'cpu':
+                logger.warning("T5 has no autocast, this might lead to NaN")
+        else:
+            dtype = getattr(torch, autocast_dtype)
+            assert isinstance(dtype, torch.dtype)
+            logger.info(f"T5 will be evaluated with autocast as {autocast_dtype}")
+            self.autocast = TorchAutocast(enabled=True, device_type=self.device, dtype=dtype)
+        # Let's disable logging temporarily because T5 will vomit some errors otherwise.
+        # thanks https://gist.github.com/simon-weber/7853144
+        previous_level = logging.root.manager.disable
+        logging.disable(logging.ERROR)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            try:
+                self.t5_tokenizer = T5Tokenizer.from_pretrained(name)
+                t5 = T5EncoderModel.from_pretrained(name).train(mode=finetune)
+            finally:
+                logging.disable(previous_level)
+        if finetune:
+            self.t5 = t5
+        else:
+            # this makes sure that the T5 model is not part
+            # of the saved checkpoint
+            self.__dict__["t5"] = t5.to(device)
+
+        self.normalize_text = normalize_text
+        if normalize_text:
+            self.text_normalizer = WhiteSpaceTokenizer(1, lemma=True, stopwords=True)
+
+    def tokenize(self, x: tp.List[tp.Optional[str]]) -> tp.Dict[str, torch.Tensor]:
+        # if current sample doesn't have a certain attribute, replace with empty string
+        entries: tp.List[str] = [xi if xi is not None else "" for xi in x]
+        if self.normalize_text:
+            _, _, entries = self.text_normalizer(entries, return_text=True)
+        if self.word_dropout > 0. and self.training:
+            new_entries = []
+            for entry in entries:
+                words = [word for word in entry.split(" ") if random.random() >= self.word_dropout]
+                new_entries.append(" ".join(words))
+            entries = new_entries
+
+        empty_idx = torch.LongTensor([i for i, xi in enumerate(entries) if xi == ""])
+
+        inputs = self.t5_tokenizer(entries, return_tensors="pt", padding=True).to(self.device)
+        mask = inputs["attention_mask"]
+        mask[empty_idx, :] = 0  # zero out rows where the input is non-existent
+        return inputs
+
+    def forward(self, inputs: tp.Dict[str, torch.Tensor]) -> ConditionType:
+        mask = inputs["attention_mask"]
+        with torch.set_grad_enabled(self.finetune), self.autocast:
+            embeds = self.t5(**inputs).last_hidden_state
+        embeds = self.output_proj(embeds.to(self.output_proj.weight))
+        embeds = (embeds * mask.unsqueeze(-1))
+        return embeds, mask
+
+
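A hedged CPU-only sketch (output_dim is illustrative; the pretrained "t5-small" weights are downloaded on first use):
+cond = T5Conditioner(name="t5-small", output_dim=256, finetune=False,
+                     device="cpu", autocast_dtype=None)
+inputs = cond.tokenize(["an energetic drum and bass track", ""])
+embeds, mask = cond(inputs)
+# embeds: [2, T, 256]; the row of the empty description is fully masked out.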

Ancestors

+ +

Class variables

+
+
var MODELS
+
+
+
+
var MODELS_DIMS
+
+
+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Inherited members

+ +
+
+class TextConditioner +(dim, output_dim) +
+
+

Base model for all conditioner modules. We allow the output dim to be different +than the hidden dim for two reasons: 1) keep our LUTs small when the vocab is large; +2) make all condition dims consistent.

+

Args

+
+
dim : int
+
Hidden dim of the model (text-encoder/LUT).
+
output_dim : int
+
Output dim of the conditioner.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class TextConditioner(BaseConditioner):
+    ...
+
+

Ancestors

+ +

Subclasses

+ +

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Inherited members

+ +
+
+class Tokenizer +
+
+

Base class for all tokenizers +(in case we want to introduce more advanced tokenizers in the future).

+
+ +Expand source code + +
class Tokenizer:
+    """Base class for all tokenizers
+    (in case we want to introduce more advanced tokenizers in the future).
+    """
+    def __call__(self, texts: tp.List[tp.Optional[str]]) -> tp.Tuple[Tensor, Tensor]:
+        raise NotImplementedError()
+
+

Subclasses

+ +
+
+class WavCondition +(wav: torch.Tensor, length: torch.Tensor, path: List[Optional[str]] = []) +
+
+

WavCondition(wav, length, path)

+
+ +Expand source code + +
class WavCondition(tp.NamedTuple):
+    wav: Tensor
+    length: Tensor
+    path: tp.List[tp.Optional[str]] = []
+
+

Ancestors

+
    +
  • builtins.tuple
  • +
+

Instance variables

+
+
var length : torch.Tensor
+
+

Alias for field number 1

+
+
var path : List[Optional[str]]
+
+

Alias for field number 2

+
+
var wav : torch.Tensor
+
+

Alias for field number 0

+
+
+
+
+class WaveformConditioner +(dim: int, output_dim: int, device: Union[torch.device, str]) +
+
+

Base class for all conditioners that take a waveform as input. +Classes that inherit must implement _get_wav_embedding that outputs +a continuous tensor, and _downsampling_factor that returns the down-sampling +factor of the embedding model.

+

Args

+
+
dim : int
+
The internal representation dimension.
+
output_dim : int
+
Output dimension.
+
device : tp.Union[torch.device, str]
+
Device.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class WaveformConditioner(BaseConditioner):
+    """Base class for all conditioners that take a waveform as input.
+    Classes that inherit must implement `_get_wav_embedding` that outputs
+    a continuous tensor, and `_downsampling_factor` that returns the down-sampling
+    factor of the embedding model.
+
+    Args:
+        dim (int): The internal representation dimension.
+        output_dim (int): Output dimension.
+        device (tp.Union[torch.device, str]): Device.
+    """
+    def __init__(self, dim: int, output_dim: int, device: tp.Union[torch.device, str]):
+        super().__init__(dim, output_dim)
+        self.device = device
+
+    def tokenize(self, wav_length: WavCondition) -> WavCondition:
+        wav, length, path = wav_length
+        assert length is not None
+        return WavCondition(wav.to(self.device), length.to(self.device), path)
+
+    def _get_wav_embedding(self, wav: Tensor) -> Tensor:
+        """Gets as input a wav and returns a dense vector of conditions."""
+        raise NotImplementedError()
+
+    def _downsampling_factor(self):
+        """Returns the downsampling factor of the embedding model."""
+        raise NotImplementedError()
+
+    def forward(self, inputs: WavCondition) -> ConditionType:
+        """
+        Args:
+            inputs (WavCondition): Tuple of (waveform, lengths, path).
+        Returns:
+            ConditionType: Dense vector representing the conditioning along with its mask.
+        """
+        wav, lengths, path = inputs
+        with torch.no_grad():
+            embeds = self._get_wav_embedding(wav)
+        embeds = embeds.to(self.output_proj.weight)
+        embeds = self.output_proj(embeds)
+
+        if lengths is not None:
+            lengths = lengths / self._downsampling_factor()
+            mask = length_to_mask(lengths, max_len=embeds.shape[1]).int()  # type: ignore
+        else:
+            mask = torch.ones_like(embeds)
+        embeds = (embeds * mask.unsqueeze(2).to(self.device))
+
+        return embeds, mask
+
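Since the two hooks are abstract, any concrete conditioner must provide them. The toy subclass below (not part of the library) mean-pools the waveform into a single embedding frame, just to show the expected shapes:
+import torch
+
+class MeanPoolWavConditioner(WaveformConditioner):
+    """Toy example: one embedding frame per waveform, obtained by mean pooling."""
+    def __init__(self, output_dim: int, sample_rate: int, device: str = "cpu"):
+        super().__init__(dim=1, output_dim=output_dim, device=device)
+        self.sample_rate = sample_rate
+
+    def _get_wav_embedding(self, wav: torch.Tensor) -> torch.Tensor:
+        # [B, C, T] -> [B, 1, 1]: a single scalar frame per waveform
+        return wav.mean(dim=(1, 2))[:, None, None]
+
+    def _downsampling_factor(self):
+        # one frame stands for the whole segment in this toy setup
+        return self.sample_rate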
+

Ancestors

+ +

Subclasses

+ +

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def forward(self, inputs: WavCondition) ‑> Tuple[torch.Tensor, torch.Tensor] +
+
+

Args

+
+
inputs : WavCondition
+
Tuple of (waveform, lengths, path).
+
+

Returns

+
+
ConditionType
+
Dense vector representing the conditioning along with its mask.
+
+
+ +Expand source code + +
def forward(self, inputs: WavCondition) -> ConditionType:
+    """
+    Args:
+        inputs (WavCondition): Tuple of (waveform, lengths, path).
+    Returns:
+        ConditionType: Dense vector representing the conditioning along with its mask.
+    """
+    wav, lengths, path = inputs
+    with torch.no_grad():
+        embeds = self._get_wav_embedding(wav)
+    embeds = embeds.to(self.output_proj.weight)
+    embeds = self.output_proj(embeds)
+
+    if lengths is not None:
+        lengths = lengths / self._downsampling_factor()
+        mask = length_to_mask(lengths, max_len=embeds.shape[1]).int()  # type: ignore
+    else:
+        mask = torch.ones_like(embeds)
+    embeds = (embeds * mask.unsqueeze(2).to(self.device))
+
+    return embeds, mask
+
+
+
+

Inherited members

+ +
+
+class WhiteSpaceTokenizer +(n_bins: int, pad_idx: int = 0, language: str = 'en_core_web_sm', lemma: bool = True, stopwords: bool = True) +
+
+

This tokenizer should be used for natural language descriptions.
+For example:
+["he didn't, know he's going home.", 'shorter sentence'] =>
+[[78, 62, 31,  4, 78, 25, 19, 34],
+[59, 77,  0,  0,  0,  0,  0,  0]]

+
+ +Expand source code + +
class WhiteSpaceTokenizer(Tokenizer):
+    """This tokenizer should be used for natural language descriptions.
+    For example:
+    ["he didn't, know he's going home.", 'shorter sentence'] =>
+    [[78, 62, 31,  4, 78, 25, 19, 34],
+    [59, 77,  0,  0,  0,  0,  0,  0]]
+    """
+    PUNCTUATIONS = "?:!.,;"
+
+    def __init__(self, n_bins: int, pad_idx: int = 0, language: str = "en_core_web_sm",
+                 lemma: bool = True, stopwords: bool = True) -> None:
+        self.n_bins = n_bins
+        self.pad_idx = pad_idx
+        self.lemma = lemma
+        self.stopwords = stopwords
+        try:
+            self.nlp = spacy.load(language)
+        except IOError:
+            spacy.cli.download(language)  # type: ignore
+            self.nlp = spacy.load(language)
+
+    @tp.no_type_check
+    def __call__(
+        self,
+        texts: tp.List[tp.Optional[str]],
+        return_text: bool = False
+    ) -> tp.Tuple[Tensor, Tensor]:
+        """Take a list of strings and convert them to a tensor of indices.
+
+        Args:
+            texts (tp.List[str]): List of strings.
+            return_text (bool, optional): Whether to return text as additional tuple item. Defaults to False.
+        Returns:
+            tp.Tuple[Tensor, Tensor]:
+                - Indices of words in the LUT.
+                - And a mask indicating where the padding tokens are
+        """
+        output, lengths = [], []
+        texts = deepcopy(texts)
+        for i, text in enumerate(texts):
+            # if current sample doesn't have a certain attribute, replace with pad token
+            if text is None:
+                output.append(Tensor([self.pad_idx]))
+                lengths.append(0)
+                continue
+
+            # convert numbers to words
+            text = re.sub(r"(\d+)", lambda x: num2words(int(x.group(0))), text)  # type: ignore
+            # normalize text
+            text = self.nlp(text)  # type: ignore
+            # remove stopwords
+            if self.stopwords:
+                text = [w for w in text if not w.is_stop]  # type: ignore
+            # remove punctuations
+            text = [w for w in text if w.text not in self.PUNCTUATIONS]  # type: ignore
+            # lemmatize if needed
+            text = [getattr(t, "lemma_" if self.lemma else "text") for t in text]  # type: ignore
+
+            texts[i] = " ".join(text)
+            lengths.append(len(text))
+            # convert to tensor
+            tokens = Tensor([hash_trick(w, self.n_bins) for w in text])
+            output.append(tokens)
+
+        mask = length_to_mask(torch.IntTensor(lengths)).int()
+        padded_output = pad_sequence(output, padding_value=self.pad_idx).int().t()
+        if return_text:
+            return padded_output, mask, texts  # type: ignore
+        return padded_output, mask
+
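A hedged example (requires spaCy; the "en_core_web_sm" model is downloaded if missing):
+tok = WhiteSpaceTokenizer(n_bins=1000)
+tokens, mask = tok(["he didn't know he's going home", None])
+# tokens: [2, T] word-level hash indices after stopword removal and lemmatization;
+# the None entry is padded, so its mask row is all zeros.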
+

Ancestors

+ +

Class variables

+
+
var PUNCTUATIONS
+
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/modules/conv.html b/docs/audiocraft/modules/conv.html new file mode 100644 index 00000000..0c6281f0 --- /dev/null +++ b/docs/audiocraft/modules/conv.html @@ -0,0 +1,1048 @@ + + + + + + +audiocraft.modules.conv API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.modules.conv

+
+
+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import typing as tp
+import warnings
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.nn.utils import spectral_norm, weight_norm
+
+
+CONV_NORMALIZATIONS = frozenset(['none', 'weight_norm', 'spectral_norm',
+                                 'time_group_norm'])
+
+
+def apply_parametrization_norm(module: nn.Module, norm: str = 'none'):
+    assert norm in CONV_NORMALIZATIONS
+    if norm == 'weight_norm':
+        return weight_norm(module)
+    elif norm == 'spectral_norm':
+        return spectral_norm(module)
+    else:
+        # We already checked that norm is in CONV_NORMALIZATIONS, so any other choice
+        # doesn't need reparametrization.
+        return module
+
+
+def get_norm_module(module: nn.Module, causal: bool = False, norm: str = 'none', **norm_kwargs):
+    """Return the proper normalization module. If causal is True, this will ensure the returned
+    module is causal, or raise an error if the normalization doesn't support causal evaluation.
+    """
+    assert norm in CONV_NORMALIZATIONS
+    if norm == 'time_group_norm':
+        if causal:
+            raise ValueError("GroupNorm doesn't support causal evaluation.")
+        assert isinstance(module, nn.modules.conv._ConvNd)
+        return nn.GroupNorm(1, module.out_channels, **norm_kwargs)
+    else:
+        return nn.Identity()
+
+
+def get_extra_padding_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int,
+                                 padding_total: int = 0) -> int:
+    """See `pad_for_conv1d`.
+    """
+    length = x.shape[-1]
+    n_frames = (length - kernel_size + padding_total) / stride + 1
+    ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
+    return ideal_length - length
+
+
+def pad_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0):
+    """Pad for a convolution to make sure that the last window is full.
+    Extra padding is added at the end. This is required to ensure that we can rebuild
+    an output of the same length, as otherwise, even with padding, some time steps
+    might get removed.
+    For instance, with total padding = 4, kernel size = 4, stride = 2:
+        0 0 1 2 3 4 5 0 0   # (0s are padding)
+        1   2   3           # (output frames of a convolution, last 0 is never used)
+        0 0 1 2 3 4 5 0     # (output of tr. conv., but pos. 5 is going to get removed as padding)
+            1 2 3 4         # once padding is removed, we are missing one time step!
+    """
+    extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)
+    return F.pad(x, (0, extra_padding))
+
+
+def pad1d(x: torch.Tensor, paddings: tp.Tuple[int, int], mode: str = 'constant', value: float = 0.):
+    """Tiny wrapper around F.pad, just to allow for reflect padding on small input.
+    If this is the case, we insert extra 0 padding to the right before the reflection happens.
+    """
+    length = x.shape[-1]
+    padding_left, padding_right = paddings
+    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
+    if mode == 'reflect':
+        max_pad = max(padding_left, padding_right)
+        extra_pad = 0
+        if length <= max_pad:
+            extra_pad = max_pad - length + 1
+            x = F.pad(x, (0, extra_pad))
+        padded = F.pad(x, paddings, mode, value)
+        end = padded.shape[-1] - extra_pad
+        return padded[..., :end]
+    else:
+        return F.pad(x, paddings, mode, value)
+
+
+def unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]):
+    """Remove padding from x, handling zero padding properly. Only for 1d!
+    """
+    padding_left, padding_right = paddings
+    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
+    assert (padding_left + padding_right) <= x.shape[-1]
+    end = x.shape[-1] - padding_right
+    return x[..., padding_left: end]
+
+
+class NormConv1d(nn.Module):
+    """Wrapper around Conv1d and normalization applied to this conv
+    to provide a uniform interface across normalization approaches.
+    """
+    def __init__(self, *args, causal: bool = False, norm: str = 'none',
+                 norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
+        super().__init__()
+        self.conv = apply_parametrization_norm(nn.Conv1d(*args, **kwargs), norm)
+        self.norm = get_norm_module(self.conv, causal, norm, **norm_kwargs)
+        self.norm_type = norm
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.norm(x)
+        return x
+
+
+class NormConv2d(nn.Module):
+    """Wrapper around Conv2d and normalization applied to this conv
+    to provide a uniform interface across normalization approaches.
+    """
+    def __init__(self, *args, norm: str = 'none', norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
+        super().__init__()
+        self.conv = apply_parametrization_norm(nn.Conv2d(*args, **kwargs), norm)
+        self.norm = get_norm_module(self.conv, causal=False, norm=norm, **norm_kwargs)
+        self.norm_type = norm
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.norm(x)
+        return x
+
+
+class NormConvTranspose1d(nn.Module):
+    """Wrapper around ConvTranspose1d and normalization applied to this conv
+    to provide a uniform interface across normalization approaches.
+    """
+    def __init__(self, *args, causal: bool = False, norm: str = 'none',
+                 norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
+        super().__init__()
+        self.convtr = apply_parametrization_norm(nn.ConvTranspose1d(*args, **kwargs), norm)
+        self.norm = get_norm_module(self.convtr, causal, norm, **norm_kwargs)
+        self.norm_type = norm
+
+    def forward(self, x):
+        x = self.convtr(x)
+        x = self.norm(x)
+        return x
+
+
+class NormConvTranspose2d(nn.Module):
+    """Wrapper around ConvTranspose2d and normalization applied to this conv
+    to provide a uniform interface across normalization approaches.
+    """
+    def __init__(self, *args, norm: str = 'none', norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
+        super().__init__()
+        self.convtr = apply_parametrization_norm(nn.ConvTranspose2d(*args, **kwargs), norm)
+        self.norm = get_norm_module(self.convtr, causal=False, norm=norm, **norm_kwargs)
+
+    def forward(self, x):
+        x = self.convtr(x)
+        x = self.norm(x)
+        return x
+
+
+class StreamableConv1d(nn.Module):
+    """Conv1d with some builtin handling of asymmetric or causal padding
+    and normalization.
+    """
+    def __init__(self, in_channels: int, out_channels: int,
+                 kernel_size: int, stride: int = 1, dilation: int = 1,
+                 groups: int = 1, bias: bool = True, causal: bool = False,
+                 norm: str = 'none', norm_kwargs: tp.Dict[str, tp.Any] = {},
+                 pad_mode: str = 'reflect'):
+        super().__init__()
+        # warn user on unusual setup between dilation and stride
+        if stride > 1 and dilation > 1:
+            warnings.warn('StreamableConv1d has been initialized with stride > 1 and dilation > 1'
+                          f' (kernel_size={kernel_size} stride={stride}, dilation={dilation}).')
+        self.conv = NormConv1d(in_channels, out_channels, kernel_size, stride,
+                               dilation=dilation, groups=groups, bias=bias, causal=causal,
+                               norm=norm, norm_kwargs=norm_kwargs)
+        self.causal = causal
+        self.pad_mode = pad_mode
+
+    def forward(self, x):
+        B, C, T = x.shape
+        kernel_size = self.conv.conv.kernel_size[0]
+        stride = self.conv.conv.stride[0]
+        dilation = self.conv.conv.dilation[0]
+        kernel_size = (kernel_size - 1) * dilation + 1  # effective kernel size with dilations
+        padding_total = kernel_size - stride
+        extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)
+        if self.causal:
+            # Left padding for causal
+            x = pad1d(x, (padding_total, extra_padding), mode=self.pad_mode)
+        else:
+            # Asymmetric padding required for odd strides
+            padding_right = padding_total // 2
+            padding_left = padding_total - padding_right
+            x = pad1d(x, (padding_left, padding_right + extra_padding), mode=self.pad_mode)
+        return self.conv(x)
+
+
+class StreamableConvTranspose1d(nn.Module):
+    """ConvTranspose1d with some builtin handling of asymmetric or causal padding
+    and normalization.
+    """
+    def __init__(self, in_channels: int, out_channels: int,
+                 kernel_size: int, stride: int = 1, causal: bool = False,
+                 norm: str = 'none', trim_right_ratio: float = 1.,
+                 norm_kwargs: tp.Dict[str, tp.Any] = {}):
+        super().__init__()
+        self.convtr = NormConvTranspose1d(in_channels, out_channels, kernel_size, stride,
+                                          causal=causal, norm=norm, norm_kwargs=norm_kwargs)
+        self.causal = causal
+        self.trim_right_ratio = trim_right_ratio
+        assert self.causal or self.trim_right_ratio == 1., \
+            "`trim_right_ratio` != 1.0 only makes sense for causal convolutions"
+        assert self.trim_right_ratio >= 0. and self.trim_right_ratio <= 1.
+
+    def forward(self, x):
+        kernel_size = self.convtr.convtr.kernel_size[0]
+        stride = self.convtr.convtr.stride[0]
+        padding_total = kernel_size - stride
+
+        y = self.convtr(x)
+
+        # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be
+        # removed at the very end, when keeping only the right length for the output,
+        # as removing it here would require also passing the length at the matching layer
+        # in the encoder.
+        if self.causal:
+            # Trim the padding on the right according to the specified ratio
+            # if trim_right_ratio = 1.0, trim everything from right
+            padding_right = math.ceil(padding_total * self.trim_right_ratio)
+            padding_left = padding_total - padding_right
+            y = unpad1d(y, (padding_left, padding_right))
+        else:
+            # Asymmetric padding required for odd strides
+            padding_right = padding_total // 2
+            padding_left = padding_total - padding_right
+            y = unpad1d(y, (padding_left, padding_right))
+        return y
+
+
+
+
+
+
+
+

Functions

+
+
+def apply_parametrization_norm(module: torch.nn.modules.module.Module, norm: str = 'none') +
+
+
+
+ +Expand source code + +
def apply_parametrization_norm(module: nn.Module, norm: str = 'none'):
+    assert norm in CONV_NORMALIZATIONS
+    if norm == 'weight_norm':
+        return weight_norm(module)
+    elif norm == 'spectral_norm':
+        return spectral_norm(module)
+    else:
+        # We already checked that norm is in CONV_NORMALIZATIONS, so any other choice
+        # doesn't need reparametrization.
+        return module
+
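For instance, wrapping a convolution with weight normalization (a minimal sketch):
+from torch import nn
+
+conv = apply_parametrization_norm(nn.Conv1d(1, 8, kernel_size=3), norm='weight_norm')
+# 'weight_norm' and 'spectral_norm' reparametrize the module;
+# any other allowed value returns it unchanged.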
+
+
+def get_extra_padding_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0) ‑> int +
+
+ +
+ +Expand source code + +
def get_extra_padding_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int,
+                                 padding_total: int = 0) -> int:
+    """See `pad_for_conv1d`.
+    """
+    length = x.shape[-1]
+    n_frames = (length - kernel_size + padding_total) / stride + 1
+    ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
+    return ideal_length - length
+
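A small numeric sanity check of the formula (values picked by hand for illustration):
+import torch
+
+x = torch.zeros(1, 1, 9)
+# n_frames = (9 - 4 + 0) / 2 + 1 = 3.5, so ideal_length = 3 * 2 + 4 = 10
+print(get_extra_padding_for_conv1d(x, kernel_size=4, stride=2, padding_total=0))  # 1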
+
+
+def get_norm_module(module: torch.nn.modules.module.Module, causal: bool = False, norm: str = 'none', **norm_kwargs) +
+
+

Return the proper normalization module. If causal is True, this will ensure the returned +module is causal, or raise an error if the normalization doesn't support causal evaluation.

+
+ +Expand source code + +
def get_norm_module(module: nn.Module, causal: bool = False, norm: str = 'none', **norm_kwargs):
+    """Return the proper normalization module. If causal is True, this will ensure the returned
+    module is causal, or raise an error if the normalization doesn't support causal evaluation.
+    """
+    assert norm in CONV_NORMALIZATIONS
+    if norm == 'time_group_norm':
+        if causal:
+            raise ValueError("GroupNorm doesn't support causal evaluation.")
+        assert isinstance(module, nn.modules.conv._ConvNd)
+        return nn.GroupNorm(1, module.out_channels, **norm_kwargs)
+    else:
+        return nn.Identity()
+
+
+
+def pad1d(x: torch.Tensor, paddings: Tuple[int, int], mode: str = 'constant', value: float = 0.0) +
+
+

Tiny wrapper around F.pad, just to allow for reflect padding on small input. +If this is the case, we insert extra 0 padding to the right before the reflection happens.

+
+ +Expand source code + +
def pad1d(x: torch.Tensor, paddings: tp.Tuple[int, int], mode: str = 'constant', value: float = 0.):
+    """Tiny wrapper around F.pad, just to allow for reflect padding on small input.
+    If this is the case, we insert extra 0 padding to the right before the reflection happens.
+    """
+    length = x.shape[-1]
+    padding_left, padding_right = paddings
+    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
+    if mode == 'reflect':
+        max_pad = max(padding_left, padding_right)
+        extra_pad = 0
+        if length <= max_pad:
+            extra_pad = max_pad - length + 1
+            x = F.pad(x, (0, extra_pad))
+        padded = F.pad(x, paddings, mode, value)
+        end = padded.shape[-1] - extra_pad
+        return padded[..., :end]
+    else:
+        return F.pad(x, paddings, mode, value)
+
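A short illustration of the small-input case the wrapper exists for:
+import torch
+
+x = torch.arange(3, dtype=torch.float32).view(1, 1, 3)   # length 3
+y = pad1d(x, (4, 4), mode='reflect')   # plain F.pad would reject pad >= length
+print(y.shape)                         # torch.Size([1, 1, 11])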
+
+
+def pad_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0) +
+
+

Pad for a convolution to make sure that the last window is full.
+Extra padding is added at the end. This is required to ensure that we can rebuild
+an output of the same length, as otherwise, even with padding, some time steps
+might get removed.
+For instance, with total padding = 4, kernel size = 4, stride = 2:
+    0 0 1 2 3 4 5 0 0   # (0s are padding)
+    1   2   3           # (output frames of a convolution, last 0 is never used)
+    0 0 1 2 3 4 5 0     # (output of tr. conv., but pos. 5 is going to get removed as padding)
+        1 2 3 4         # once padding is removed, we are missing one time step!

+
+ +Expand source code + +
def pad_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0):
+    """Pad for a convolution to make sure that the last window is full.
+    Extra padding is added at the end. This is required to ensure that we can rebuild
+    an output of the same length, as otherwise, even with padding, some time steps
+    might get removed.
+    For instance, with total padding = 4, kernel size = 4, stride = 2:
+        0 0 1 2 3 4 5 0 0   # (0s are padding)
+        1   2   3           # (output frames of a convolution, last 0 is never used)
+        0 0 1 2 3 4 5 0     # (output of tr. conv., but pos. 5 is going to get removed as padding)
+            1 2 3 4         # once padding is removed, we are missing one time step!
+    """
+    extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)
+    return F.pad(x, (0, extra_padding))
+
+
+
+def unpad1d(x: torch.Tensor, paddings: Tuple[int, int]) +
+
+

Remove padding from x, handling zero padding properly. Only for 1d!

+
+ +Expand source code + +
def unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]):
+    """Remove padding from x, handling zero padding properly. Only for 1d!
+    """
+    padding_left, padding_right = paddings
+    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
+    assert (padding_left + padding_right) <= x.shape[-1]
+    end = x.shape[-1] - padding_right
+    return x[..., padding_left: end]
+
+
+
+
+
+

Classes

+
+
+class NormConv1d +(*args, causal: bool = False, norm: str = 'none', norm_kwargs: Dict[str, Any] = {}, **kwargs) +
+
+

Wrapper around Conv1d and normalization applied to this conv +to provide a uniform interface across normalization approaches.

+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class NormConv1d(nn.Module):
+    """Wrapper around Conv1d and normalization applied to this conv
+    to provide a uniform interface across normalization approaches.
+    """
+    def __init__(self, *args, causal: bool = False, norm: str = 'none',
+                 norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
+        super().__init__()
+        self.conv = apply_parametrization_norm(nn.Conv1d(*args, **kwargs), norm)
+        self.norm = get_norm_module(self.conv, causal, norm, **norm_kwargs)
+        self.norm_type = norm
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.norm(x)
+        return x
+
+

Ancestors

+
    +
  • torch.nn.modules.module.Module
  • +
+

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def forward(self, x) ‑> Callable[..., Any] +
+
+

Defines the computation performed at every call.

+

Should be overridden by all subclasses.

+
+

Note

+

Although the recipe for forward pass needs to be defined within +this function, one should call the Module instance afterwards +instead of this since the former takes care of running the +registered hooks while the latter silently ignores them.

+
+
+ +Expand source code + +
def forward(self, x):
+    x = self.conv(x)
+    x = self.norm(x)
+    return x
+
+
+
+
+
+class NormConv2d +(*args, norm: str = 'none', norm_kwargs: Dict[str, Any] = {}, **kwargs) +
+
+

Wrapper around Conv2d and normalization applied to this conv +to provide a uniform interface across normalization approaches.

+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class NormConv2d(nn.Module):
+    """Wrapper around Conv2d and normalization applied to this conv
+    to provide a uniform interface across normalization approaches.
+    """
+    def __init__(self, *args, norm: str = 'none', norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
+        super().__init__()
+        self.conv = apply_parametrization_norm(nn.Conv2d(*args, **kwargs), norm)
+        self.norm = get_norm_module(self.conv, causal=False, norm=norm, **norm_kwargs)
+        self.norm_type = norm
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.norm(x)
+        return x
+
+

Ancestors

+
    +
  • torch.nn.modules.module.Module
  • +
+

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def forward(self, x) ‑> Callable[..., Any] +
+
+

Defines the computation performed at every call.

+

Should be overridden by all subclasses.

+
+

Note

+

Although the recipe for forward pass needs to be defined within +this function, one should call the Module instance afterwards +instead of this since the former takes care of running the +registered hooks while the latter silently ignores them.

+
+
+ +Expand source code + +
def forward(self, x):
+    x = self.conv(x)
+    x = self.norm(x)
+    return x
+
+
+
+
+
+class NormConvTranspose1d +(*args, causal: bool = False, norm: str = 'none', norm_kwargs: Dict[str, Any] = {}, **kwargs) +
+
+

Wrapper around ConvTranspose1d and normalization applied to this conv +to provide a uniform interface across normalization approaches.

+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class NormConvTranspose1d(nn.Module):
+    """Wrapper around ConvTranspose1d and normalization applied to this conv
+    to provide a uniform interface across normalization approaches.
+    """
+    def __init__(self, *args, causal: bool = False, norm: str = 'none',
+                 norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
+        super().__init__()
+        self.convtr = apply_parametrization_norm(nn.ConvTranspose1d(*args, **kwargs), norm)
+        self.norm = get_norm_module(self.convtr, causal, norm, **norm_kwargs)
+        self.norm_type = norm
+
+    def forward(self, x):
+        x = self.convtr(x)
+        x = self.norm(x)
+        return x
+
+

Ancestors

+
    +
  • torch.nn.modules.module.Module
  • +
+

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def forward(self, x) ‑> Callable[..., Any] +
+
+

Defines the computation performed at every call.

+

Should be overridden by all subclasses.

+
+

Note

+

Although the recipe for forward pass needs to be defined within +this function, one should call the Module instance afterwards +instead of this since the former takes care of running the +registered hooks while the latter silently ignores them.

+
+
+ +Expand source code + +
def forward(self, x):
+    x = self.convtr(x)
+    x = self.norm(x)
+    return x
+
+
+
+
+
+class NormConvTranspose2d +(*args, norm: str = 'none', norm_kwargs: Dict[str, Any] = {}, **kwargs) +
+
+

Wrapper around ConvTranspose2d and normalization applied to this conv +to provide a uniform interface across normalization approaches.

+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class NormConvTranspose2d(nn.Module):
+    """Wrapper around ConvTranspose2d and normalization applied to this conv
+    to provide a uniform interface across normalization approaches.
+    """
+    def __init__(self, *args, norm: str = 'none', norm_kwargs: tp.Dict[str, tp.Any] = {}, **kwargs):
+        super().__init__()
+        self.convtr = apply_parametrization_norm(nn.ConvTranspose2d(*args, **kwargs), norm)
+        self.norm = get_norm_module(self.convtr, causal=False, norm=norm, **norm_kwargs)
+
+    def forward(self, x):
+        x = self.convtr(x)
+        x = self.norm(x)
+        return x
+
+

Ancestors

+
    +
  • torch.nn.modules.module.Module
  • +
+

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def forward(self, x) ‑> Callable[..., Any] +
+
+

Defines the computation performed at every call.

+

Should be overridden by all subclasses.

+
+

Note

+

Although the recipe for forward pass needs to be defined within +this function, one should call the Module instance afterwards +instead of this since the former takes care of running the +registered hooks while the latter silently ignores them.

+
+
+ +Expand source code + +
def forward(self, x):
+    x = self.convtr(x)
+    x = self.norm(x)
+    return x
+
+
+
+
+
+class StreamableConv1d +(in_channels: int, out_channels: int, kernel_size: int, stride: int = 1, dilation: int = 1, groups: int = 1, bias: bool = True, causal: bool = False, norm: str = 'none', norm_kwargs: Dict[str, Any] = {}, pad_mode: str = 'reflect') +
+
+

Conv1d with some builtin handling of asymmetric or causal padding +and normalization.

+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class StreamableConv1d(nn.Module):
+    """Conv1d with some builtin handling of asymmetric or causal padding
+    and normalization.
+    """
+    def __init__(self, in_channels: int, out_channels: int,
+                 kernel_size: int, stride: int = 1, dilation: int = 1,
+                 groups: int = 1, bias: bool = True, causal: bool = False,
+                 norm: str = 'none', norm_kwargs: tp.Dict[str, tp.Any] = {},
+                 pad_mode: str = 'reflect'):
+        super().__init__()
+        # warn user on unusual setup between dilation and stride
+        if stride > 1 and dilation > 1:
+            warnings.warn('StreamableConv1d has been initialized with stride > 1 and dilation > 1'
+                          f' (kernel_size={kernel_size} stride={stride}, dilation={dilation}).')
+        self.conv = NormConv1d(in_channels, out_channels, kernel_size, stride,
+                               dilation=dilation, groups=groups, bias=bias, causal=causal,
+                               norm=norm, norm_kwargs=norm_kwargs)
+        self.causal = causal
+        self.pad_mode = pad_mode
+
+    def forward(self, x):
+        B, C, T = x.shape
+        kernel_size = self.conv.conv.kernel_size[0]
+        stride = self.conv.conv.stride[0]
+        dilation = self.conv.conv.dilation[0]
+        kernel_size = (kernel_size - 1) * dilation + 1  # effective kernel size with dilations
+        padding_total = kernel_size - stride
+        extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)
+        if self.causal:
+            # Left padding for causal
+            x = pad1d(x, (padding_total, extra_padding), mode=self.pad_mode)
+        else:
+            # Asymmetric padding required for odd strides
+            padding_right = padding_total // 2
+            padding_left = padding_total - padding_right
+            x = pad1d(x, (padding_left, padding_right + extra_padding), mode=self.pad_mode)
+        return self.conv(x)
+
+

Ancestors

+
    +
  • torch.nn.modules.module.Module
  • +
+

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def forward(self, x) ‑> Callable[..., Any] +
+
+

Defines the computation performed at every call.

+

Should be overridden by all subclasses.

+
+

Note

+

Although the recipe for forward pass needs to be defined within +this function, one should call the Module instance afterwards +instead of this since the former takes care of running the +registered hooks while the latter silently ignores them.

+
+
+ +Expand source code + +
def forward(self, x):
+    B, C, T = x.shape
+    kernel_size = self.conv.conv.kernel_size[0]
+    stride = self.conv.conv.stride[0]
+    dilation = self.conv.conv.dilation[0]
+    kernel_size = (kernel_size - 1) * dilation + 1  # effective kernel size with dilations
+    padding_total = kernel_size - stride
+    extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)
+    if self.causal:
+        # Left padding for causal
+        x = pad1d(x, (padding_total, extra_padding), mode=self.pad_mode)
+    else:
+        # Asymmetric padding required for odd strides
+        padding_right = padding_total // 2
+        padding_left = padding_total - padding_right
+        x = pad1d(x, (padding_left, padding_right + extra_padding), mode=self.pad_mode)
+    return self.conv(x)
+
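A hedged shape check: with stride 1 the built-in padding preserves the temporal length, and causal=True only moves all of that padding to the left:
+import torch
+
+conv = StreamableConv1d(1, 8, kernel_size=7, stride=1, causal=True, pad_mode='constant')
+x = torch.randn(2, 1, 100)
+print(conv(x).shape)   # torch.Size([2, 8, 100])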
+
+
+
+
+class StreamableConvTranspose1d +(in_channels: int, out_channels: int, kernel_size: int, stride: int = 1, causal: bool = False, norm: str = 'none', trim_right_ratio: float = 1.0, norm_kwargs: Dict[str, Any] = {}) +
+
+

ConvTranspose1d with some builtin handling of asymmetric or causal padding +and normalization.

+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class StreamableConvTranspose1d(nn.Module):
+    """ConvTranspose1d with some builtin handling of asymmetric or causal padding
+    and normalization.
+    """
+    def __init__(self, in_channels: int, out_channels: int,
+                 kernel_size: int, stride: int = 1, causal: bool = False,
+                 norm: str = 'none', trim_right_ratio: float = 1.,
+                 norm_kwargs: tp.Dict[str, tp.Any] = {}):
+        super().__init__()
+        self.convtr = NormConvTranspose1d(in_channels, out_channels, kernel_size, stride,
+                                          causal=causal, norm=norm, norm_kwargs=norm_kwargs)
+        self.causal = causal
+        self.trim_right_ratio = trim_right_ratio
+        assert self.causal or self.trim_right_ratio == 1., \
+            "`trim_right_ratio` != 1.0 only makes sense for causal convolutions"
+        assert self.trim_right_ratio >= 0. and self.trim_right_ratio <= 1.
+
+    def forward(self, x):
+        kernel_size = self.convtr.convtr.kernel_size[0]
+        stride = self.convtr.convtr.stride[0]
+        padding_total = kernel_size - stride
+
+        y = self.convtr(x)
+
+        # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be
+        # removed at the very end, when keeping only the right length for the output,
+        # as removing it here would require also passing the length at the matching layer
+        # in the encoder.
+        if self.causal:
+            # Trim the padding on the right according to the specified ratio
+            # if trim_right_ratio = 1.0, trim everything from right
+            padding_right = math.ceil(padding_total * self.trim_right_ratio)
+            padding_left = padding_total - padding_right
+            y = unpad1d(y, (padding_left, padding_right))
+        else:
+            # Asymmetric padding required for odd strides
+            padding_right = padding_total // 2
+            padding_left = padding_total - padding_right
+            y = unpad1d(y, (padding_left, padding_right))
+        return y
+
+

Ancestors

+
    +
  • torch.nn.modules.module.Module
  • +
+

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def forward(self, x) ‑> Callable[..., Any] +
+
+

Defines the computation performed at every call.

+

Should be overridden by all subclasses.

+
+

Note

+

Although the recipe for forward pass needs to be defined within +this function, one should call the Module instance afterwards +instead of this since the former takes care of running the +registered hooks while the latter silently ignores them.

+
+
+ +Expand source code + +
def forward(self, x):
+    kernel_size = self.convtr.convtr.kernel_size[0]
+    stride = self.convtr.convtr.stride[0]
+    padding_total = kernel_size - stride
+
+    y = self.convtr(x)
+
+    # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be
+    # removed at the very end, when keeping only the right length for the output,
+    # as removing it here would require also passing the length at the matching layer
+    # in the encoder.
+    if self.causal:
+        # Trim the padding on the right according to the specified ratio
+        # if trim_right_ratio = 1.0, trim everything from right
+        padding_right = math.ceil(padding_total * self.trim_right_ratio)
+        padding_left = padding_total - padding_right
+        y = unpad1d(y, (padding_left, padding_right))
+    else:
+        # Asymmetric padding required for odd strides
+        padding_right = padding_total // 2
+        padding_left = padding_total - padding_right
+        y = unpad1d(y, (padding_left, padding_right))
+    return y
+
+
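For illustration, here is a minimal sketch of how the trimming above works out numerically. It assumes a causal layer with kernel_size=4 and stride=2 (so padding_total = 2) and arbitrary tensor sizes; with the default trim_right_ratio=1.0, all of the fixed padding is removed from the right, leaving exactly stride * T output frames.

import torch
from audiocraft.modules.conv import StreamableConvTranspose1d

convtr = StreamableConvTranspose1d(8, 8, kernel_size=4, stride=2, causal=True)
x = torch.randn(1, 8, 50)   # [batch, channels, time]
y = convtr(x)
# The raw ConvTranspose1d output would be (50 - 1) * 2 + 4 = 102 frames;
# trimming padding_total = 2 frames from the right leaves 50 * 2 = 100.
print(y.shape)              # expected: torch.Size([1, 8, 100])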
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/modules/index.html b/docs/audiocraft/modules/index.html new file mode 100644 index 00000000..f012a824 --- /dev/null +++ b/docs/audiocraft/modules/index.html @@ -0,0 +1,131 @@ + + + + + + +audiocraft.modules API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.modules

+
+
+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# flake8: noqa
+from .conv import (
+    NormConv1d,
+    NormConv2d,
+    NormConvTranspose1d,
+    NormConvTranspose2d,
+    StreamableConv1d,
+    StreamableConvTranspose1d,
+    pad_for_conv1d,
+    pad1d,
+    unpad1d,
+)
+from .lstm import StreamableLSTM
+from .seanet import SEANetEncoder, SEANetDecoder
+
+
+
+

Sub-modules

+
+
audiocraft.modules.activations
+
+
+
+
audiocraft.modules.codebooks_patterns
+
+
+
+
audiocraft.modules.conditioners
+
+
+
+
audiocraft.modules.conv
+
+
+
+
audiocraft.modules.lstm
+
+
+
+
audiocraft.modules.rope
+
+
+
+
audiocraft.modules.seanet
+
+
+
+
audiocraft.modules.streaming
+
+

Streaming module API that should be implemented by all Streaming components,

+
+
audiocraft.modules.transformer
+
+

Transformer model, with streaming support, xformer attention support +and easy causal attention with a potentially finite receptive field …

+
+
+
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/modules/lstm.html b/docs/audiocraft/modules/lstm.html new file mode 100644 index 00000000..ad20d54e --- /dev/null +++ b/docs/audiocraft/modules/lstm.html @@ -0,0 +1,177 @@ + + + + + + +audiocraft.modules.lstm API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.modules.lstm

+
+
+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from torch import nn
+
+
+class StreamableLSTM(nn.Module):
+    """LSTM without worrying about the hidden state, nor the layout of the data.
+    Expects input as convolutional layout.
+    """
+    def __init__(self, dimension: int, num_layers: int = 2, skip: bool = True):
+        super().__init__()
+        self.skip = skip
+        self.lstm = nn.LSTM(dimension, dimension, num_layers)
+
+    def forward(self, x):
+        x = x.permute(2, 0, 1)
+        y, _ = self.lstm(x)
+        if self.skip:
+            y = y + x
+        y = y.permute(1, 2, 0)
+        return y
+
+
+
+
+
+
+
+
+
+

Classes

+
+
+class StreamableLSTM +(dimension: int, num_layers: int = 2, skip: bool = True) +
+
+

LSTM without worrying about the hidden state, nor the layout of the data. +Expects input as convolutional layout.

+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class StreamableLSTM(nn.Module):
+    """LSTM without worrying about the hidden state, nor the layout of the data.
+    Expects input as convolutional layout.
+    """
+    def __init__(self, dimension: int, num_layers: int = 2, skip: bool = True):
+        super().__init__()
+        self.skip = skip
+        self.lstm = nn.LSTM(dimension, dimension, num_layers)
+
+    def forward(self, x):
+        x = x.permute(2, 0, 1)
+        y, _ = self.lstm(x)
+        if self.skip:
+            y = y + x
+        y = y.permute(1, 2, 0)
+        return y
+
+
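As a quick sketch of the "convolutional layout" convention mentioned above (batch, channels, time), with arbitrary sizes:

import torch
from audiocraft.modules.lstm import StreamableLSTM

lstm = StreamableLSTM(dimension=64, num_layers=2, skip=True)
x = torch.randn(2, 64, 100)   # [batch, channels, time], as produced by a conv stack
y = lstm(x)                   # internally permuted to [time, batch, channels], skip added
print(y.shape)                # expected: torch.Size([2, 64, 100]) -- same layout in and out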

Ancestors

+
    +
  • torch.nn.modules.module.Module
  • +
+

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def forward(self, x) ‑> Callable[..., Any] +
+
+

Defines the computation performed at every call.

+

Should be overridden by all subclasses.

+
+

Note

+

Although the recipe for forward pass needs to be defined within +this function, one should call the Module instance afterwards +instead of this since the former takes care of running the +registered hooks while the latter silently ignores them.

+
+
+ +Expand source code + +
def forward(self, x):
+    x = x.permute(2, 0, 1)
+    y, _ = self.lstm(x)
+    if self.skip:
+        y = y + x
+    y = y.permute(1, 2, 0)
+    return y
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/modules/rope.html b/docs/audiocraft/modules/rope.html new file mode 100644 index 00000000..57e7eb5e --- /dev/null +++ b/docs/audiocraft/modules/rope.html @@ -0,0 +1,595 @@ + + + + + + +audiocraft.modules.rope API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.modules.rope

+
+
+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import typing as tp
+
+from torch import nn
+import torch
+
+
+class XPos(nn.Module):
+    """Length-extrapolatable positional embedding (xPos) from [Sun et al 2022](https://arxiv.org/abs/2212.10554v1).
+    This applies an exponential decay to the RoPE rotation matrix.
+
+    Args:
+        dim (int): Embedding dimension.
+        smoothing (float): Smoothing factor applied to the decay rates.
+        base_scale (int): Base decay rate, given in terms of scaling time.
+        device (torch.device or None): Device on which to initialize the module.
+        dtype (torch.dtype): dtype to use to generate the embedding.
+    """
+    def __init__(self, dim: int, smoothing: float = 0.4, base_scale: int = 512,
+                 device=None, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        assert dim % 2 == 0
+        assert dtype in [torch.float64, torch.float32]
+        self.dtype = dtype
+        self.base_scale = base_scale
+
+        half_dim = dim // 2
+        adim = torch.arange(half_dim, device=device, dtype=dtype)
+        decay_rates = (adim / half_dim + smoothing) / (1.0 + smoothing)
+        self.register_buffer("decay_rates", decay_rates)
+        self.decay: tp.Optional[torch.Tensor] = None
+
+    def get_decay(self, start: int, end: int):
+        """Create complex decay tensor, cache values for fast computation.
+        """
+        if self.decay is None or end > self.decay.shape[0]:
+            assert isinstance(self.decay_rates, torch.Tensor)  # Satisfy type checker.
+            idx = torch.arange(end, device=self.decay_rates.device, dtype=self.dtype)
+            power = idx / self.base_scale
+            scale = self.decay_rates ** power.unsqueeze(-1)
+            self.decay = torch.polar(scale, torch.zeros_like(scale))
+        return self.decay[start:end]  # [T, C/2]
+
+
+class RotaryEmbedding(nn.Module):
+    """Rotary positional embedding (RoPE) from [Su et al 2022](https://arxiv.org/abs/2104.09864).
+
+    Args:
+        dim (int): Embedding dimension (twice the number of frequencies).
+        max_period (float): Maximum period of the rotation frequencies.
+        xpos (bool): Use xPos, applies an exponential decay to rotation matrix.
+        scale (float): Scale of positional embedding, set to 0 to deactivate.
+        device (torch.device or None): Device on which to initialize the module.
+        dtype (torch.dtype): dtype to use to generate the embedding.
+    """
+    def __init__(self, dim: int, max_period: float = 10000.0, xpos: bool = False,
+                 scale: float = 1.0, device=None, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        assert dim % 2 == 0
+        self.scale = scale
+        assert dtype in [torch.float64, torch.float32]
+        self.dtype = dtype
+
+        adim = torch.arange(0, dim, 2, device=device, dtype=dtype)[: (dim // 2)]
+        frequencies = 1.0 / (max_period ** (adim / dim))
+        self.register_buffer("frequencies", frequencies)
+        self.rotation: tp.Optional[torch.Tensor] = None
+
+        self.xpos = XPos(dim, device=device, dtype=dtype) if xpos else None
+
+    def get_rotation(self, start: int, end: int):
+        """Create complex rotation tensor, cache values for fast computation.
+        """
+        if self.rotation is None or end > self.rotation.shape[0]:
+            assert isinstance(self.frequencies, torch.Tensor)  # Satisfy type checker.
+            idx = torch.arange(end, device=self.frequencies.device, dtype=self.dtype)
+            angles = torch.outer(idx, self.frequencies)
+            self.rotation = torch.polar(torch.ones_like(angles), angles)
+        return self.rotation[start:end]
+
+    def rotate(self, x: torch.Tensor, start: int = 0, invert_decay: bool = False):
+        """Apply rope rotation to query or key tensor.
+        """
+        T = x.shape[1]
+        rotation = self.get_rotation(start, start + T).unsqueeze(0).unsqueeze(2)
+
+        if self.xpos:
+            decay = self.xpos.get_decay(start, start + T).unsqueeze(0).unsqueeze(2)
+        else:
+            decay = 1.0
+
+        if invert_decay:
+            decay = decay ** -1
+
+        x_complex = torch.view_as_complex(x.to(self.dtype).reshape(*x.shape[:-1], -1, 2))
+        scaled_rotation = (rotation * decay) * self.scale + (1.0 - self.scale)
+        x_out = torch.view_as_real(x_complex * scaled_rotation).flatten(-2)
+
+        return x_out.type_as(x)
+
+    def rotate_qk(self, query: torch.Tensor, key: torch.Tensor, start: int = 0):
+        """ Apply rope rotation to both query and key tensors.
+        Supports streaming mode, in which query and key are not expected to have the same shape.
+        In streaming mode, key will be of length [P + C] with P the cached past timesteps, but
+        query will be [C] (typically C == 1).
+
+        Args:
+            query (torch.Tensor): Query to rotate.
+            key (torch.Tensor): Key to rotate.
+            start (int): Start index of the sequence for time offset.
+        """
+        query_timesteps = query.shape[1]
+        key_timesteps = key.shape[1]
+        streaming_offset = key_timesteps - query_timesteps
+
+        query_out = self.rotate(query, start + streaming_offset)
+        key_out = self.rotate(key, start, invert_decay=True)
+
+        return query_out, key_out
+
+
+
+
+
+
+
+
+
+

Classes

+
+
+class RotaryEmbedding +(dim: int, max_period: float = 10000.0, xpos: bool = False, scale: float = 1.0, device=None, dtype: torch.dtype = torch.float32) +
+
+

Rotary positional embedding (RoPE) from Su et al 2022.

+

Args

+
+
dim : int
+
Embedding dimension (twice the number of frequencies).
+
max_period : float
+
Maximum period of the rotation frequencies.
+
xpos : bool
+
Use xPos, applies an exponential decay to rotation matrix.
+
scale : float
+
Scale of positional embedding, set to 0 to deactivate.
+
device : torch.device or None
+
Device on which to initialize the module.
+
dtype : torch.dtype
+
dtype to use to generate the embedding.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class RotaryEmbedding(nn.Module):
+    """Rotary positional embedding (RoPE) from [Su et al 2022](https://arxiv.org/abs/2104.09864).
+
+    Args:
+        dim (int): Embedding dimension (twice the number of frequencies).
+        max_period (float): Maximum period of the rotation frequencies.
+        xpos (bool): Use xPos, applies an exponential decay to rotation matrix.
+        scale (float): Scale of positional embedding, set to 0 to deactivate.
+        device (torch.device or None): Device on which to initialize the module.
+        dtype (torch.dtype): dtype to use to generate the embedding.
+    """
+    def __init__(self, dim: int, max_period: float = 10000.0, xpos: bool = False,
+                 scale: float = 1.0, device=None, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        assert dim % 2 == 0
+        self.scale = scale
+        assert dtype in [torch.float64, torch.float32]
+        self.dtype = dtype
+
+        adim = torch.arange(0, dim, 2, device=device, dtype=dtype)[: (dim // 2)]
+        frequencies = 1.0 / (max_period ** (adim / dim))
+        self.register_buffer("frequencies", frequencies)
+        self.rotation: tp.Optional[torch.Tensor] = None
+
+        self.xpos = XPos(dim, device=device, dtype=dtype) if xpos else None
+
+    def get_rotation(self, start: int, end: int):
+        """Create complex rotation tensor, cache values for fast computation.
+        """
+        if self.rotation is None or end > self.rotation.shape[0]:
+            assert isinstance(self.frequencies, torch.Tensor)  # Satisfy type checker.
+            idx = torch.arange(end, device=self.frequencies.device, dtype=self.dtype)
+            angles = torch.outer(idx, self.frequencies)
+            self.rotation = torch.polar(torch.ones_like(angles), angles)
+        return self.rotation[start:end]
+
+    def rotate(self, x: torch.Tensor, start: int = 0, invert_decay: bool = False):
+        """Apply rope rotation to query or key tensor.
+        """
+        T = x.shape[1]
+        rotation = self.get_rotation(start, start + T).unsqueeze(0).unsqueeze(2)
+
+        if self.xpos:
+            decay = self.xpos.get_decay(start, start + T).unsqueeze(0).unsqueeze(2)
+        else:
+            decay = 1.0
+
+        if invert_decay:
+            decay = decay ** -1
+
+        x_complex = torch.view_as_complex(x.to(self.dtype).reshape(*x.shape[:-1], -1, 2))
+        scaled_rotation = (rotation * decay) * self.scale + (1.0 - self.scale)
+        x_out = torch.view_as_real(x_complex * scaled_rotation).flatten(-2)
+
+        return x_out.type_as(x)
+
+    def rotate_qk(self, query: torch.Tensor, key: torch.Tensor, start: int = 0):
+        """ Apply rope rotation to both query and key tensors.
+        Supports streaming mode, in which query and key are not expected to have the same shape.
+    In streaming mode, key will be of length [P + C] with P the cached past timesteps, but
+        query will be [C] (typically C == 1).
+
+        Args:
+            query (torch.Tensor): Query to rotate.
+            key (torch.Tensor): Key to rotate.
+            start (int): Start index of the sequence for time offset.
+        """
+        query_timesteps = query.shape[1]
+        key_timesteps = key.shape[1]
+        streaming_offset = key_timesteps - query_timesteps
+
+        query_out = self.rotate(query, start + streaming_offset)
+        key_out = self.rotate(key, start, invert_decay=True)
+
+        return query_out, key_out
+
+

Ancestors

+
    +
  • torch.nn.modules.module.Module
  • +
+

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def forward(self, *input: Any) ‑> None +
+
+

Defines the computation performed at every call.

+

Should be overridden by all subclasses.

+
+

Note

+

Although the recipe for forward pass needs to be defined within +this function, one should call the Module instance afterwards +instead of this since the former takes care of running the +registered hooks while the latter silently ignores them.

+
+
+ +Expand source code + +
def _forward_unimplemented(self, *input: Any) -> None:
+    r"""Defines the computation performed at every call.
+
+    Should be overridden by all subclasses.
+
+    .. note::
+        Although the recipe for forward pass needs to be defined within
+        this function, one should call the :class:`Module` instance afterwards
+        instead of this since the former takes care of running the
+        registered hooks while the latter silently ignores them.
+    """
+    raise NotImplementedError(f"Module [{type(self).__name__}] is missing the required \"forward\" function")
+
+
+
+def get_rotation(self, start: int, end: int) +
+
+

Create complex rotation tensor, cache values for fast computation.

+
+ +Expand source code + +
def get_rotation(self, start: int, end: int):
+    """Create complex rotation tensor, cache values for fast computation.
+    """
+    if self.rotation is None or end > self.rotation.shape[0]:
+        assert isinstance(self.frequencies, torch.Tensor)  # Satisfy type checker.
+        idx = torch.arange(end, device=self.frequencies.device, dtype=self.dtype)
+        angles = torch.outer(idx, self.frequencies)
+        self.rotation = torch.polar(torch.ones_like(angles), angles)
+    return self.rotation[start:end]
+
+
+
+def rotate(self, x: torch.Tensor, start: int = 0, invert_decay: bool = False) +
+
+

Apply rope rotation to query or key tensor.

+
+ +Expand source code + +
def rotate(self, x: torch.Tensor, start: int = 0, invert_decay: bool = False):
+    """Apply rope rotation to query or key tensor.
+    """
+    T = x.shape[1]
+    rotation = self.get_rotation(start, start + T).unsqueeze(0).unsqueeze(2)
+
+    if self.xpos:
+        decay = self.xpos.get_decay(start, start + T).unsqueeze(0).unsqueeze(2)
+    else:
+        decay = 1.0
+
+    if invert_decay:
+        decay = decay ** -1
+
+    x_complex = torch.view_as_complex(x.to(self.dtype).reshape(*x.shape[:-1], -1, 2))
+    scaled_rotation = (rotation * decay) * self.scale + (1.0 - self.scale)
+    x_out = torch.view_as_real(x_complex * scaled_rotation).flatten(-2)
+
+    return x_out.type_as(x)
+
+
+
+def rotate_qk(self, query: torch.Tensor, key: torch.Tensor, start: int = 0) +
+
+

Apply rope rotation to both query and key tensors. +Supports streaming mode, in which query and key are not expected to have the same shape. +In streaming mode, key will be of length [P + C] with P the cached past timesteps, but +query will be [C] (typically C == 1).

+

Args

+
+
query : torch.Tensor
+
Query to rotate.
+
key : torch.Tensor
+
Key to rotate.
+
start : int
+
Start index of the sequence for time offset.
+
+
+ +Expand source code + +
def rotate_qk(self, query: torch.Tensor, key: torch.Tensor, start: int = 0):
+    """ Apply rope rotation to both query and key tensors.
+    Supports streaming mode, in which query and key are not expected to have the same shape.
+    In streaming mode, key will be of length [P + C] with P the cached past timesteps, but
+    query will be [C] (typically C == 1).
+
+    Args:
+        query (torch.Tensor): Query to rotate.
+        key (torch.Tensor): Key to rotate.
+        start (int): Start index of the sequence for time offset.
+    """
+    query_timesteps = query.shape[1]
+    key_timesteps = key.shape[1]
+    streaming_offset = key_timesteps - query_timesteps
+
+    query_out = self.rotate(query, start + streaming_offset)
+    key_out = self.rotate(key, start, invert_decay=True)
+
+    return query_out, key_out
+
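A short sketch of the streaming behaviour described above. It assumes query and key tensors shaped [batch, time, heads, dim] (consistent with rotate() indexing time on dim 1); the sizes are arbitrary.

import torch
from audiocraft.modules.rope import RotaryEmbedding

rope = RotaryEmbedding(dim=64)
# Streaming step: 8 cached timesteps in the key plus the current one.
query = torch.randn(1, 1, 4, 64)   # only the new timestep
key = torch.randn(1, 9, 4, 64)     # past + current timesteps
q_rot, k_rot = rope.rotate_qk(query, key, start=0)
# The query is rotated with an offset of key_timesteps - query_timesteps = 8,
# so it lines up with the last position of the key.
print(q_rot.shape, k_rot.shape)    # expected: torch.Size([1, 1, 4, 64]) torch.Size([1, 9, 4, 64])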
+
+
+
+
+class XPos +(dim: int, smoothing: float = 0.4, base_scale: int = 512, device=None, dtype: torch.dtype = torch.float32) +
+
+

Length-extrapolatable positional embedding (xPos) from Sun et al 2022. +This applies an exponential decay to the RoPE rotation matrix.

+

Args

+
+
dim : int
+
Embedding dimension.
+
smoothing : float
+
Smoothing factor applied to the decay rates.
+
base_scale : int
+
Base decay rate, given in terms of scaling time.
+
device : torch.device or None
+
Device on which to initialize the module.
+
dtype : torch.dtype
+
dtype to use to generate the embedding.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class XPos(nn.Module):
+    """Length-extrapolatable positional embedding (xPos) from [Sun et al 2022](https://arxiv.org/abs/2212.10554v1).
+    This applies an exponential decay to the RoPE rotation matrix.
+
+    Args:
+        dim (int): Embedding dimension.
+        smoothing (float): Smoothing factor applied to the decay rates.
+        base_scale (int): Base decay rate, given in terms of scaling time.
+        device (torch.device or None): Device on which to initialize the module.
+        dtype (torch.dtype): dtype to use to generate the embedding.
+    """
+    def __init__(self, dim: int, smoothing: float = 0.4, base_scale: int = 512,
+                 device=None, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        assert dim % 2 == 0
+        assert dtype in [torch.float64, torch.float32]
+        self.dtype = dtype
+        self.base_scale = base_scale
+
+        half_dim = dim // 2
+        adim = torch.arange(half_dim, device=device, dtype=dtype)
+        decay_rates = (adim / half_dim + smoothing) / (1.0 + smoothing)
+        self.register_buffer("decay_rates", decay_rates)
+        self.decay: tp.Optional[torch.Tensor] = None
+
+    def get_decay(self, start: int, end: int):
+        """Create complex decay tensor, cache values for fast computation.
+        """
+        if self.decay is None or end > self.decay.shape[0]:
+            assert isinstance(self.decay_rates, torch.Tensor)  # Satisfy type checker.
+            idx = torch.arange(end, device=self.decay_rates.device, dtype=self.dtype)
+            power = idx / self.base_scale
+            scale = self.decay_rates ** power.unsqueeze(-1)
+            self.decay = torch.polar(scale, torch.zeros_like(scale))
+        return self.decay[start:end]  # [T, C/2]
+
+

Ancestors

+
    +
  • torch.nn.modules.module.Module
  • +
+

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def forward(self, *input: Any) ‑> None +
+
+

Defines the computation performed at every call.

+

Should be overridden by all subclasses.

+
+

Note

+

Although the recipe for forward pass needs to be defined within +this function, one should call the Module instance afterwards +instead of this since the former takes care of running the +registered hooks while the latter silently ignores them.

+
+
+ +Expand source code + +
def _forward_unimplemented(self, *input: Any) -> None:
+    r"""Defines the computation performed at every call.
+
+    Should be overridden by all subclasses.
+
+    .. note::
+        Although the recipe for forward pass needs to be defined within
+        this function, one should call the :class:`Module` instance afterwards
+        instead of this since the former takes care of running the
+        registered hooks while the latter silently ignores them.
+    """
+    raise NotImplementedError(f"Module [{type(self).__name__}] is missing the required \"forward\" function")
+
+
+
+def get_decay(self, start: int, end: int) +
+
+

Create complex decay tensor, cache values for fast computation.

+
+ +Expand source code + +
def get_decay(self, start: int, end: int):
+    """Create complex decay tensor, cache values for fast computation.
+    """
+    if self.decay is None or end > self.decay.shape[0]:
+        assert isinstance(self.decay_rates, torch.Tensor)  # Satisfy type checker.
+        idx = torch.arange(end, device=self.decay_rates.device, dtype=self.dtype)
+        power = idx / self.base_scale
+        scale = self.decay_rates ** power.unsqueeze(-1)
+        self.decay = torch.polar(scale, torch.zeros_like(scale))
+    return self.decay[start:end]  # [T, C/2]
+
+
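A tiny sketch of the caching behaviour of get_decay (fresh XPos instance, arbitrary sizes): the first call builds and caches the complex decay tensor, and later calls that fit inside the cached range only slice it.

from audiocraft.modules.rope import XPos

xpos = XPos(dim=64)
d1 = xpos.get_decay(0, 16)   # builds and caches the decay for 16 timesteps
d2 = xpos.get_decay(4, 12)   # end <= cached length, so this is a pure slice
print(d1.shape, d2.shape)    # expected: torch.Size([16, 32]) torch.Size([8, 32]), i.e. [T, C/2]
print(d1.dtype)              # expected: torch.complex64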
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/modules/seanet.html b/docs/audiocraft/modules/seanet.html new file mode 100644 index 00000000..831a462b --- /dev/null +++ b/docs/audiocraft/modules/seanet.html @@ -0,0 +1,879 @@ + + + + + + +audiocraft.modules.seanet API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.modules.seanet

+
+
+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import typing as tp
+
+import numpy as np
+import torch.nn as nn
+
+from .conv import StreamableConv1d, StreamableConvTranspose1d
+from .lstm import StreamableLSTM
+
+
+class SEANetResnetBlock(nn.Module):
+    """Residual block from SEANet model.
+
+    Args:
+        dim (int): Dimension of the input/output.
+        kernel_sizes (list): List of kernel sizes for the convolutions.
+        dilations (list): List of dilations for the convolutions.
+        activation (str): Activation function.
+        activation_params (dict): Parameters to provide to the activation function.
+        norm (str): Normalization method.
+        norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
+        causal (bool): Whether to use fully causal convolution.
+        pad_mode (str): Padding mode for the convolutions.
+        compress (int): Reduced dimensionality in residual branches (from Demucs v3).
+        true_skip (bool): Whether to use true skip connection or a simple
+            (streamable) convolution as the skip connection.
+    """
+    def __init__(self, dim: int, kernel_sizes: tp.List[int] = [3, 1], dilations: tp.List[int] = [1, 1],
+                 activation: str = 'ELU', activation_params: dict = {'alpha': 1.0},
+                 norm: str = 'none', norm_params: tp.Dict[str, tp.Any] = {}, causal: bool = False,
+                 pad_mode: str = 'reflect', compress: int = 2, true_skip: bool = True):
+        super().__init__()
+        assert len(kernel_sizes) == len(dilations), 'Number of kernel sizes should match number of dilations'
+        act = getattr(nn, activation)
+        hidden = dim // compress
+        block = []
+        for i, (kernel_size, dilation) in enumerate(zip(kernel_sizes, dilations)):
+            in_chs = dim if i == 0 else hidden
+            out_chs = dim if i == len(kernel_sizes) - 1 else hidden
+            block += [
+                act(**activation_params),
+                StreamableConv1d(in_chs, out_chs, kernel_size=kernel_size, dilation=dilation,
+                                 norm=norm, norm_kwargs=norm_params,
+                                 causal=causal, pad_mode=pad_mode),
+            ]
+        self.block = nn.Sequential(*block)
+        self.shortcut: nn.Module
+        if true_skip:
+            self.shortcut = nn.Identity()
+        else:
+            self.shortcut = StreamableConv1d(dim, dim, kernel_size=1, norm=norm, norm_kwargs=norm_params,
+                                             causal=causal, pad_mode=pad_mode)
+
+    def forward(self, x):
+        return self.shortcut(x) + self.block(x)
+
+
+class SEANetEncoder(nn.Module):
+    """SEANet encoder.
+
+    Args:
+        channels (int): Audio channels.
+        dimension (int): Intermediate representation dimension.
+        n_filters (int): Base width for the model.
+        n_residual_layers (int): nb of residual layers.
+        ratios (Sequence[int]): kernel size and stride ratios. The encoder uses downsampling ratios instead of
+            upsampling ratios, hence it will use the ratios in the reverse order to the ones specified here
+            that must match the decoder order. We use the decoder order as some models may only employ the decoder.
+        activation (str): Activation function.
+        activation_params (dict): Parameters to provide to the activation function.
+        norm (str): Normalization method.
+        norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
+        kernel_size (int): Kernel size for the initial convolution.
+        last_kernel_size (int): Kernel size for the last convolution.
+        residual_kernel_size (int): Kernel size for the residual layers.
+        dilation_base (int): How much to increase the dilation with each layer.
+        causal (bool): Whether to use fully causal convolution.
+        pad_mode (str): Padding mode for the convolutions.
+        true_skip (bool): Whether to use true skip connection or a simple
+            (streamable) convolution as the skip connection in the residual network blocks.
+        compress (int): Reduced dimensionality in residual branches (from Demucs v3).
+        lstm (int): Number of LSTM layers at the end of the encoder.
+        disable_norm_outer_blocks (int): Number of blocks for which we don't apply norm.
+            For the encoder, it corresponds to the N first blocks.
+    """
+    def __init__(self, channels: int = 1, dimension: int = 128, n_filters: int = 32, n_residual_layers: int = 3,
+                 ratios: tp.List[int] = [8, 5, 4, 2], activation: str = 'ELU', activation_params: dict = {'alpha': 1.0},
+                 norm: str = 'none', norm_params: tp.Dict[str, tp.Any] = {}, kernel_size: int = 7,
+                 last_kernel_size: int = 7, residual_kernel_size: int = 3, dilation_base: int = 2, causal: bool = False,
+                 pad_mode: str = 'reflect', true_skip: bool = True, compress: int = 2, lstm: int = 0,
+                 disable_norm_outer_blocks: int = 0):
+        super().__init__()
+        self.channels = channels
+        self.dimension = dimension
+        self.n_filters = n_filters
+        self.ratios = list(reversed(ratios))
+        del ratios
+        self.n_residual_layers = n_residual_layers
+        self.hop_length = np.prod(self.ratios)
+        self.n_blocks = len(self.ratios) + 2  # first and last conv + residual blocks
+        self.disable_norm_outer_blocks = disable_norm_outer_blocks
+        assert self.disable_norm_outer_blocks >= 0 and self.disable_norm_outer_blocks <= self.n_blocks, \
+            "Number of blocks for which to disable norm is invalid." \
+            "It should be lower or equal to the actual number of blocks in the network and greater or equal to 0."
+
+        act = getattr(nn, activation)
+        mult = 1
+        model: tp.List[nn.Module] = [
+            StreamableConv1d(channels, mult * n_filters, kernel_size,
+                             norm='none' if self.disable_norm_outer_blocks >= 1 else norm,
+                             norm_kwargs=norm_params, causal=causal, pad_mode=pad_mode)
+        ]
+        # Downsample to raw audio scale
+        for i, ratio in enumerate(self.ratios):
+            block_norm = 'none' if self.disable_norm_outer_blocks >= i + 2 else norm
+            # Add residual layers
+            for j in range(n_residual_layers):
+                model += [
+                    SEANetResnetBlock(mult * n_filters, kernel_sizes=[residual_kernel_size, 1],
+                                      dilations=[dilation_base ** j, 1],
+                                      norm=block_norm, norm_params=norm_params,
+                                      activation=activation, activation_params=activation_params,
+                                      causal=causal, pad_mode=pad_mode, compress=compress, true_skip=true_skip)]
+
+            # Add downsampling layers
+            model += [
+                act(**activation_params),
+                StreamableConv1d(mult * n_filters, mult * n_filters * 2,
+                                 kernel_size=ratio * 2, stride=ratio,
+                                 norm=block_norm, norm_kwargs=norm_params,
+                                 causal=causal, pad_mode=pad_mode),
+            ]
+            mult *= 2
+
+        if lstm:
+            model += [StreamableLSTM(mult * n_filters, num_layers=lstm)]
+
+        model += [
+            act(**activation_params),
+            StreamableConv1d(mult * n_filters, dimension, last_kernel_size,
+                             norm='none' if self.disable_norm_outer_blocks == self.n_blocks else norm,
+                             norm_kwargs=norm_params, causal=causal, pad_mode=pad_mode)
+        ]
+
+        self.model = nn.Sequential(*model)
+
+    def forward(self, x):
+        return self.model(x)
+
+
+class SEANetDecoder(nn.Module):
+    """SEANet decoder.
+
+    Args:
+        channels (int): Audio channels.
+        dimension (int): Intermediate representation dimension.
+        n_filters (int): Base width for the model.
+        n_residual_layers (int): nb of residual layers.
+        ratios (Sequence[int]): kernel size and stride ratios.
+        activation (str): Activation function.
+        activation_params (dict): Parameters to provide to the activation function.
+        final_activation (str): Final activation function after all convolutions.
+        final_activation_params (dict): Parameters to provide to the activation function.
+        norm (str): Normalization method.
+        norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
+        kernel_size (int): Kernel size for the initial convolution.
+        last_kernel_size (int): Kernel size for the last convolution.
+        residual_kernel_size (int): Kernel size for the residual layers.
+        dilation_base (int): How much to increase the dilation with each layer.
+        causal (bool): Whether to use fully causal convolution.
+        pad_mode (str): Padding mode for the convolutions.
+        true_skip (bool): Whether to use true skip connection or a simple
+            (streamable) convolution as the skip connection in the residual network blocks.
+        compress (int): Reduced dimensionality in residual branches (from Demucs v3).
+        lstm (int): Number of LSTM layers at the start of the decoder.
+        disable_norm_outer_blocks (int): Number of blocks for which we don't apply norm.
+            For the decoder, it corresponds to the N last blocks.
+        trim_right_ratio (float): Ratio for trimming at the right of the transposed convolution under the causal setup.
+            If equal to 1.0, it means that all the trimming is done at the right.
+    """
+    def __init__(self, channels: int = 1, dimension: int = 128, n_filters: int = 32, n_residual_layers: int = 3,
+                 ratios: tp.List[int] = [8, 5, 4, 2], activation: str = 'ELU', activation_params: dict = {'alpha': 1.0},
+                 final_activation: tp.Optional[str] = None, final_activation_params: tp.Optional[dict] = None,
+                 norm: str = 'none', norm_params: tp.Dict[str, tp.Any] = {}, kernel_size: int = 7,
+                 last_kernel_size: int = 7, residual_kernel_size: int = 3, dilation_base: int = 2, causal: bool = False,
+                 pad_mode: str = 'reflect', true_skip: bool = True, compress: int = 2, lstm: int = 0,
+                 disable_norm_outer_blocks: int = 0, trim_right_ratio: float = 1.0):
+        super().__init__()
+        self.dimension = dimension
+        self.channels = channels
+        self.n_filters = n_filters
+        self.ratios = ratios
+        del ratios
+        self.n_residual_layers = n_residual_layers
+        self.hop_length = np.prod(self.ratios)
+        self.n_blocks = len(self.ratios) + 2  # first and last conv + residual blocks
+        self.disable_norm_outer_blocks = disable_norm_outer_blocks
+        assert self.disable_norm_outer_blocks >= 0 and self.disable_norm_outer_blocks <= self.n_blocks, \
+            "Number of blocks for which to disable norm is invalid." \
+            "It should be lower or equal to the actual number of blocks in the network and greater or equal to 0."
+
+        act = getattr(nn, activation)
+        mult = int(2 ** len(self.ratios))
+        model: tp.List[nn.Module] = [
+            StreamableConv1d(dimension, mult * n_filters, kernel_size,
+                             norm='none' if self.disable_norm_outer_blocks == self.n_blocks else norm,
+                             norm_kwargs=norm_params, causal=causal, pad_mode=pad_mode)
+        ]
+
+        if lstm:
+            model += [StreamableLSTM(mult * n_filters, num_layers=lstm)]
+
+        # Upsample to raw audio scale
+        for i, ratio in enumerate(self.ratios):
+            block_norm = 'none' if self.disable_norm_outer_blocks >= self.n_blocks - (i + 1) else norm
+            # Add upsampling layers
+            model += [
+                act(**activation_params),
+                StreamableConvTranspose1d(mult * n_filters, mult * n_filters // 2,
+                                          kernel_size=ratio * 2, stride=ratio,
+                                          norm=block_norm, norm_kwargs=norm_params,
+                                          causal=causal, trim_right_ratio=trim_right_ratio),
+            ]
+            # Add residual layers
+            for j in range(n_residual_layers):
+                model += [
+                    SEANetResnetBlock(mult * n_filters // 2, kernel_sizes=[residual_kernel_size, 1],
+                                      dilations=[dilation_base ** j, 1],
+                                      activation=activation, activation_params=activation_params,
+                                      norm=block_norm, norm_params=norm_params, causal=causal,
+                                      pad_mode=pad_mode, compress=compress, true_skip=true_skip)]
+
+            mult //= 2
+
+        # Add final layers
+        model += [
+            act(**activation_params),
+            StreamableConv1d(n_filters, channels, last_kernel_size,
+                             norm='none' if self.disable_norm_outer_blocks >= 1 else norm,
+                             norm_kwargs=norm_params, causal=causal, pad_mode=pad_mode)
+        ]
+        # Add optional final activation to decoder (eg. tanh)
+        if final_activation is not None:
+            final_act = getattr(nn, final_activation)
+            final_activation_params = final_activation_params or {}
+            model += [
+                final_act(**final_activation_params)
+            ]
+        self.model = nn.Sequential(*model)
+
+    def forward(self, z):
+        y = self.model(z)
+        return y
+
+
+
+
+
+
+
+
+
+

Classes

+
+
+class SEANetDecoder +(channels: int = 1, dimension: int = 128, n_filters: int = 32, n_residual_layers: int = 3, ratios: List[int] = [8, 5, 4, 2], activation: str = 'ELU', activation_params: dict = {'alpha': 1.0}, final_activation: Optional[str] = None, final_activation_params: Optional[dict] = None, norm: str = 'none', norm_params: Dict[str, Any] = {}, kernel_size: int = 7, last_kernel_size: int = 7, residual_kernel_size: int = 3, dilation_base: int = 2, causal: bool = False, pad_mode: str = 'reflect', true_skip: bool = True, compress: int = 2, lstm: int = 0, disable_norm_outer_blocks: int = 0, trim_right_ratio: float = 1.0) +
+
+

SEANet decoder.

+

Args

+
+
channels : int
+
Audio channels.
+
dimension : int
+
Intermediate representation dimension.
+
n_filters : int
+
Base width for the model.
+
n_residual_layers : int
+
nb of residual layers.
+
ratios : Sequence[int]
+
kernel size and stride ratios.
+
activation : str
+
Activation function.
+
activation_params : dict
+
Parameters to provide to the activation function.
+
final_activation : str
+
Final activation function after all convolutions.
+
final_activation_params : dict
+
Parameters to provide to the activation function.
+
norm : str
+
Normalization method.
+
norm_params : dict
+
Parameters to provide to the underlying normalization used along with the convolution.
+
kernel_size : int
+
Kernel size for the initial convolution.
+
last_kernel_size : int
+
Kernel size for the last convolution.
+
residual_kernel_size : int
+
Kernel size for the residual layers.
+
dilation_base : int
+
How much to increase the dilation with each layer.
+
causal : bool
+
Whether to use fully causal convolution.
+
pad_mode : str
+
Padding mode for the convolutions.
+
true_skip : bool
+
Whether to use true skip connection or a simple +(streamable) convolution as the skip connection in the residual network blocks.
+
compress : int
+
Reduced dimensionality in residual branches (from Demucs v3).
+
lstm : int
+
Number of LSTM layers at the start of the decoder.
+
disable_norm_outer_blocks : int
+
Number of blocks for which we don't apply norm. +For the decoder, it corresponds to the N last blocks.
+
trim_right_ratio : float
+
Ratio for trimming at the right of the transposed convolution under the causal setup. +If equal to 1.0, it means that all the trimming is done at the right.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class SEANetDecoder(nn.Module):
+    """SEANet decoder.
+
+    Args:
+        channels (int): Audio channels.
+        dimension (int): Intermediate representation dimension.
+        n_filters (int): Base width for the model.
+        n_residual_layers (int): nb of residual layers.
+        ratios (Sequence[int]): kernel size and stride ratios.
+        activation (str): Activation function.
+        activation_params (dict): Parameters to provide to the activation function.
+        final_activation (str): Final activation function after all convolutions.
+        final_activation_params (dict): Parameters to provide to the activation function.
+        norm (str): Normalization method.
+        norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
+        kernel_size (int): Kernel size for the initial convolution.
+        last_kernel_size (int): Kernel size for the last convolution.
+        residual_kernel_size (int): Kernel size for the residual layers.
+        dilation_base (int): How much to increase the dilation with each layer.
+        causal (bool): Whether to use fully causal convolution.
+        pad_mode (str): Padding mode for the convolutions.
+        true_skip (bool): Whether to use true skip connection or a simple
+            (streamable) convolution as the skip connection in the residual network blocks.
+        compress (int): Reduced dimensionality in residual branches (from Demucs v3).
+        lstm (int): Number of LSTM layers at the start of the decoder.
+        disable_norm_outer_blocks (int): Number of blocks for which we don't apply norm.
+            For the decoder, it corresponds to the N last blocks.
+        trim_right_ratio (float): Ratio for trimming at the right of the transposed convolution under the causal setup.
+            If equal to 1.0, it means that all the trimming is done at the right.
+    """
+    def __init__(self, channels: int = 1, dimension: int = 128, n_filters: int = 32, n_residual_layers: int = 3,
+                 ratios: tp.List[int] = [8, 5, 4, 2], activation: str = 'ELU', activation_params: dict = {'alpha': 1.0},
+                 final_activation: tp.Optional[str] = None, final_activation_params: tp.Optional[dict] = None,
+                 norm: str = 'none', norm_params: tp.Dict[str, tp.Any] = {}, kernel_size: int = 7,
+                 last_kernel_size: int = 7, residual_kernel_size: int = 3, dilation_base: int = 2, causal: bool = False,
+                 pad_mode: str = 'reflect', true_skip: bool = True, compress: int = 2, lstm: int = 0,
+                 disable_norm_outer_blocks: int = 0, trim_right_ratio: float = 1.0):
+        super().__init__()
+        self.dimension = dimension
+        self.channels = channels
+        self.n_filters = n_filters
+        self.ratios = ratios
+        del ratios
+        self.n_residual_layers = n_residual_layers
+        self.hop_length = np.prod(self.ratios)
+        self.n_blocks = len(self.ratios) + 2  # first and last conv + residual blocks
+        self.disable_norm_outer_blocks = disable_norm_outer_blocks
+        assert self.disable_norm_outer_blocks >= 0 and self.disable_norm_outer_blocks <= self.n_blocks, \
+            "Number of blocks for which to disable norm is invalid." \
+            "It should be lower or equal to the actual number of blocks in the network and greater or equal to 0."
+
+        act = getattr(nn, activation)
+        mult = int(2 ** len(self.ratios))
+        model: tp.List[nn.Module] = [
+            StreamableConv1d(dimension, mult * n_filters, kernel_size,
+                             norm='none' if self.disable_norm_outer_blocks == self.n_blocks else norm,
+                             norm_kwargs=norm_params, causal=causal, pad_mode=pad_mode)
+        ]
+
+        if lstm:
+            model += [StreamableLSTM(mult * n_filters, num_layers=lstm)]
+
+        # Upsample to raw audio scale
+        for i, ratio in enumerate(self.ratios):
+            block_norm = 'none' if self.disable_norm_outer_blocks >= self.n_blocks - (i + 1) else norm
+            # Add upsampling layers
+            model += [
+                act(**activation_params),
+                StreamableConvTranspose1d(mult * n_filters, mult * n_filters // 2,
+                                          kernel_size=ratio * 2, stride=ratio,
+                                          norm=block_norm, norm_kwargs=norm_params,
+                                          causal=causal, trim_right_ratio=trim_right_ratio),
+            ]
+            # Add residual layers
+            for j in range(n_residual_layers):
+                model += [
+                    SEANetResnetBlock(mult * n_filters // 2, kernel_sizes=[residual_kernel_size, 1],
+                                      dilations=[dilation_base ** j, 1],
+                                      activation=activation, activation_params=activation_params,
+                                      norm=block_norm, norm_params=norm_params, causal=causal,
+                                      pad_mode=pad_mode, compress=compress, true_skip=true_skip)]
+
+            mult //= 2
+
+        # Add final layers
+        model += [
+            act(**activation_params),
+            StreamableConv1d(n_filters, channels, last_kernel_size,
+                             norm='none' if self.disable_norm_outer_blocks >= 1 else norm,
+                             norm_kwargs=norm_params, causal=causal, pad_mode=pad_mode)
+        ]
+        # Add optional final activation to decoder (eg. tanh)
+        if final_activation is not None:
+            final_act = getattr(nn, final_activation)
+            final_activation_params = final_activation_params or {}
+            model += [
+                final_act(**final_activation_params)
+            ]
+        self.model = nn.Sequential(*model)
+
+    def forward(self, z):
+        y = self.model(z)
+        return y
+
+
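To make the upsampling behaviour concrete, here is a minimal sketch with the default settings (ratios=[8, 5, 4, 2]); the sizes are illustrative only. Each latent frame is expanded back to hop_length samples.

import torch
from audiocraft.modules.seanet import SEANetDecoder

decoder = SEANetDecoder()     # defaults: dimension=128, channels=1, ratios=[8, 5, 4, 2]
print(decoder.hop_length)     # expected: 320 = 8 * 5 * 4 * 2
z = torch.randn(1, 128, 100)  # 100 latent frames of dimension 128
y = decoder(z)
print(y.shape)                # expected: torch.Size([1, 1, 32000]) -- 100 frames * 320 samples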

Ancestors

+
    +
  • torch.nn.modules.module.Module
  • +
+

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def forward(self, z) ‑> Callable[..., Any] +
+
+

Defines the computation performed at every call.

+

Should be overridden by all subclasses.

+
+

Note

+

Although the recipe for forward pass needs to be defined within +this function, one should call the Module instance afterwards +instead of this since the former takes care of running the +registered hooks while the latter silently ignores them.

+
+
+ +Expand source code + +
def forward(self, z):
+    y = self.model(z)
+    return y
+
+
+
+
+
+class SEANetEncoder +(channels: int = 1, dimension: int = 128, n_filters: int = 32, n_residual_layers: int = 3, ratios: List[int] = [8, 5, 4, 2], activation: str = 'ELU', activation_params: dict = {'alpha': 1.0}, norm: str = 'none', norm_params: Dict[str, Any] = {}, kernel_size: int = 7, last_kernel_size: int = 7, residual_kernel_size: int = 3, dilation_base: int = 2, causal: bool = False, pad_mode: str = 'reflect', true_skip: bool = True, compress: int = 2, lstm: int = 0, disable_norm_outer_blocks: int = 0) +
+
+

SEANet encoder.

+

Args

+
+
channels : int
+
Audio channels.
+
dimension : int
+
Intermediate representation dimension.
+
n_filters : int
+
Base width for the model.
+
n_residual_layers : int
+
nb of residual layers.
+
ratios : Sequence[int]
+
kernel size and stride ratios. The encoder uses downsampling ratios instead of +upsampling ratios, hence it will use the ratios in the reverse order to the ones specified here +that must match the decoder order. We use the decoder order as some models may only employ the decoder.
+
activation : str
+
Activation function.
+
activation_params : dict
+
Parameters to provide to the activation function.
+
norm : str
+
Normalization method.
+
norm_params : dict
+
Parameters to provide to the underlying normalization used along with the convolution.
+
kernel_size : int
+
Kernel size for the initial convolution.
+
last_kernel_size : int
+
Kernel size for the last convolution.
+
residual_kernel_size : int
+
Kernel size for the residual layers.
+
dilation_base : int
+
How much to increase the dilation with each layer.
+
causal : bool
+
Whether to use fully causal convolution.
+
pad_mode : str
+
Padding mode for the convolutions.
+
true_skip : bool
+
Whether to use true skip connection or a simple +(streamable) convolution as the skip connection in the residual network blocks.
+
compress : int
+
Reduced dimensionality in residual branches (from Demucs v3).
+
lstm : int
+
Number of LSTM layers at the end of the encoder.
+
disable_norm_outer_blocks : int
+
Number of blocks for which we don't apply norm. +For the encoder, it corresponds to the N first blocks.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class SEANetEncoder(nn.Module):
+    """SEANet encoder.
+
+    Args:
+        channels (int): Audio channels.
+        dimension (int): Intermediate representation dimension.
+        n_filters (int): Base width for the model.
+        n_residual_layers (int): nb of residual layers.
+        ratios (Sequence[int]): kernel size and stride ratios. The encoder uses downsampling ratios instead of
+            upsampling ratios, hence it will use the ratios in the reverse order to the ones specified here
+            that must match the decoder order. We use the decoder order as some models may only employ the decoder.
+        activation (str): Activation function.
+        activation_params (dict): Parameters to provide to the activation function.
+        norm (str): Normalization method.
+        norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
+        kernel_size (int): Kernel size for the initial convolution.
+        last_kernel_size (int): Kernel size for the last convolution.
+        residual_kernel_size (int): Kernel size for the residual layers.
+        dilation_base (int): How much to increase the dilation with each layer.
+        causal (bool): Whether to use fully causal convolution.
+        pad_mode (str): Padding mode for the convolutions.
+        true_skip (bool): Whether to use true skip connection or a simple
+            (streamable) convolution as the skip connection in the residual network blocks.
+        compress (int): Reduced dimensionality in residual branches (from Demucs v3).
+        lstm (int): Number of LSTM layers at the end of the encoder.
+        disable_norm_outer_blocks (int): Number of blocks for which we don't apply norm.
+            For the encoder, it corresponds to the N first blocks.
+    """
+    def __init__(self, channels: int = 1, dimension: int = 128, n_filters: int = 32, n_residual_layers: int = 3,
+                 ratios: tp.List[int] = [8, 5, 4, 2], activation: str = 'ELU', activation_params: dict = {'alpha': 1.0},
+                 norm: str = 'none', norm_params: tp.Dict[str, tp.Any] = {}, kernel_size: int = 7,
+                 last_kernel_size: int = 7, residual_kernel_size: int = 3, dilation_base: int = 2, causal: bool = False,
+                 pad_mode: str = 'reflect', true_skip: bool = True, compress: int = 2, lstm: int = 0,
+                 disable_norm_outer_blocks: int = 0):
+        super().__init__()
+        self.channels = channels
+        self.dimension = dimension
+        self.n_filters = n_filters
+        self.ratios = list(reversed(ratios))
+        del ratios
+        self.n_residual_layers = n_residual_layers
+        self.hop_length = np.prod(self.ratios)
+        self.n_blocks = len(self.ratios) + 2  # first and last conv + residual blocks
+        self.disable_norm_outer_blocks = disable_norm_outer_blocks
+        assert self.disable_norm_outer_blocks >= 0 and self.disable_norm_outer_blocks <= self.n_blocks, \
+            "Number of blocks for which to disable norm is invalid." \
+            "It should be lower or equal to the actual number of blocks in the network and greater or equal to 0."
+
+        act = getattr(nn, activation)
+        mult = 1
+        model: tp.List[nn.Module] = [
+            StreamableConv1d(channels, mult * n_filters, kernel_size,
+                             norm='none' if self.disable_norm_outer_blocks >= 1 else norm,
+                             norm_kwargs=norm_params, causal=causal, pad_mode=pad_mode)
+        ]
+        # Downsample to raw audio scale
+        for i, ratio in enumerate(self.ratios):
+            block_norm = 'none' if self.disable_norm_outer_blocks >= i + 2 else norm
+            # Add residual layers
+            for j in range(n_residual_layers):
+                model += [
+                    SEANetResnetBlock(mult * n_filters, kernel_sizes=[residual_kernel_size, 1],
+                                      dilations=[dilation_base ** j, 1],
+                                      norm=block_norm, norm_params=norm_params,
+                                      activation=activation, activation_params=activation_params,
+                                      causal=causal, pad_mode=pad_mode, compress=compress, true_skip=true_skip)]
+
+            # Add downsampling layers
+            model += [
+                act(**activation_params),
+                StreamableConv1d(mult * n_filters, mult * n_filters * 2,
+                                 kernel_size=ratio * 2, stride=ratio,
+                                 norm=block_norm, norm_kwargs=norm_params,
+                                 causal=causal, pad_mode=pad_mode),
+            ]
+            mult *= 2
+
+        if lstm:
+            model += [StreamableLSTM(mult * n_filters, num_layers=lstm)]
+
+        model += [
+            act(**activation_params),
+            StreamableConv1d(mult * n_filters, dimension, last_kernel_size,
+                             norm='none' if self.disable_norm_outer_blocks == self.n_blocks else norm,
+                             norm_kwargs=norm_params, causal=causal, pad_mode=pad_mode)
+        ]
+
+        self.model = nn.Sequential(*model)
+
+    def forward(self, x):
+        return self.model(x)
+
+
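The mirror image of the decoder sketch above: with the default settings the encoder downsamples the waveform by hop_length, applying the ratios in reverse order as noted in the docstring (again, illustrative sizes only).

import torch
from audiocraft.modules.seanet import SEANetEncoder

encoder = SEANetEncoder()      # defaults: channels=1, dimension=128, ratios=[8, 5, 4, 2]
print(encoder.hop_length)      # expected: 320, applied as [2, 4, 5, 8] when downsampling
x = torch.randn(1, 1, 32000)   # e.g. one second of mono audio at 32 kHz
z = encoder(x)
print(z.shape)                 # expected: torch.Size([1, 128, 100]) -- 32000 / 320 latent frames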

Ancestors

+
    +
  • torch.nn.modules.module.Module
  • +
+

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def forward(self, x) ‑> Callable[..., Any] +
+
+

Defines the computation performed at every call.

+

Should be overridden by all subclasses.

+
+

Note

+

Although the recipe for forward pass needs to be defined within +this function, one should call the Module instance afterwards +instead of this since the former takes care of running the +registered hooks while the latter silently ignores them.

+
+
+ +Expand source code + +
def forward(self, x):
+    return self.model(x)
+
+
+
+
+
+class SEANetResnetBlock +(dim: int, kernel_sizes: List[int] = [3, 1], dilations: List[int] = [1, 1], activation: str = 'ELU', activation_params: dict = {'alpha': 1.0}, norm: str = 'none', norm_params: Dict[str, Any] = {}, causal: bool = False, pad_mode: str = 'reflect', compress: int = 2, true_skip: bool = True) +
+
+

Residual block from SEANet model.

+

Args

+
+
dim : int
+
Dimension of the input/output.
+
kernel_sizes : list
+
List of kernel sizes for the convolutions.
+
dilations : list
+
List of dilations for the convolutions.
+
activation : str
+
Activation function.
+
activation_params : dict
+
Parameters to provide to the activation function.
+
norm : str
+
Normalization method.
+
norm_params : dict
+
Parameters to provide to the underlying normalization used along with the convolution.
+
causal : bool
+
Whether to use fully causal convolution.
+
pad_mode : str
+
Padding mode for the convolutions.
+
compress : int
+
Reduced dimensionality in residual branches (from Demucs v3).
+
true_skip : bool
+
Whether to use true skip connection or a simple +(streamable) convolution as the skip connection.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class SEANetResnetBlock(nn.Module):
+    """Residual block from SEANet model.
+
+    Args:
+        dim (int): Dimension of the input/output.
+        kernel_sizes (list): List of kernel sizes for the convolutions.
+        dilations (list): List of dilations for the convolutions.
+        activation (str): Activation function.
+        activation_params (dict): Parameters to provide to the activation function.
+        norm (str): Normalization method.
+        norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
+        causal (bool): Whether to use fully causal convolution.
+        pad_mode (str): Padding mode for the convolutions.
+        compress (int): Reduced dimensionality in residual branches (from Demucs v3).
+        true_skip (bool): Whether to use true skip connection or a simple
+            (streamable) convolution as the skip connection.
+    """
+    def __init__(self, dim: int, kernel_sizes: tp.List[int] = [3, 1], dilations: tp.List[int] = [1, 1],
+                 activation: str = 'ELU', activation_params: dict = {'alpha': 1.0},
+                 norm: str = 'none', norm_params: tp.Dict[str, tp.Any] = {}, causal: bool = False,
+                 pad_mode: str = 'reflect', compress: int = 2, true_skip: bool = True):
+        super().__init__()
+        assert len(kernel_sizes) == len(dilations), 'Number of kernel sizes should match number of dilations'
+        act = getattr(nn, activation)
+        hidden = dim // compress
+        block = []
+        for i, (kernel_size, dilation) in enumerate(zip(kernel_sizes, dilations)):
+            in_chs = dim if i == 0 else hidden
+            out_chs = dim if i == len(kernel_sizes) - 1 else hidden
+            block += [
+                act(**activation_params),
+                StreamableConv1d(in_chs, out_chs, kernel_size=kernel_size, dilation=dilation,
+                                 norm=norm, norm_kwargs=norm_params,
+                                 causal=causal, pad_mode=pad_mode),
+            ]
+        self.block = nn.Sequential(*block)
+        self.shortcut: nn.Module
+        if true_skip:
+            self.shortcut = nn.Identity()
+        else:
+            self.shortcut = StreamableConv1d(dim, dim, kernel_size=1, norm=norm, norm_kwargs=norm_params,
+                                             causal=causal, pad_mode=pad_mode)
+
+    def forward(self, x):
+        return self.shortcut(x) + self.block(x)
+
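For orientation, a minimal usage sketch of this residual block in isolation follows. It is illustrative only: the import path `audiocraft.modules.seanet` and the tensor shapes are assumptions, and in practice the block is wired up by the SEANet encoder/decoder rather than used directly.

    # Hedged sketch: run the residual block on a dummy [batch, channels, time] tensor.
    import torch
    from audiocraft.modules.seanet import SEANetResnetBlock  # assumed module path

    block = SEANetResnetBlock(dim=32, kernel_sizes=[3, 1], dilations=[1, 1],
                              norm='none', causal=False, compress=2, true_skip=True)
    x = torch.randn(2, 32, 1000)
    y = block(x)                    # shortcut(x) + block(x), same shape as the input
    assert y.shape == x.shape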
+

Ancestors

+
    +
  • torch.nn.modules.module.Module
  • +
+

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def forward(self, x) ‑> Callable[..., Any] +
+
+

Defines the computation performed at every call.

+

Should be overridden by all subclasses.

+
+

Note

+

Although the recipe for forward pass needs to be defined within this function, one should call the :class:`Module` instance afterwards instead of this since the former takes care of running the registered hooks while the latter silently ignores them.

+
+
+ +Expand source code + +
def forward(self, x):
+    return self.shortcut(x) + self.block(x)
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/modules/streaming.html b/docs/audiocraft/modules/streaming.html new file mode 100644 index 00000000..9e2df1f9 --- /dev/null +++ b/docs/audiocraft/modules/streaming.html @@ -0,0 +1,573 @@ + + + + + + +audiocraft.modules.streaming API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.modules.streaming

+
+
+

Streaming module API that should be implemented by all Streaming components.

+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Streaming module API that should be implemented by all Streaming components.
+"""
+
+from contextlib import contextmanager
+import typing as tp
+from torch import nn
+import torch
+
+
+State = tp.Dict[str, torch.Tensor]
+
+
+class StreamingModule(nn.Module):
+    """Common API for streaming components.
+
+    Each streaming component has a streaming state, which is just a dict[str, Tensor].
+    By convention, the first dim of each tensor must be the batch size.
+    Don't use dots in the key names, as this would clash with submodules
+    (like in state_dict).
+
+    If `self._is_streaming` is True, the component should use and remember
+    the proper state inside `self._streaming_state`.
+
+    To set a streaming component in streaming state, use
+
+        with module.streaming():
+            ...
+
+    This will automatically reset the streaming state when exiting the context manager.
+    This also automatically propagates to all streaming child modules.
+
+    Some modules might also implement the `StreamingModule.flush` method, although
+    this one is trickier, as all parent modules must be StreamingModule and implement
+    it as well for it to work properly. See `StreamingSequential` below.
+    """
+    def __init__(self) -> None:
+        super().__init__()
+        self._streaming_state: State = {}
+        self._is_streaming = False
+
+    def _apply_named_streaming(self, fn: tp.Any):
+        for name, module in self.named_modules():
+            if isinstance(module, StreamingModule):
+                fn(name, module)
+
+    def _set_streaming(self, streaming: bool):
+        def _set_streaming(name, module):
+            module._is_streaming = streaming
+        self._apply_named_streaming(_set_streaming)
+
+    @contextmanager
+    def streaming(self):
+        """Context manager to enter streaming mode. Reset streaming state on exit.
+        """
+        self._set_streaming(True)
+        try:
+            yield
+        finally:
+            self._set_streaming(False)
+            self.reset_streaming()
+
+    def reset_streaming(self):
+        """Reset the streaming state.
+        """
+        def _reset(name: str, module: StreamingModule):
+            module._streaming_state.clear()
+
+        self._apply_named_streaming(_reset)
+
+    def get_streaming_state(self) -> State:
+        """Return the streaming state, including that of sub-modules.
+        """
+        state: State = {}
+
+        def _add(name: str, module: StreamingModule):
+            if name:
+                name += "."
+            for key, value in module._streaming_state.items():
+                state[name + key] = value
+
+        self._apply_named_streaming(_add)
+        return state
+
+    def set_streaming_state(self, state: State):
+        """Set the streaming state, including that of sub-modules.
+        """
+        state = dict(state)
+
+        def _set(name: str, module: StreamingModule):
+            if name:
+                name += "."
+            module._streaming_state.clear()
+            for key, value in list(state.items()):
+                # complexity is not ideal here, but probably fine.
+                if key.startswith(name):
+                    local_key = key[len(name):]
+                    if '.' not in local_key:
+                        module._streaming_state[local_key] = value
+                        del state[key]
+
+        self._apply_named_streaming(_set)
+        assert len(state) == 0, list(state.keys())
+
+    def flush(self, x: tp.Optional[torch.Tensor] = None):
+        """Flush any remaining outputs that were waiting for completion.
+        Typically, for convolutions, this will add the final padding
+        and process the last buffer.
+
+        This should take an optional argument `x`, which will be provided
+        if a module before this one in the streaming pipeline has already
+        spit out a flushed buffer.
+        """
+        if x is None:
+            return None
+        else:
+            return self(x)
+
+
+class StreamingSequential(StreamingModule, nn.Sequential):
+    """A streaming-compatible alternative to `nn.Sequential`.
+    """
+    def flush(self, x: tp.Optional[torch.Tensor] = None):
+        for module in self:
+            if isinstance(module, StreamingModule):
+                x = module.flush(x)
+            elif x is not None:
+                x = module(x)
+        return x
+
+
+
+
+
+
+
+
+
+

Classes

+
+
+class StreamingModule +
+
+

Common API for streaming components.

+

Each streaming component has a streaming state, which is just a dict[str, Tensor]. By convention, the first dim of each tensor must be the batch size. Don't use dots in the key names, as this would clash with submodules (like in state_dict).

+

If self._is_streaming is True, the component should use and remember the proper state inside self._streaming_state.

+

To set a streaming component in streaming state, use

+
with module.streaming():
+    ...
+
+

This will automatically reset the streaming state when exiting the context manager. This also automatically propagates to all streaming child modules.

+

Some modules might also implement the StreamingModule.flush() method, although this one is trickier, as all parent modules must be StreamingModule and implement it as well for it to work properly. See StreamingSequential below.

+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class StreamingModule(nn.Module):
+    """Common API for streaming components.
+
+    Each streaming component has a streaming state, which is just a dict[str, Tensor].
+    By convention, the first dim of each tensor must be the batch size.
+    Don't use dots in the key names, as this would clash with submodules
+    (like in state_dict).
+
+    If `self._is_streaming` is True, the component should use and remember
+    the proper state inside `self._streaming_state`.
+
+    To set a streaming component in streaming state, use
+
+        with module.streaming():
+            ...
+
+    This will automatically reset the streaming state when exiting the context manager.
+    This also automatically propagates to all streaming child modules.
+
+    Some modules might also implement the `StreamingModule.flush` method, although
+    this one is trickier, as all parent modules must be StreamingModule and implement
+    it as well for it to work properly. See `StreamingSequential` below.
+    """
+    def __init__(self) -> None:
+        super().__init__()
+        self._streaming_state: State = {}
+        self._is_streaming = False
+
+    def _apply_named_streaming(self, fn: tp.Any):
+        for name, module in self.named_modules():
+            if isinstance(module, StreamingModule):
+                fn(name, module)
+
+    def _set_streaming(self, streaming: bool):
+        def _set_streaming(name, module):
+            module._is_streaming = streaming
+        self._apply_named_streaming(_set_streaming)
+
+    @contextmanager
+    def streaming(self):
+        """Context manager to enter streaming mode. Reset streaming state on exit.
+        """
+        self._set_streaming(True)
+        try:
+            yield
+        finally:
+            self._set_streaming(False)
+            self.reset_streaming()
+
+    def reset_streaming(self):
+        """Reset the streaming state.
+        """
+        def _reset(name: str, module: StreamingModule):
+            module._streaming_state.clear()
+
+        self._apply_named_streaming(_reset)
+
+    def get_streaming_state(self) -> State:
+        """Return the streaming state, including that of sub-modules.
+        """
+        state: State = {}
+
+        def _add(name: str, module: StreamingModule):
+            if name:
+                name += "."
+            for key, value in module._streaming_state.items():
+                state[name + key] = value
+
+        self._apply_named_streaming(_add)
+        return state
+
+    def set_streaming_state(self, state: State):
+        """Set the streaming state, including that of sub-modules.
+        """
+        state = dict(state)
+
+        def _set(name: str, module: StreamingModule):
+            if name:
+                name += "."
+            module._streaming_state.clear()
+            for key, value in list(state.items()):
+                # complexity is not ideal here, but probably fine.
+                if key.startswith(name):
+                    local_key = key[len(name):]
+                    if '.' not in local_key:
+                        module._streaming_state[local_key] = value
+                        del state[key]
+
+        self._apply_named_streaming(_set)
+        assert len(state) == 0, list(state.keys())
+
+    def flush(self, x: tp.Optional[torch.Tensor] = None):
+        """Flush any remaining outputs that were waiting for completion.
+        Typically, for convolutions, this will add the final padding
+        and process the last buffer.
+
+        This should take an optional argument `x`, which will be provided
+        if a module before this one in the streaming pipeline has already
+        spit out a flushed buffer.
+        """
+        if x is None:
+            return None
+        else:
+            return self(x)
+
+
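To make the contract above concrete, here is a hedged sketch of a toy streaming component. The class name `FrameCounter` and its state key are illustrative only and not part of Audiocraft; the sketch assumes the package is installed so that `audiocraft.modules.streaming` can be imported.

    # Sketch: a toy StreamingModule that remembers how many frames it has seen.
    import torch
    from audiocraft.modules.streaming import StreamingModule

    class FrameCounter(StreamingModule):        # illustrative name, not part of Audiocraft
        def forward(self, x: torch.Tensor) -> torch.Tensor:
            B, C, T = x.shape
            if self._is_streaming:
                offset = self._streaming_state.get(
                    'offset', torch.zeros(B, dtype=torch.long, device=x.device))
                self._streaming_state['offset'] = offset + T
            return x

    counter = FrameCounter()
    with counter.streaming():
        for chunk in torch.randn(1, 8, 100).split(25, dim=-1):
            counter(chunk)
        print(counter.get_streaming_state())    # {'offset': tensor([100])}
    # The state is cleared automatically when the context manager exits.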

Ancestors

+
    +
  • torch.nn.modules.module.Module
  • +
+

Subclasses

+ +

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def flush(self, x: Optional[torch.Tensor] = None) +
+
+

Flush any remaining outputs that were waiting for completion. Typically, for convolutions, this will add the final padding and process the last buffer.

+

This should take an optional argument x, which will be provided if a module before this one in the streaming pipeline has already spit out a flushed buffer.

+
+ +Expand source code + +
def flush(self, x: tp.Optional[torch.Tensor] = None):
+    """Flush any remaining outputs that were waiting for completion.
+    Typically, for convolutions, this will add the final padding
+    and process the last buffer.
+
+    This should take an optional argument `x`, which will be provided
+    if a module before this one in the streaming pipeline has already
+    spit out a flushed buffer.
+    """
+    if x is None:
+        return None
+    else:
+        return self(x)
+
+
+
+def forward(self, *input: Any) ‑> None +
+
+

Defines the computation performed at every call.

+

Should be overridden by all subclasses.

+
+

Note

+

Although the recipe for forward pass needs to be defined within this function, one should call the :class:`Module` instance afterwards instead of this since the former takes care of running the registered hooks while the latter silently ignores them.

+
+
+ +Expand source code + +
def _forward_unimplemented(self, *input: Any) -> None:
+    r"""Defines the computation performed at every call.
+
+    Should be overridden by all subclasses.
+
+    .. note::
+        Although the recipe for forward pass needs to be defined within
+        this function, one should call the :class:`Module` instance afterwards
+        instead of this since the former takes care of running the
+        registered hooks while the latter silently ignores them.
+    """
+    raise NotImplementedError(f"Module [{type(self).__name__}] is missing the required \"forward\" function")
+
+
+
+def get_streaming_state(self) ‑> Dict[str, torch.Tensor] +
+
+

Return the streaming state, including that of sub-modules.

+
+ +Expand source code + +
def get_streaming_state(self) -> State:
+    """Return the streaming state, including that of sub-modules.
+    """
+    state: State = {}
+
+    def _add(name: str, module: StreamingModule):
+        if name:
+            name += "."
+        for key, value in module._streaming_state.items():
+            state[name + key] = value
+
+    self._apply_named_streaming(_add)
+    return state
+
+
+
+def reset_streaming(self) +
+
+

Reset the streaming state.

+
+ +Expand source code + +
def reset_streaming(self):
+    """Reset the streaming state.
+    """
+    def _reset(name: str, module: StreamingModule):
+        module._streaming_state.clear()
+
+    self._apply_named_streaming(_reset)
+
+
+
+def set_streaming_state(self, state: Dict[str, torch.Tensor]) +
+
+

Set the streaming state, including that of sub-modules.

+
+ +Expand source code + +
def set_streaming_state(self, state: State):
+    """Set the streaming state, including that of sub-modules.
+    """
+    state = dict(state)
+
+    def _set(name: str, module: StreamingModule):
+        if name:
+            name += "."
+        module._streaming_state.clear()
+        for key, value in list(state.items()):
+            # complexity is not ideal here, but probably fine.
+            if key.startswith(name):
+                local_key = key[len(name):]
+                if '.' not in local_key:
+                    module._streaming_state[local_key] = value
+                    del state[key]
+
+    self._apply_named_streaming(_set)
+    assert len(state) == 0, list(state.keys())
+
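Together with `get_streaming_state()`, this allows a streaming session to be snapshotted and resumed. A sketch, reusing the hypothetical `FrameCounter` module from the example above:

    # Sketch: pause a streaming session and resume it later from the saved state.
    with counter.streaming():
        counter(torch.randn(1, 8, 40))
        saved = counter.get_streaming_state()   # e.g. {'offset': tensor([40])}

    with counter.streaming():
        counter.set_streaming_state(saved)      # resume where the previous session stopped
        counter(torch.randn(1, 8, 10))          # offset continues at 40 + 10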
+
+
+def streaming(self) +
+
+

Context manager to enter streaming mode. Reset streaming state on exit.

+
+ +Expand source code + +
@contextmanager
+def streaming(self):
+    """Context manager to enter streaming mode. Reset streaming state on exit.
+    """
+    self._set_streaming(True)
+    try:
+        yield
+    finally:
+        self._set_streaming(False)
+        self.reset_streaming()
+
+
+
+
+
+class StreamingSequential +
+
+

A streaming-compatible alternative to nn.Sequential.

+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class StreamingSequential(StreamingModule, nn.Sequential):
+    """A streaming-compatible alternative to `nn.Sequential`.
+    """
+    def flush(self, x: tp.Optional[torch.Tensor] = None):
+        for module in self:
+            if isinstance(module, StreamingModule):
+                x = module.flush(x)
+            elif x is not None:
+                x = module(x)
+        return x
+
+

Ancestors

+
    +
  • StreamingModule
  • +
  • torch.nn.modules.container.Sequential
  • +
  • torch.nn.modules.module.Module
  • +
+

Inherited members

+ +
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/modules/transformer.html b/docs/audiocraft/modules/transformer.html new file mode 100644 index 00000000..d6a79793 --- /dev/null +++ b/docs/audiocraft/modules/transformer.html @@ -0,0 +1,2012 @@ + + + + + + +audiocraft.modules.transformer API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.modules.transformer

+
+
+

Transformer model, with streaming support, xformer attention support and easy causal attention with a potentially finite receptive field.

+

See StreamingTransformer for more information.

+

Unlike regular PyTorch Transformer, we make the hard choice that batches are first.

+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Transformer model, with streaming support, xformer attention support
+and easy causal attention with a potentially finite receptive field.
+
+See `StreamingTransformer` for more information.
+
+Unlike regular PyTorch Transformer, we make the hard choice that batches are first.
+"""
+
+import typing as tp
+
+from einops import rearrange
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from torch.utils.checkpoint import checkpoint as torch_checkpoint
+from xformers import ops
+
+from .rope import RotaryEmbedding
+from .streaming import StreamingModule
+
+_efficient_attention_backend: str = 'torch'
+
+
+def set_efficient_attention_backend(backend: str = 'torch'):
+    # Using torch by default, it seems a bit faster on older P100 GPUs (~20% faster).
+    global _efficient_attention_backend
+    assert backend in ['xformers', 'torch']
+    _efficient_attention_backend = backend
+
+
+def _get_attention_time_dimension() -> int:
+    if _efficient_attention_backend == 'torch':
+        return 2
+    else:
+        return 1
+
+
+def _is_profiled() -> bool:
+    # Return true if we are currently running with a xformers profiler activated.
+    try:
+        from xformers.profiler import profiler
+    except ImportError:
+        return False
+    return profiler._Profiler._CURRENT_PROFILER is not None
+
+
+def create_norm_fn(norm_type: str, dim: int, **kwargs) -> nn.Module:
+    """Create normalization module for transformer encoder layer.
+
+    Args:
+        norm_type (str): Normalization method.
+        dim (int): Dimension of the normalized layer.
+        **kwargs (dict): Additional parameters for normalization layer.
+    Returns:
+        nn.Module: Normalization module.
+    """
+    if norm_type == 'layer_norm':
+        return nn.LayerNorm(dim, eps=1e-5, **kwargs)
+    else:
+        raise ValueError(f"Unknown norm type: {norm_type}")
+
+
+def create_sin_embedding(positions: torch.Tensor, dim: int, max_period: float = 10000,
+                         dtype: torch.dtype = torch.float32) -> torch.Tensor:
+    """Create sinusoidal positional embedding, with shape `[B, T, C]`.
+
+    Args:
+        positions (torch.Tensor): LongTensor of positions.
+        dim (int): Dimension of the embedding.
+        max_period (float): Maximum period of the cosine/sine functions.
+        dtype (torch.dtype or str): dtype to use to generate the embedding.
+    Returns:
+        torch.Tensor: Sinusoidal positional embedding.
+    """
+    # We aim for BTC format
+    assert dim % 2 == 0
+    half_dim = dim // 2
+    positions = positions.to(dtype)
+    adim = torch.arange(half_dim, device=positions.device, dtype=dtype).view(1, 1, -1)
+    max_period_tensor = torch.full([], max_period, device=positions.device, dtype=dtype)  # avoid sync point
+    phase = positions / (max_period_tensor ** (adim / (half_dim - 1)))
+    return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)
+
+
+def expand_repeated_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """torch.repeat_interleave(x, dim=2, repeats=n_rep) from xlformers"""
+    if n_rep == 1:
+        return x
+    if _efficient_attention_backend == 'torch':
+        bs, n_kv_heads, slen, head_dim = x.shape
+        return (
+            x[:, :, None, :, :]
+            .expand(bs, n_kv_heads, n_rep, slen, head_dim)
+            .reshape(bs, n_kv_heads * n_rep, slen, head_dim)
+        )
+    else:
+        bs, slen, n_kv_heads, head_dim = x.shape
+        return (
+            x[:, :, :, None, :]
+            .expand(bs, slen, n_kv_heads, n_rep, head_dim)
+            .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
+        )
+
+
+class LayerScale(nn.Module):
+    """Layer scale from [Touvron et al 2021] (https://arxiv.org/pdf/2103.17239.pdf).
+    This rescales diagonally the residual outputs close to 0, with a learnt scale.
+
+    Args:
+        channels (int): Number of channels.
+        init (float): Initial scale.
+        channel_last (bool): If True, expect `[*, C]` shaped tensors, otherwise, `[*, C, T]`.
+        device (torch.device or None): Device on which to initialize the module.
+        dtype (torch.dtype or None): dtype to use to initialize the module.
+    """
+    def __init__(self, channels: int, init: float = 1e-4, channel_last: bool = True,
+                 device=None, dtype=None):
+        super().__init__()
+        self.channel_last = channel_last
+        self.scale = nn.Parameter(
+            torch.full((channels,), init,
+                       requires_grad=True, device=device, dtype=dtype))
+
+    def forward(self, x: torch.Tensor):
+        if self.channel_last:
+            return self.scale * x
+        else:
+            return self.scale[:, None] * x
+
+
+class StreamingMultiheadAttention(StreamingModule):
+    """Similar to `nn.MultiheadAttention` but with support for streaming, causal evaluation.
+
+    Args:
+        embed_dim (int): Dimension to project to.
+        num_heads (int): Number of heads.
+        dropout (float): Dropout level.
+        bias (bool): Use bias in projections.
+        causal (bool): Causal mask applied automatically.
+        past_context (int or None): Receptive field for the causal mask, infinite if None.
+        custom (bool): Use custom MHA implementation, for testing / benchmarking.
+        memory_efficient (bool): Use xformers based memory efficient attention.
+        attention_as_float32 (bool): Perform the attention as float32
+            (especially important with memory_efficient as autocast won't do this automatically).
+        rope (`RotaryEmbedding` or None): Rope embedding to use.
+        cross_attention: Should be true when used as a cross attention.
+            All keys and values must be available at once, streaming is only for the queries.
+            Cannot be used with `causal` or `rope` (as it wouldn't make sense to
+            interpret the time steps in the keys relative to those in the queries).
+        safe_streaming (bool): Bug fix, will go away with xformers update.
+        qk_layer_norm (bool): Layer normalization applied to queries and keys before dot product.
+        kv_repeat (int): If > 1, will repeat keys and values multiple times (must divide num_heads).
+            This will lead to faster decoding time on A100 or other GPUs with tensorcore.
+        device (torch.device or None): Device on which to initialize.
+        dtype (torch.dtype or None): dtype to use.
+    """
+    def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.0, bias: bool = True,
+                 causal: bool = False, past_context: tp.Optional[int] = None, custom: bool = False,
+                 memory_efficient: bool = False, attention_as_float32: bool = False,
+                 rope: tp.Optional[RotaryEmbedding] = None, cross_attention: bool = False,
+                 safe_streaming: bool = True, qk_layer_norm: bool = False, kv_repeat: int = 1,
+                 device=None, dtype=None):
+        super().__init__()
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        if past_context is not None:
+            assert causal
+
+        self.embed_dim = embed_dim
+        self.causal = causal
+        self.past_context = past_context
+        self.memory_efficient = memory_efficient
+        self.attention_as_float32 = attention_as_float32
+        self.rope = rope
+        self.cross_attention = cross_attention
+        self.safe_streaming = safe_streaming
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.kv_repeat = kv_repeat
+        if cross_attention:
+            assert not causal, "Causal cannot work with cross attention."
+            assert rope is None, "Rope cannot work with cross attention."
+
+        if memory_efficient:
+            _verify_xformers_memory_efficient_compat()
+
+        self.custom = _is_custom(custom, memory_efficient)
+        if self.custom:
+            out_dim = embed_dim
+            assert num_heads % kv_repeat == 0
+            assert not cross_attention or kv_repeat == 1
+            num_kv = num_heads // kv_repeat
+            kv_dim = (embed_dim // num_heads) * num_kv
+            out_dim += 2 * kv_dim
+            in_proj = nn.Linear(embed_dim, out_dim, bias=bias, **factory_kwargs)
+            # We try to follow the default PyTorch MHA convention, to easily compare results.
+            self.in_proj_weight = in_proj.weight
+            self.in_proj_bias = in_proj.bias
+            if bias:
+                self.in_proj_bias.data.zero_()  # Following Pytorch convention
+            self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias, **factory_kwargs)
+            if bias:
+                self.out_proj.bias.data.zero_()
+        else:
+            assert not qk_layer_norm
+            assert kv_repeat == 1
+            self.mha = nn.MultiheadAttention(
+                embed_dim, num_heads, dropout=dropout, bias=bias, batch_first=True,
+                **factory_kwargs)
+        self.qk_layer_norm = qk_layer_norm
+        if qk_layer_norm:
+            assert self.custom
+            assert kv_repeat == 1
+            ln_dim = embed_dim
+            self.q_layer_norm = nn.LayerNorm(ln_dim)
+            self.k_layer_norm = nn.LayerNorm(ln_dim)
+
+    def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
+        if not self.custom:
+            # Support compat with regular MHA
+            keys = [n for n, _ in self.mha.named_parameters()]
+            for key in keys:
+                if prefix + key in state_dict:
+                    state_dict[prefix + "mha." + key] = state_dict.pop(prefix + key)
+        super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
+
+    def _get_mask(self, current_steps: int, device: torch.device, dtype: torch.dtype):
+        # Return a causal mask, accounting for potentially stored past keys/values
+        # We actually return a bias for the attention score, as this has the same
+        # convention both in the builtin MHA in Pytorch, and Xformers functions.
+        time_dim = _get_attention_time_dimension()
+        if self.memory_efficient:
+            from xformers.ops import LowerTriangularMask
+            if current_steps == 1:
+                # If we only have one step, then we do not need a mask.
+                return None
+            elif 'past_keys' in self._streaming_state:
+                raise RuntimeError('Not supported at the moment')
+            else:
+                # Then we can safely use a lower triangular mask
+                return LowerTriangularMask()
+        if self._streaming_state:
+            past_keys = self._streaming_state['past_keys']
+            past_steps = past_keys.shape[time_dim]
+        else:
+            past_steps = 0
+
+        queries_pos = torch.arange(
+            past_steps, current_steps + past_steps, device=device).view(-1, 1)
+        keys_pos = torch.arange(past_steps + current_steps, device=device).view(1, -1)
+        delta = queries_pos - keys_pos
+        valid = delta >= 0
+        if self.past_context is not None:
+            valid &= (delta <= self.past_context)
+        return torch.where(
+            valid,
+            torch.zeros([], device=device, dtype=dtype),
+            torch.full([], float('-inf'), device=device, dtype=dtype))
+
+    def _complete_kv(self, k, v):
+        time_dim = _get_attention_time_dimension()
+        if self.cross_attention:
+            # With cross attention we assume all keys and values
+            # are already available, and streaming is with respect
+            # to the queries only.
+            return k, v
+        # Complete the key/value pair using the streaming state.
+        if self._streaming_state:
+            pk = self._streaming_state['past_keys']
+            nk = torch.cat([pk, k], dim=time_dim)
+            if v is k:
+                nv = nk
+            else:
+                pv = self._streaming_state['past_values']
+                nv = torch.cat([pv, v], dim=time_dim)
+        else:
+            nk = k
+            nv = v
+
+        assert nk.shape[time_dim] == nv.shape[time_dim]
+        offset = 0
+        if self.past_context is not None:
+            offset = max(0, nk.shape[time_dim] - self.past_context)
+        if self._is_streaming:
+            self._streaming_state['past_keys'] = nk[:, offset:]
+            if v is not k:
+                self._streaming_state['past_values'] = nv[:, offset:]
+            if 'offset' in self._streaming_state:
+                self._streaming_state['offset'] += offset
+            else:
+                self._streaming_state['offset'] = torch.tensor(0)
+        return nk, nv
+
+    def _apply_rope(self, query: torch.Tensor, key: torch.Tensor):
+        # TODO: fix and verify layout.
+        assert _efficient_attention_backend == 'xformers', 'Rope not supported with torch attn.'
+        # Apply rope embeddings to query and key tensors.
+        assert self.rope is not None
+        if 'past_keys' in self._streaming_state:
+            past_keys_offset = self._streaming_state['past_keys'].shape[1]
+        else:
+            past_keys_offset = 0
+        if 'offset' in self._streaming_state:
+            past_context_offset = int(self._streaming_state['offset'].item())
+        else:
+            past_context_offset = 0
+        streaming_offset = past_context_offset + past_keys_offset
+        return self.rope.rotate_qk(query, key, start=streaming_offset)
+
+    def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
+                key_padding_mask=None, need_weights=False, attn_mask=None,
+                average_attn_weights=True, is_causal=False):
+        assert attn_mask is None
+        assert not is_causal, ("new param added in torch 2.0.1 not supported, "
+                               "use the causal args in the constructor.")
+
+        time_dim = _get_attention_time_dimension()
+        if time_dim == 2:
+            layout = "b h t d"
+        else:
+            layout = "b t h d"
+        dtype = query.dtype
+        if self._is_streaming:
+            assert self.causal or self.cross_attention, \
+                "Streaming only available for causal or cross attention"
+
+        if self.causal:
+            # At the moment we specialize only for the self-attention case.
+            assert query.shape[1] == key.shape[1], "Causal only for same length query / key / value"
+            assert value.shape[1] == key.shape[1], "Causal only for same length query / key / value"
+            attn_mask = self._get_mask(query.shape[1], query.device, query.dtype)
+
+        if self.custom:
+            # custom implementation
+            assert need_weights is False
+            assert key_padding_mask is None
+            if self.cross_attention:
+                # Different queries, keys, values: we have to split the weights manually
+                # before applying the linear projection.
+                dim = self.in_proj_weight.shape[0] // 3
+                if self.in_proj_bias is None:
+                    bias_q, bias_k, bias_v = None, None, None
+                else:
+                    bias_q = self.in_proj_bias[:dim]
+                    bias_k = self.in_proj_bias[dim: 2 * dim]
+                    bias_v = self.in_proj_bias[2 * dim:]
+                q = nn.functional.linear(query, self.in_proj_weight[:dim], bias_q)
+                # todo: when streaming, we could actually save k, v and check the shape actually match.
+                k = nn.functional.linear(key, self.in_proj_weight[dim: 2 * dim], bias_k)
+                v = nn.functional.linear(value, self.in_proj_weight[2 * dim:], bias_v)
+                if self.qk_layer_norm is True:
+                    q = self.q_layer_norm(q)
+                    k = self.k_layer_norm(k)
+                q, k, v = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
+            else:
+                if not _is_profiled():
+                    # profiling breaks that property somehow.
+                    assert query is key, "specialized implementation"
+                    assert value is key, "specialized implementation"
+                projected = nn.functional.linear(query, self.in_proj_weight, self.in_proj_bias)
+                if self.kv_repeat == 1:
+                    if time_dim == 2:
+                        bound_layout = "b h p t d"
+                    else:
+                        bound_layout = "b t p h d"
+                    packed = rearrange(projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
+                    q, k, v = ops.unbind(packed, dim=2)
+                else:
+                    embed_dim = self.embed_dim
+                    per_head_dim = (embed_dim // self.num_heads)
+                    kv_heads = self.num_heads // self.kv_repeat
+                    q = projected[:, :, :embed_dim]
+                    start = embed_dim
+                    end = start + per_head_dim * kv_heads
+                    k = projected[:, :, start: end]
+                    v = projected[:, :, end:]
+                    q = rearrange(q, f"b t (h d) -> {layout}", h=self.num_heads)
+                    k = rearrange(k, f"b t (h d) -> {layout}", h=kv_heads)
+                    v = rearrange(v, f"b t (h d) -> {layout}", h=kv_heads)
+
+                if self.qk_layer_norm is True:
+                    assert self.kv_repeat == 1
+                    q, k = [rearrange(x, f"{layout} -> b t (h d)") for x in [q, k]]
+                    q = self.q_layer_norm(q)
+                    k = self.k_layer_norm(k)
+                    q, k = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k]]
+                if self.rope:
+                    q, k = self._apply_rope(q, k)
+                k, v = self._complete_kv(k, v)
+                if self.kv_repeat > 1:
+                    k = expand_repeated_kv(k, self.kv_repeat)
+                    v = expand_repeated_kv(v, self.kv_repeat)
+            if self.attention_as_float32:
+                q, k, v = [x.float() for x in [q, k, v]]
+            if self.memory_efficient:
+                p = self.dropout if self.training else 0
+                if _efficient_attention_backend == 'torch':
+                    x = torch.nn.functional.scaled_dot_product_attention(
+                        q, k, v, is_causal=attn_mask is not None, dropout_p=p)
+                else:
+                    x = ops.memory_efficient_attention(q, k, v, attn_mask, p=p)
+            else:
+                # We include the dot product as float32, for consistency
+                # with the other implementations that include that step
+                # as part of the attention. Note that when using `autocast`,
+                # the einsums would be done as bfloat16, but the softmax
+                # would be done as bfloat16, so `attention_as_float32` will
+                # extend a bit the range of operations done in float32,
+                # although this should make no difference.
+                q = q / q.shape[-1] ** 0.5
+                key_layout = layout.replace('t', 'k')
+                query_layout = layout
+                if self._is_streaming and self.safe_streaming and q.device.type == 'cuda':
+                    with torch.autocast(device_type=q.device.type, dtype=torch.float32):
+                        pre_w = torch.einsum(f"{query_layout},{key_layout}-> b h t k", q, k)
+                else:
+                    pre_w = torch.einsum(f"{query_layout},{key_layout}-> b h t k", q, k)
+                if attn_mask is not None:
+                    pre_w = pre_w + attn_mask
+                w = torch.softmax(pre_w, dim=-1)
+                w = F.dropout(w, self.dropout, training=self.training).to(v)
+                # Key and value have the same format.
+                x = torch.einsum(f"b h t k, {key_layout} -> {layout}", w, v)
+            x = x.to(dtype)
+            x = rearrange(x, f"{layout} -> b t (h d)", h=self.num_heads)
+            x = self.out_proj(x)
+        else:
+            key, value = self._complete_kv(key, value)
+            if self.attention_as_float32:
+                query, key, value = [x.float() for x in [query, key, value]]
+            x, _ = self.mha(
+                query, key, value, key_padding_mask,
+                need_weights, attn_mask, average_attn_weights)
+            x = x.to(dtype)
+
+        return x, None
+
+
+class StreamingTransformerLayer(nn.TransformerEncoderLayer):
+    """TransformerLayer with Streaming / Causal support.
+    This also integrates cross_attention, when passing `cross_attention=True`,
+    rather than having two separate classes like in PyTorch.
+
+    Args:
+        d_model (int): Dimension of the data.
+        num_heads (int): Number of heads.
+        dim_feedforward (int): Intermediate dimension of FF module.
+        dropout (float): Dropout both for MHA and FF.
+        bias_ff (bool): Use bias for FF.
+        bias_attn (bool): Use bias for MHA.
+        causal (bool): Causal mask applied automatically.
+        past_context (int or None): Receptive field for the causal mask, infinite if None.
+        custom (bool): Use custom MHA implementation, for testing / benchmarking.
+        memory_efficient (bool): Use xformers based memory efficient attention.
+        attention_as_float32 (bool): Perform the attention as float32
+            (especially important with memory_efficient as autocast won't do this automatically).
+        qk_layer_norm (bool): Layer normalization applied to queries and keys before dot product in attention.
+        qk_layer_norm_cross (bool): Same for the cross attention.
+        cross_attention (bool): If True, expect to get secondary input for cross-attention.
+            Cross attention will use the default MHA, as it typically won't require
+            special treatment.
+        layer_scale (float or None): If not None, LayerScale will be used with
+            the given value as initial scale.
+        rope (`RotaryEmbedding` or None): Rope embedding to use.
+        attention_dropout (float or None): If not None, use this value for the attention dropout,
+            separate from the FFN dropout.
+        kv_repeat (int): If > 1, will repeat keys and values multiple times (must divide num_heads).
+            This will lead to faster decoding time on A100 or other GPUs with tensorcore.
+        device (torch.device or None): Device on which to initialize.
+        dtype (torch.dtype or None): dtype to use.
+        **kwargs: See `nn.TransformerEncoderLayer`.
+    """
+    def __init__(self, d_model: int, num_heads: int, dim_feedforward: int = 2048, dropout: float = 0.1,
+                 bias_ff: bool = True, bias_attn: bool = True, causal: bool = False,
+                 past_context: tp.Optional[int] = None, custom: bool = False,
+                 memory_efficient: bool = False, attention_as_float32: bool = False,
+                 qk_layer_norm: bool = False, qk_layer_norm_cross: bool = False,
+                 cross_attention: bool = False, layer_scale: tp.Optional[float] = None,
+                 rope: tp.Optional[RotaryEmbedding] = None, attention_dropout: tp.Optional[float] = None,
+                 kv_repeat: int = 1, norm: str = 'layer_norm', device=None, dtype=None, **kwargs):
+        super().__init__(d_model, num_heads, dim_feedforward, dropout,
+                         device=device, dtype=dtype, batch_first=True, **kwargs)
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        # Redefine self_attn to our streaming multi-head attention
+        attn_kwargs: tp.Dict[str, tp.Any] = {
+            'embed_dim': d_model,
+            'num_heads': num_heads,
+            'dropout': dropout if attention_dropout is None else attention_dropout,
+            'bias': bias_attn,
+            'custom': custom,
+            'memory_efficient': memory_efficient,
+            'attention_as_float32': attention_as_float32,
+        }
+        self.self_attn: StreamingMultiheadAttention = StreamingMultiheadAttention(
+            causal=causal, past_context=past_context, rope=rope, qk_layer_norm=qk_layer_norm,
+            kv_repeat=kv_repeat, **attn_kwargs, **factory_kwargs)  # type: ignore
+        # Redefine feedforward layers to expose bias parameter
+        self.linear1 = nn.Linear(d_model, dim_feedforward, bias=bias_ff, **factory_kwargs)
+        self.linear2 = nn.Linear(dim_feedforward, d_model, bias=bias_ff, **factory_kwargs)
+
+        self.layer_scale_1: nn.Module
+        self.layer_scale_2: nn.Module
+        if layer_scale is None:
+            self.layer_scale_1 = nn.Identity()
+            self.layer_scale_2 = nn.Identity()
+        else:
+            self.layer_scale_1 = LayerScale(d_model, layer_scale, **factory_kwargs)
+            self.layer_scale_2 = LayerScale(d_model, layer_scale, **factory_kwargs)
+
+        self.cross_attention: tp.Optional[nn.Module] = None
+        if cross_attention:
+            self.cross_attention = StreamingMultiheadAttention(
+                cross_attention=True, qk_layer_norm=qk_layer_norm_cross,
+                **attn_kwargs, **factory_kwargs)
+            # Norm and dropout
+            self.dropout_cross = nn.Dropout(dropout)
+            # eps value matching that used in PyTorch reference implementation.
+            self.norm_cross = nn.LayerNorm(d_model, eps=1e-5, **factory_kwargs)
+            self.layer_scale_cross: nn.Module
+            if layer_scale is None:
+                self.layer_scale_cross = nn.Identity()
+            else:
+                self.layer_scale_cross = LayerScale(d_model, layer_scale, **factory_kwargs)
+        self.norm1 = create_norm_fn(norm, d_model, **factory_kwargs)  # type: ignore
+        self.norm2 = create_norm_fn(norm, d_model, **factory_kwargs)  # type: ignore
+
+    def _cross_attention_block(self, src: torch.Tensor,
+                               cross_attention_src: torch.Tensor) -> torch.Tensor:
+        assert self.cross_attention is not None
+        # queries are from src, keys and values from cross_attention_src.
+        x = self.cross_attention(
+            src, cross_attention_src, cross_attention_src, need_weights=False)[0]
+        return self.dropout_cross(x)  # type: ignore
+
+    def forward(self, src: torch.Tensor, src_mask: tp.Optional[torch.Tensor] = None,  # type: ignore
+                src_key_padding_mask: tp.Optional[torch.Tensor] = None,
+                cross_attention_src: tp.Optional[torch.Tensor] = None):
+        if self.cross_attention is None:
+            assert cross_attention_src is None
+        else:
+            assert cross_attention_src is not None
+        x = src
+        if self.norm_first:
+            x = x + self.layer_scale_1(
+                self._sa_block(self.norm1(x), src_mask, src_key_padding_mask))
+            if cross_attention_src is not None:
+                x = x + self.layer_scale_cross(
+                    self._cross_attention_block(
+                        self.norm_cross(x), cross_attention_src))
+            x = x + self.layer_scale_2(self._ff_block(self.norm2(x)))
+        else:
+            x = self.norm1(x + self.layer_scale_1(
+                self._sa_block(x, src_mask, src_key_padding_mask)))
+            if cross_attention_src is not None:
+                x = self.norm_cross(
+                    x + self.layer_scale_cross(
+                        self._cross_attention_block(src, cross_attention_src)))
+            x = self.norm2(x + self.layer_scale_2(self._ff_block(x)))
+        return x
+
+
+class StreamingTransformer(StreamingModule):
+    """Transformer with Streaming / Causal support.
+
+    Args:
+        d_model (int): Dimension of the data.
+        num_heads (int): Number of heads.
+        dim_feedforward (int): Intermediate dimension of FF module.
+        dropout (float): Dropout both for MHA and FF.
+        bias_ff (bool): Use bias for FF.
+        bias_attn (bool): Use bias for MHA.
+        causal (bool): Causal mask applied automatically.
+        past_context (int or None): Receptive field for the causal mask, infinite if None.
+        custom (bool): Use custom MHA implementation, for testing / benchmarking.
+        memory_efficient (bool): Use xformers based memory efficient attention.
+        attention_as_float32 (bool): Perform the attention as float32
+            (especially important with memory_efficient as autocast won't do this automatically).
+        cross_attention (bool): If True, expect to get secondary input for cross-attention.
+        layer_scale (float or None): If not None, LayerScale will be used
+            with the given value as initial scale.
+        positional_embedding (str): Positional embedding strategy (sin, rope, or sin_rope).
+        max_period (float): Maximum period of the time embedding.
+        positional_scale (float): Scale of positional embedding, set to 0 to deactivate.
+        xpos (bool): Apply xpos exponential decay to positional embedding (rope only).
+        lr (float or None): Learning rate override through the `make_optim_group` API.
+        weight_decay (float or None): Weight decay override through the `make_optim_group` API.
+        layer_class (subclass of `StreamingTransformerLayer`): class to use
+            to initialize the layers, allowing further customization outside of Audiocraft.
+        checkpointing (str): Checkpointing strategy to reduce memory usage.
+            No checkpointing if set to 'none'. Per layer checkpointing using PyTorch
+            if set to 'torch' (entire layer checkpointed, i.e. linears are evaluated twice,
+            minimal memory usage, but maximal runtime). Finally, `xformers_default` provides
+            a policy for opting some operations out of the checkpointing, like
+            linear layers and attention, providing a middle ground between speed and memory.
+        device (torch.device or None): Device on which to initialize.
+        dtype (torch.dtype or None): dtype to use.
+        **kwargs: See `nn.TransformerEncoderLayer`.
+    """
+    def __init__(self, d_model: int, num_heads: int, num_layers: int, dim_feedforward: int = 2048,
+                 dropout: float = 0.1, bias_ff: bool = True, bias_attn: bool = True,
+                 causal: bool = False, past_context: tp.Optional[int] = None,
+                 custom: bool = False, memory_efficient: bool = False, attention_as_float32: bool = False,
+                 cross_attention: bool = False, layer_scale: tp.Optional[float] = None,
+                 positional_embedding: str = 'sin', max_period: float = 10_000, positional_scale: float = 1.,
+                 xpos: bool = False, lr: tp.Optional[float] = None, weight_decay: tp.Optional[float] = None,
+                 layer_class: tp.Type[StreamingTransformerLayer] = StreamingTransformerLayer,
+                 checkpointing: str = 'none', device=None, dtype=None, **kwargs):
+        super().__init__()
+        assert d_model % num_heads == 0
+
+        self.positional_embedding = positional_embedding
+        self.max_period = max_period
+        self.positional_scale = positional_scale
+        self.weight_decay = weight_decay
+        self.lr = lr
+
+        assert positional_embedding in ['sin', 'rope', 'sin_rope']
+        self.rope: tp.Optional[RotaryEmbedding] = None
+        if self.positional_embedding in ['rope', 'sin_rope']:
+            assert _is_custom(custom, memory_efficient)
+            self.rope = RotaryEmbedding(d_model // num_heads, max_period=max_period,
+                                        xpos=xpos, scale=positional_scale, device=device)
+
+        self.checkpointing = checkpointing
+
+        assert checkpointing in ['none', 'torch', 'xformers_default', 'xformers_mm']
+        if self.checkpointing.startswith('xformers'):
+            _verify_xformers_internal_compat()
+
+        self.layers = nn.ModuleList()
+        for idx in range(num_layers):
+            self.layers.append(
+                layer_class(
+                    d_model=d_model, num_heads=num_heads, dim_feedforward=dim_feedforward,
+                    dropout=dropout, bias_ff=bias_ff, bias_attn=bias_attn,
+                    causal=causal, past_context=past_context, custom=custom,
+                    memory_efficient=memory_efficient, attention_as_float32=attention_as_float32,
+                    cross_attention=cross_attention, layer_scale=layer_scale, rope=self.rope,
+                    device=device, dtype=dtype, **kwargs))
+
+        if self.checkpointing != 'none':
+            for layer in self.layers:
+                # see audiocraft/optim/fsdp.py, magic signal to indicate this requires fixing the
+                # backward hook inside of FSDP...
+                layer._magma_checkpointed = True  # type: ignore
+                assert layer.layer_drop == 0., "Need further checking"  # type: ignore
+
+    def _apply_layer(self, layer, *args, **kwargs):
+        method = self.checkpointing
+        if method == 'none':
+            return layer(*args, **kwargs)
+        elif method == 'torch':
+            return torch_checkpoint(layer, *args, use_reentrant=False, **kwargs)
+        elif method.startswith('xformers'):
+            from xformers.checkpoint_fairinternal import checkpoint, _get_default_policy
+            if method == 'xformers_default':
+                # those operations will be saved, and not recomputed.
+                # According to Francisco we can get smarter policies but this is a good start.
+                allow_list = [
+                    "xformers.efficient_attention_forward_cutlass.default",
+                    "xformers_flash.flash_fwd.default",
+                    "aten.addmm.default",
+                    "aten.mm.default",
+                ]
+            elif method == 'xformers_mm':
+                # those operations will be saved, and not recomputed.
+                # According to Francisco we can get smarter policies but this is a good start.
+                allow_list = [
+                    "aten.addmm.default",
+                    "aten.mm.default",
+                ]
+            else:
+                raise ValueError(f"xformers checkpointing xformers policy {method} is not known.")
+            policy_fn = _get_default_policy(allow_list)
+            return checkpoint(layer, *args, policy_fn=policy_fn, **kwargs)
+        else:
+            raise ValueError(f"Checkpointing method {method} is unknown.")
+
+    def forward(self, x: torch.Tensor, *args, **kwargs):
+        B, T, C = x.shape
+
+        if 'offsets' in self._streaming_state:
+            offsets = self._streaming_state['offsets']
+        else:
+            offsets = torch.zeros(B, dtype=torch.long, device=x.device)
+
+        if self.positional_embedding in ['sin', 'sin_rope']:
+            positions = torch.arange(T, device=x.device).view(1, -1, 1)
+            positions = positions + offsets.view(-1, 1, 1)
+            pos_emb = create_sin_embedding(positions, C, max_period=self.max_period, dtype=x.dtype)
+            x = x + self.positional_scale * pos_emb
+
+        for layer in self.layers:
+            x = self._apply_layer(layer, x, *args, **kwargs)
+
+        if self._is_streaming:
+            self._streaming_state['offsets'] = offsets + T
+
+        return x
+
+    def make_optim_group(self):
+        group = {"params": list(self.parameters())}
+        if self.lr is not None:
+            group["lr"] = self.lr
+        if self.weight_decay is not None:
+            group["weight_decay"] = self.weight_decay
+        return group
+
+
+# special attention related functions
+
+def _verify_xformers_memory_efficient_compat():
+    try:
+        from xformers.ops import memory_efficient_attention, LowerTriangularMask  # noqa
+    except ImportError:
+        raise ImportError(
+            "xformers is not installed. Please install it and try again.\n"
+            "To install on AWS and Azure, run \n"
+            "FORCE_CUDA=1 TORCH_CUDA_ARCH_LIST='8.0'\\\n"
+            "pip install -U git+https://git@github.com/fairinternal/xformers.git#egg=xformers\n"
+            "To install on FAIR Cluster, run \n"
+            "FORCE_CUDA=1 TORCH_CUDA_ARCH_LIST='6.0;7.0'\\\n"
+            "pip install -U git+https://git@github.com/fairinternal/xformers.git#egg=xformers\n")
+
+
+def _verify_xformers_internal_compat():
+    try:
+        from xformers.checkpoint_fairinternal import checkpoint, _get_default_policy  # noqa
+    except ImportError:
+        raise ImportError(
+            "Francisco's fairinternal xformers is not installed. Please install it and try again.\n"
+            "To install on AWS and Azure, run \n"
+            "FORCE_CUDA=1 TORCH_CUDA_ARCH_LIST='8.0'\\\n"
+            "pip install -U git+https://git@github.com/fairinternal/xformers.git#egg=xformers\n"
+            "To install on FAIR Cluster, run \n"
+            "FORCE_CUDA=1 TORCH_CUDA_ARCH_LIST='6.0;7.0'\\\n"
+            "pip install -U git+https://git@github.com/fairinternal/xformers.git#egg=xformers\n")
+
+
+def _is_custom(custom: bool, memory_efficient: bool):
+    return custom or memory_efficient
+
+
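As a rough usage sketch (not taken from the Audiocraft docs themselves), a small causal `StreamingTransformer` can be run either on a full `[B, T, C]` sequence or chunk by chunk under the streaming context. The hyperparameters below are arbitrary, and xformers must be importable since this module imports it at load time.

    # Hedged sketch: offline pass vs. chunked streaming pass with a causal transformer.
    import torch
    from audiocraft.modules.transformer import StreamingTransformer

    tfm = StreamingTransformer(d_model=64, num_heads=4, num_layers=2,
                               dim_feedforward=256, causal=True, custom=True)
    tfm.eval()                                  # disable dropout for a cleaner comparison
    x = torch.randn(2, 50, 64)                  # batches are first: [B, T, C]
    y_full = tfm(x)                             # full-sequence (offline) pass

    with tfm.streaming():                       # causal state is carried across chunks
        y_chunks = torch.cat([tfm(chunk) for chunk in x.split(10, dim=1)], dim=1)
    assert y_chunks.shape == y_full.shape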
+
+
+
+
+
+

Functions

+
+
+def create_norm_fn(norm_type: str, dim: int, **kwargs) ‑> torch.nn.modules.module.Module +
+
+

Create normalization module for transformer encoder layer.

+

Args

+
+
norm_type : str
+
Normalization method.
+
dim : int
+
Dimension of the normalized layer.
+
**kwargs : dict
+
Additional parameters for normalization layer.
+
+

Returns

+
+
nn.Module
+
Normalization module.
+
+
+ +Expand source code + +
def create_norm_fn(norm_type: str, dim: int, **kwargs) -> nn.Module:
+    """Create normalization module for transformer encoder layer.
+
+    Args:
+        norm_type (str): Normalization method.
+        dim (int): Dimension of the normalized layer.
+        **kwargs (dict): Additional parameters for normalization layer.
+    Returns:
+        nn.Module: Normalization module.
+    """
+    if norm_type == 'layer_norm':
+        return nn.LayerNorm(dim, eps=1e-5, **kwargs)
+    else:
+        raise ValueError(f"Unknown norm type: {norm_type}")
+
+
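+
+Example (an illustrative sketch, not part of the generated source; assumes the module is importable as audiocraft.modules.transformer):
+
+# Hypothetical usage of create_norm_fn; 'layer_norm' is the only accepted norm type here.
+from audiocraft.modules.transformer import create_norm_fn
+
+norm = create_norm_fn('layer_norm', 512)   # equivalent to nn.LayerNorm(512, eps=1e-5)
+# Any other norm_type (e.g. 'rms_norm') raises ValueError.
+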
+
+def create_sin_embedding(positions: torch.Tensor, dim: int, max_period: float = 10000, dtype: torch.dtype = torch.float32) ‑> torch.Tensor +
+
+

Create sinusoidal positional embedding, with shape [B, T, C].

+

Args

+
+
positions : torch.Tensor
+
LongTensor of positions.
+
dim : int
+
Dimension of the embedding.
+
max_period : float
+
Maximum period of the cosine/sine functions.
+
dtype : torch.dtype or str
+
dtype to use to generate the embedding.
+
+

Returns

+
+
torch.Tensor
+
Sinusoidal positional embedding.
+
+
+ +Expand source code + +
def create_sin_embedding(positions: torch.Tensor, dim: int, max_period: float = 10000,
+                         dtype: torch.dtype = torch.float32) -> torch.Tensor:
+    """Create sinusoidal positional embedding, with shape `[B, T, C]`.
+
+    Args:
+        positions (torch.Tensor): LongTensor of positions.
+        dim (int): Dimension of the embedding.
+        max_period (float): Maximum period of the cosine/sine functions.
+        dtype (torch.dtype or str): dtype to use to generate the embedding.
+    Returns:
+        torch.Tensor: Sinusoidal positional embedding.
+    """
+    # We aim for BTC format
+    assert dim % 2 == 0
+    half_dim = dim // 2
+    positions = positions.to(dtype)
+    adim = torch.arange(half_dim, device=positions.device, dtype=dtype).view(1, 1, -1)
+    max_period_tensor = torch.full([], max_period, device=positions.device, dtype=dtype)  # avoid sync point
+    phase = positions / (max_period_tensor ** (adim / (half_dim - 1)))
+    return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)
+
+
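+
+Example (an illustrative sketch; positions are shaped [B, T, 1], as built in StreamingTransformer.forward):
+
+# Hypothetical usage of create_sin_embedding.
+import torch
+from audiocraft.modules.transformer import create_sin_embedding
+
+B, T, C = 2, 16, 64                      # C must be even
+positions = torch.arange(T).view(1, -1, 1).expand(B, -1, -1)   # [B, T, 1]
+pos_emb = create_sin_embedding(positions, C, max_period=10_000)
+assert pos_emb.shape == (B, T, C)        # cos on the first C // 2 dims, sin on the rest
+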
+
+def expand_repeated_kv(x: torch.Tensor, n_rep: int) ‑> torch.Tensor +
+
+

torch.repeat_interleave(x, dim=2, repeats=n_rep) from xlformers

+
+ +Expand source code + +
def expand_repeated_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """torch.repeat_interleave(x, dim=2, repeats=n_rep) from xlformers"""
+    if n_rep == 1:
+        return x
+    if _efficient_attention_backend == 'torch':
+        bs, n_kv_heads, slen, head_dim = x.shape
+        return (
+            x[:, :, None, :, :]
+            .expand(bs, n_kv_heads, n_rep, slen, head_dim)
+            .reshape(bs, n_kv_heads * n_rep, slen, head_dim)
+        )
+    else:
+        bs, slen, n_kv_heads, head_dim = x.shape
+        return (
+            x[:, :, :, None, :]
+            .expand(bs, slen, n_kv_heads, n_rep, head_dim)
+            .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
+        )
+
+
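+
+Example (an illustrative sketch, assuming the default 'torch' attention backend and its [B, H_kv, T, D] layout):
+
+# Hypothetical usage of expand_repeated_kv.
+import torch
+from audiocraft.modules.transformer import expand_repeated_kv
+
+k = torch.randn(2, 4, 10, 32)            # 4 key/value heads
+k_full = expand_repeated_kv(k, n_rep=2)  # each kv head now serves 2 query heads
+assert k_full.shape == (2, 8, 10, 32)
+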
+
+def set_efficient_attention_backend(backend: str = 'torch') +
+
+
+
+ +Expand source code + +
def set_efficient_attention_backend(backend: str = 'torch'):
+    # Using torch by default, it seems a bit faster on older P100 GPUs (~20% faster).
+    global _efficient_attention_backend
+    assert backend in ['xformers', 'torch']
+    _efficient_attention_backend = backend
+
+
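+
+Example (an illustrative sketch; the setting is a module-level global, so it affects attention modules created afterwards):
+
+# Hypothetical usage of set_efficient_attention_backend.
+from audiocraft.modules import transformer
+
+transformer.set_efficient_attention_backend('xformers')   # or 'torch', the default
+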
+
+
+
+

Classes

+
+
+class LayerScale +(channels: int, init: float = 0.0001, channel_last: bool = True, device=None, dtype=None) +
+
+

Layer scale from [Touvron et al 2021] (https://arxiv.org/pdf/2103.17239.pdf). +This diagonally rescales the residual outputs close to 0, with a learnt scale.

+

Args

+
+
channels : int
+
Number of channels.
+
init : float
+
Initial scale.
+
channel_last : bool
+
If True, expect [*, C] shaped tensors, otherwise, [*, C, T].
+
device : torch.device or None
+
Device on which to initialize the module.
+
dtype : torch.dtype or None
+
dtype to use to initialize the module.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class LayerScale(nn.Module):
+    """Layer scale from [Touvron et al 2021] (https://arxiv.org/pdf/2103.17239.pdf).
+    This diagonally rescales the residual outputs close to 0, with a learnt scale.
+
+    Args:
+        channels (int): Number of channels.
+        init (float): Initial scale.
+        channel_last (bool): If True, expect `[*, C]` shaped tensors, otherwise, `[*, C, T]`.
+        device (torch.device or None): Device on which to initialize the module.
+        dtype (torch.dtype or None): dtype to use to initialize the module.
+    """
+    def __init__(self, channels: int, init: float = 1e-4, channel_last: bool = True,
+                 device=None, dtype=None):
+        super().__init__()
+        self.channel_last = channel_last
+        self.scale = nn.Parameter(
+            torch.full((channels,), init,
+                       requires_grad=True, device=device, dtype=dtype))
+
+    def forward(self, x: torch.Tensor):
+        if self.channel_last:
+            return self.scale * x
+        else:
+            return self.scale[:, None] * x
+
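+
+Example (an illustrative sketch of LayerScale applied to a residual branch):
+
+# Hypothetical usage of LayerScale.
+import torch
+from audiocraft.modules.transformer import LayerScale
+
+scale = LayerScale(channels=512, init=1e-4)   # channel_last=True expects [*, C]
+x = torch.randn(2, 16, 512)
+residual_branch = torch.randn(2, 16, 512)
+out = x + scale(residual_branch)              # residual contribution starts close to 0
+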
+

Ancestors

+
    +
  • torch.nn.modules.module.Module
  • +
+

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def forward(self, x: torch.Tensor) ‑> Callable[..., Any] +
+
+

Defines the computation performed at every call.

+

Should be overridden by all subclasses.

+
+

Note

+

Although the recipe for forward pass needs to be defined within +this function, one should call the :class:Module instance afterwards +instead of this since the former takes care of running the +registered hooks while the latter silently ignores them.

+
+
+ +Expand source code + +
def forward(self, x: torch.Tensor):
+    if self.channel_last:
+        return self.scale * x
+    else:
+        return self.scale[:, None] * x
+
+
+
+
+
+class StreamingMultiheadAttention +(embed_dim: int, num_heads: int, dropout: float = 0.0, bias: bool = True, causal: bool = False, past_context: Optional[int] = None, custom: bool = False, memory_efficient: bool = False, attention_as_float32: bool = False, rope: Optional[RotaryEmbedding] = None, cross_attention: bool = False, safe_streaming: bool = True, qk_layer_norm: bool = False, kv_repeat: int = 1, device=None, dtype=None) +
+
+

Similar to nn.MultiheadAttention but with support for streaming, causal evaluation.

+

Args

+
+
embed_dim : int
+
Dimension to project to.
+
num_heads : int
+
Number of heads.
+
dropout : float
+
Dropout level.
+
bias : bool
+
Use bias in projections.
+
causal : bool
+
Causal mask applied automatically.
+
past_context : int or None
+
Receptive field for the causal mask, infinite if None.
+
custom : bool
+
Use custom MHA implementation, for testing / benchmarking.
+
memory_efficient : bool
+
Use xformers based memory efficient attention.
+
attention_as_float32 : bool
+
Perform the attention as float32 +(especially important with memory_efficient as autocast won't do this automatically).
+
rope (RotaryEmbedding or None): Rope embedding to use.
+
cross_attention
+
Should be true when used as a cross attention. +All keys and values must be available at once; streaming is only for the queries. +Cannot be used with causal or rope (as it wouldn't make sense to +interpret the time steps in the keys relative to those in the queries).
+
safe_streaming : bool
+
Bug fix, will go away with xformers update.
+
qk_layer_norm : bool
+
Layer normalization applied to queries and keys before dot product.
+
kv_repeat : int
+
If > 1, will repeat keys and values multiple times (must divide num_heads). +This will lead to faster decoding time on A100 or other GPUs with tensor cores.
+
device : torch.device or None
+
Device on which to initialize.
+
dtype : torch.dtype or None
+
dtype to use.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class StreamingMultiheadAttention(StreamingModule):
+    """Similar to `nn.MultiheadAttention` but with support for streaming, causal evaluation.
+
+    Args:
+        embed_dim (int): Dimension to project to.
+        num_heads (int): Number of heads.
+        dropout (float): Dropout level.
+        bias (bool): Use bias in projections.
+        causal (bool): Causal mask applied automatically.
+        past_context (int or None): Receptive field for the causal mask, infinite if None.
+        custom (bool): Use custom MHA implementation, for testing / benchmarking.
+        memory_efficient (bool): Use xformers based memory efficient attention.
+        attention_as_float32 (bool): Perform the attention as float32
+            (especially important with memory_efficient as autocast won't do this automatically).
+        rope (`RotaryEmbedding` or None): Rope embedding to use.
+        cross_attention: Should be true when used as a cross attention.
+            All keys and values must be available at once, streaming is only for the queries.
+            Cannot be used with `causal` or `rope` (as it wouldn't make sense to
+            interpret the time steps in the keys relative to those in the queries).
+        safe_streaming (bool): Bug fix, will go away with xformers update.
+        qk_layer_norm (bool): Layer normalization applied to queries and keys before dot product.
+        kv_repeat (int): If > 1, will repeat keys and values multiple times (must divide num_heads).
+            This will lead to faster decoding time on A100 or other GPUs with tensor cores.
+        device (torch.device or None): Device on which to initialize.
+        dtype (torch.dtype or None): dtype to use.
+    """
+    def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.0, bias: bool = True,
+                 causal: bool = False, past_context: tp.Optional[int] = None, custom: bool = False,
+                 memory_efficient: bool = False, attention_as_float32: bool = False,
+                 rope: tp.Optional[RotaryEmbedding] = None, cross_attention: bool = False,
+                 safe_streaming: bool = True, qk_layer_norm: bool = False, kv_repeat: int = 1,
+                 device=None, dtype=None):
+        super().__init__()
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        if past_context is not None:
+            assert causal
+
+        self.embed_dim = embed_dim
+        self.causal = causal
+        self.past_context = past_context
+        self.memory_efficient = memory_efficient
+        self.attention_as_float32 = attention_as_float32
+        self.rope = rope
+        self.cross_attention = cross_attention
+        self.safe_streaming = safe_streaming
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.kv_repeat = kv_repeat
+        if cross_attention:
+            assert not causal, "Causal cannot work with cross attention."
+            assert rope is None, "Rope cannot work with cross attention."
+
+        if memory_efficient:
+            _verify_xformers_memory_efficient_compat()
+
+        self.custom = _is_custom(custom, memory_efficient)
+        if self.custom:
+            out_dim = embed_dim
+            assert num_heads % kv_repeat == 0
+            assert not cross_attention or kv_repeat == 1
+            num_kv = num_heads // kv_repeat
+            kv_dim = (embed_dim // num_heads) * num_kv
+            out_dim += 2 * kv_dim
+            in_proj = nn.Linear(embed_dim, out_dim, bias=bias, **factory_kwargs)
+            # We try to follow the default PyTorch MHA convention, to easily compare results.
+            self.in_proj_weight = in_proj.weight
+            self.in_proj_bias = in_proj.bias
+            if bias:
+                self.in_proj_bias.data.zero_()  # Following Pytorch convention
+            self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias, **factory_kwargs)
+            if bias:
+                self.out_proj.bias.data.zero_()
+        else:
+            assert not qk_layer_norm
+            assert kv_repeat == 1
+            self.mha = nn.MultiheadAttention(
+                embed_dim, num_heads, dropout=dropout, bias=bias, batch_first=True,
+                **factory_kwargs)
+        self.qk_layer_norm = qk_layer_norm
+        if qk_layer_norm:
+            assert self.custom
+            assert kv_repeat == 1
+            ln_dim = embed_dim
+            self.q_layer_norm = nn.LayerNorm(ln_dim)
+            self.k_layer_norm = nn.LayerNorm(ln_dim)
+
+    def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
+        if not self.custom:
+            # Support compat with regular MHA
+            keys = [n for n, _ in self.mha.named_parameters()]
+            for key in keys:
+                if prefix + key in state_dict:
+                    state_dict[prefix + "mha." + key] = state_dict.pop(prefix + key)
+        super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
+
+    def _get_mask(self, current_steps: int, device: torch.device, dtype: torch.dtype):
+        # Return a causal mask, accounting for potentially stored past keys/values
+        # We actually return a bias for the attention score, as this has the same
+        # convention both in the builtin MHA in Pytorch, and Xformers functions.
+        time_dim = _get_attention_time_dimension()
+        if self.memory_efficient:
+            from xformers.ops import LowerTriangularMask
+            if current_steps == 1:
+                # If we only have one step, then we do not need a mask.
+                return None
+            elif 'past_keys' in self._streaming_state:
+                raise RuntimeError('Not supported at the moment')
+            else:
+                # Then we can safely use a lower triangular mask
+                return LowerTriangularMask()
+        if self._streaming_state:
+            past_keys = self._streaming_state['past_keys']
+            past_steps = past_keys.shape[time_dim]
+        else:
+            past_steps = 0
+
+        queries_pos = torch.arange(
+            past_steps, current_steps + past_steps, device=device).view(-1, 1)
+        keys_pos = torch.arange(past_steps + current_steps, device=device).view(1, -1)
+        delta = queries_pos - keys_pos
+        valid = delta >= 0
+        if self.past_context is not None:
+            valid &= (delta <= self.past_context)
+        return torch.where(
+            valid,
+            torch.zeros([], device=device, dtype=dtype),
+            torch.full([], float('-inf'), device=device, dtype=dtype))
+
+    def _complete_kv(self, k, v):
+        time_dim = _get_attention_time_dimension()
+        if self.cross_attention:
+            # With cross attention we assume all keys and values
+            # are already available, and streaming is with respect
+            # to the queries only.
+            return k, v
+        # Complete the key/value pair using the streaming state.
+        if self._streaming_state:
+            pk = self._streaming_state['past_keys']
+            nk = torch.cat([pk, k], dim=time_dim)
+            if v is k:
+                nv = nk
+            else:
+                pv = self._streaming_state['past_values']
+                nv = torch.cat([pv, v], dim=time_dim)
+        else:
+            nk = k
+            nv = v
+
+        assert nk.shape[time_dim] == nv.shape[time_dim]
+        offset = 0
+        if self.past_context is not None:
+            offset = max(0, nk.shape[time_dim] - self.past_context)
+        if self._is_streaming:
+            self._streaming_state['past_keys'] = nk[:, offset:]
+            if v is not k:
+                self._streaming_state['past_values'] = nv[:, offset:]
+            if 'offset' in self._streaming_state:
+                self._streaming_state['offset'] += offset
+            else:
+                self._streaming_state['offset'] = torch.tensor(0)
+        return nk, nv
+
+    def _apply_rope(self, query: torch.Tensor, key: torch.Tensor):
+        # TODO: fix and verify layout.
+        assert _efficient_attention_backend == 'xformers', 'Rope not supported with torch attn.'
+        # Apply rope embeddings to query and key tensors.
+        assert self.rope is not None
+        if 'past_keys' in self._streaming_state:
+            past_keys_offset = self._streaming_state['past_keys'].shape[1]
+        else:
+            past_keys_offset = 0
+        if 'offset' in self._streaming_state:
+            past_context_offset = int(self._streaming_state['offset'].item())
+        else:
+            past_context_offset = 0
+        streaming_offset = past_context_offset + past_keys_offset
+        return self.rope.rotate_qk(query, key, start=streaming_offset)
+
+    def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
+                key_padding_mask=None, need_weights=False, attn_mask=None,
+                average_attn_weights=True, is_causal=False):
+        assert attn_mask is None
+        assert not is_causal, ("new param added in torch 2.0.1 not supported, "
+                               "use the causal args in the constructor.")
+
+        time_dim = _get_attention_time_dimension()
+        if time_dim == 2:
+            layout = "b h t d"
+        else:
+            layout = "b t h d"
+        dtype = query.dtype
+        if self._is_streaming:
+            assert self.causal or self.cross_attention, \
+                "Streaming only available for causal or cross attention"
+
+        if self.causal:
+            # At the moment we specialize only for the self-attention case.
+            assert query.shape[1] == key.shape[1], "Causal only for same length query / key / value"
+            assert value.shape[1] == key.shape[1], "Causal only for same length query / key / value"
+            attn_mask = self._get_mask(query.shape[1], query.device, query.dtype)
+
+        if self.custom:
+            # custom implementation
+            assert need_weights is False
+            assert key_padding_mask is None
+            if self.cross_attention:
+                # Different queries, keys, values; we have to split the weights manually
+                # before applying the linear.
+                dim = self.in_proj_weight.shape[0] // 3
+                if self.in_proj_bias is None:
+                    bias_q, bias_k, bias_v = None, None, None
+                else:
+                    bias_q = self.in_proj_bias[:dim]
+                    bias_k = self.in_proj_bias[dim: 2 * dim]
+                    bias_v = self.in_proj_bias[2 * dim:]
+                q = nn.functional.linear(query, self.in_proj_weight[:dim], bias_q)
+                # TODO: when streaming, we could actually save k, v and check that the shapes match.
+                k = nn.functional.linear(key, self.in_proj_weight[dim: 2 * dim], bias_k)
+                v = nn.functional.linear(value, self.in_proj_weight[2 * dim:], bias_v)
+                if self.qk_layer_norm is True:
+                    q = self.q_layer_norm(q)
+                    k = self.k_layer_norm(k)
+                q, k, v = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
+            else:
+                if not _is_profiled():
+                    # profiling breaks that property somehow.
+                    assert query is key, "specialized implementation"
+                    assert value is key, "specialized implementation"
+                projected = nn.functional.linear(query, self.in_proj_weight, self.in_proj_bias)
+                if self.kv_repeat == 1:
+                    if time_dim == 2:
+                        bound_layout = "b h p t d"
+                    else:
+                        bound_layout = "b t p h d"
+                    packed = rearrange(projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
+                    q, k, v = ops.unbind(packed, dim=2)
+                else:
+                    embed_dim = self.embed_dim
+                    per_head_dim = (embed_dim // self.num_heads)
+                    kv_heads = self.num_heads // self.kv_repeat
+                    q = projected[:, :, :embed_dim]
+                    start = embed_dim
+                    end = start + per_head_dim * kv_heads
+                    k = projected[:, :, start: end]
+                    v = projected[:, :, end:]
+                    q = rearrange(q, f"b t (h d) -> {layout}", h=self.num_heads)
+                    k = rearrange(k, f"b t (h d) -> {layout}", h=kv_heads)
+                    v = rearrange(v, f"b t (h d) -> {layout}", h=kv_heads)
+
+                if self.qk_layer_norm is True:
+                    assert self.kv_repeat == 1
+                    q, k = [rearrange(x, f"{layout} -> b t (h d)") for x in [q, k]]
+                    q = self.q_layer_norm(q)
+                    k = self.k_layer_norm(k)
+                    q, k = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k]]
+                if self.rope:
+                    q, k = self._apply_rope(q, k)
+                k, v = self._complete_kv(k, v)
+                if self.kv_repeat > 1:
+                    k = expand_repeated_kv(k, self.kv_repeat)
+                    v = expand_repeated_kv(v, self.kv_repeat)
+            if self.attention_as_float32:
+                q, k, v = [x.float() for x in [q, k, v]]
+            if self.memory_efficient:
+                p = self.dropout if self.training else 0
+                if _efficient_attention_backend == 'torch':
+                    x = torch.nn.functional.scaled_dot_product_attention(
+                        q, k, v, is_causal=attn_mask is not None, dropout_p=p)
+                else:
+                    x = ops.memory_efficient_attention(q, k, v, attn_mask, p=p)
+            else:
+                # We include the dot product as float32, for consistency
+                # with the other implementations that include that step
+                # as part of the attention. Note that when using `autocast`,
+                # the einsums would be done as bfloat16, but the softmax
+                # would be done as float32, so `attention_as_float32` will
+                # extend a bit the range of operations done in float32,
+                # although this should make no difference.
+                q = q / q.shape[-1] ** 0.5
+                key_layout = layout.replace('t', 'k')
+                query_layout = layout
+                if self._is_streaming and self.safe_streaming and q.device.type == 'cuda':
+                    with torch.autocast(device_type=q.device.type, dtype=torch.float32):
+                        pre_w = torch.einsum(f"{query_layout},{key_layout}-> b h t k", q, k)
+                else:
+                    pre_w = torch.einsum(f"{query_layout},{key_layout}-> b h t k", q, k)
+                if attn_mask is not None:
+                    pre_w = pre_w + attn_mask
+                w = torch.softmax(pre_w, dim=-1)
+                w = F.dropout(w, self.dropout, training=self.training).to(v)
+                # Key and value have the same format.
+                x = torch.einsum(f"b h t k, {key_layout} -> {layout}", w, v)
+            x = x.to(dtype)
+            x = rearrange(x, f"{layout} -> b t (h d)", h=self.num_heads)
+            x = self.out_proj(x)
+        else:
+            key, value = self._complete_kv(key, value)
+            if self.attention_as_float32:
+                query, key, value = [x.float() for x in [query, key, value]]
+            x, _ = self.mha(
+                query, key, value, key_padding_mask,
+                need_weights, attn_mask, average_attn_weights)
+            x = x.to(dtype)
+
+        return x, None
+
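+
+Example (an illustrative sketch using the default non-custom path, which wraps nn.MultiheadAttention; the custom / memory-efficient paths require xformers):
+
+# Hypothetical usage of StreamingMultiheadAttention for causal self-attention.
+import torch
+from audiocraft.modules.transformer import StreamingMultiheadAttention
+
+mha = StreamingMultiheadAttention(embed_dim=256, num_heads=4, causal=True)
+x = torch.randn(2, 20, 256)     # [B, T, C], batch first
+out, _ = mha(x, x, x)           # the second returned value is always None
+assert out.shape == x.shape
+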
+

Ancestors

+ +

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Inherited members

+ +
+
+class StreamingTransformer +(d_model: int, num_heads: int, num_layers: int, dim_feedforward: int = 2048, dropout: float = 0.1, bias_ff: bool = True, bias_attn: bool = True, causal: bool = False, past_context: Optional[int] = None, custom: bool = False, memory_efficient: bool = False, attention_as_float32: bool = False, cross_attention: bool = False, layer_scale: Optional[float] = None, positional_embedding: str = 'sin', max_period: float = 10000, positional_scale: float = 1.0, xpos: bool = False, lr: Optional[float] = None, weight_decay: Optional[float] = None, layer_class: Type[StreamingTransformerLayer] = audiocraft.modules.transformer.StreamingTransformerLayer, checkpointing: str = 'none', device=None, dtype=None, **kwargs) +
+
+

Transformer with Streaming / Causal support.

+

Args

+
+
d_model : int
+
Dimension of the data.
+
num_heads : int
+
Number of heads.
+
dim_feedforward : int
+
Intermediate dimension of FF module.
+
dropout : float
+
Dropout both for MHA and FF.
+
bias_ff : bool
+
Use bias for FF.
+
bias_attn : bool
+
Use bias for MHA.
+
causal : bool
+
Causal mask applied automatically.
+
past_context : int or None
+
Receptive field for the causal mask, infinite if None.
+
custom : bool
+
Use custom MHA implementation, for testing / benchmarking.
+
memory_efficient : bool
+
Use xformers based memory efficient attention.
+
attention_as_float32 : bool
+
Perform the attention as float32 +(especially important with memory_efficient as autocast won't do this automatically).
+
cross_attention : bool
+
If True, expect to get secondary input for cross-attention.
+
layer_scale : float or None
+
If not None, LayerScale will be used +with the given value as initial scale.
+
positional_embedding : str
+
Positional embedding strategy (sin, rope, or sin_rope).
+
max_period : float
+
Maximum period of the time embedding.
+
positional_scale : float
+
Scale of positional embedding, set to 0 to deactivate.
+
xpos : bool
+
Apply xpos exponential decay to positional embedding (rope only).
+
lr : float or None
+
Learning rate override through the make_optim_group API.
+
weight_decay : float or None
+
Weight decay override through the make_optim_group API.
+
layer_class
+
(subclass of StreamingTransformerLayer): Class to use +to initialize the layers, allowing further customization outside of Audiocraft.
+
checkpointing : str
+
Checkpointing strategy to reduce memory usage. +No checkpointing if set to 'none'. Per layer checkpointing using PyTorch +if set to 'torch' (entire layer checkpointed, i.e. linears are evaluated twice, +minimal memory usage, but maximal runtime). Finally, xformers_default provides +a policy for opting some operations out of checkpointing, such as +linear layers and attention, providing a middle ground between speed and memory.
+
device : torch.device or None
+
Device on which to initialize.
+
dtype : torch.dtype or None
+
dtype to use.
+
**kwargs
+
See nn.TransformerEncoderLayer.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class StreamingTransformer(StreamingModule):
+    """Transformer with Streaming / Causal support.
+
+    Args:
+        d_model (int): Dimension of the data.
+        num_heads (int): Number of heads.
+        dim_feedforward (int): Intermediate dimension of FF module.
+        dropout (float): Dropout both for MHA and FF.
+        bias_ff (bool): Use bias for FF.
+        bias_attn (bool): Use bias for MHA.
+        causal (bool): Causal mask applied automatically.
+        past_context (int or None): Receptive field for the causal mask, infinite if None.
+        custom (bool): Use custom MHA implementation, for testing / benchmarking.
+        memory_efficient (bool): Use xformers based memory efficient attention.
+        attention_as_float32 (bool): Perform the attention as float32
+            (especially important with memory_efficient as autocast won't do this automatically).
+        cross_attention (bool): If True, expect to get secondary input for cross-attention.
+        layer_scale (float or None): If not None, LayerScale will be used
+            with the given value as initial scale.
+        positional_embedding (str): Positional embedding strategy (sin, rope, or sin_rope).
+        max_period (float): Maximum period of the time embedding.
+        positional_scale (float): Scale of positional embedding, set to 0 to deactivate.
+        xpos (bool): Apply xpos exponential decay to positional embedding (rope only).
+        lr (float or None): Learning rate override through the `make_optim_group` API.
+        weight_decay (float or None): Weight decay override through the `make_optim_group` API.
+        layer_class (subclass of `StreamingTransformerLayer`): Class to use
+            to initialize the layers, allowing further customization outside of Audiocraft.
+        checkpointing (str): Checkpointing strategy to reduce memory usage.
+            No checkpointing if set to 'none'. Per layer checkpointing using PyTorch
+            if set to 'torch' (entire layer checkpointed, i.e. linears are evaluated twice,
+            minimal memory usage, but maximal runtime). Finally, `xformers_default` provides
+            a policy for opting some operations out of checkpointing, such as
+            linear layers and attention, providing a middle ground between speed and memory.
+        device (torch.device or None): Device on which to initialize.
+        dtype (torch.dtype or None): dtype to use.
+        **kwargs: See `nn.TransformerEncoderLayer`.
+    """
+    def __init__(self, d_model: int, num_heads: int, num_layers: int, dim_feedforward: int = 2048,
+                 dropout: float = 0.1, bias_ff: bool = True, bias_attn: bool = True,
+                 causal: bool = False, past_context: tp.Optional[int] = None,
+                 custom: bool = False, memory_efficient: bool = False, attention_as_float32: bool = False,
+                 cross_attention: bool = False, layer_scale: tp.Optional[float] = None,
+                 positional_embedding: str = 'sin', max_period: float = 10_000, positional_scale: float = 1.,
+                 xpos: bool = False, lr: tp.Optional[float] = None, weight_decay: tp.Optional[float] = None,
+                 layer_class: tp.Type[StreamingTransformerLayer] = StreamingTransformerLayer,
+                 checkpointing: str = 'none', device=None, dtype=None, **kwargs):
+        super().__init__()
+        assert d_model % num_heads == 0
+
+        self.positional_embedding = positional_embedding
+        self.max_period = max_period
+        self.positional_scale = positional_scale
+        self.weight_decay = weight_decay
+        self.lr = lr
+
+        assert positional_embedding in ['sin', 'rope', 'sin_rope']
+        self.rope: tp.Optional[RotaryEmbedding] = None
+        if self.positional_embedding in ['rope', 'sin_rope']:
+            assert _is_custom(custom, memory_efficient)
+            self.rope = RotaryEmbedding(d_model // num_heads, max_period=max_period,
+                                        xpos=xpos, scale=positional_scale, device=device)
+
+        self.checkpointing = checkpointing
+
+        assert checkpointing in ['none', 'torch', 'xformers_default', 'xformers_mm']
+        if self.checkpointing.startswith('xformers'):
+            _verify_xformers_internal_compat()
+
+        self.layers = nn.ModuleList()
+        for idx in range(num_layers):
+            self.layers.append(
+                layer_class(
+                    d_model=d_model, num_heads=num_heads, dim_feedforward=dim_feedforward,
+                    dropout=dropout, bias_ff=bias_ff, bias_attn=bias_attn,
+                    causal=causal, past_context=past_context, custom=custom,
+                    memory_efficient=memory_efficient, attention_as_float32=attention_as_float32,
+                    cross_attention=cross_attention, layer_scale=layer_scale, rope=self.rope,
+                    device=device, dtype=dtype, **kwargs))
+
+        if self.checkpointing != 'none':
+            for layer in self.layers:
+                # see audiocraft/optim/fsdp.py, magic signal to indicate this requires fixing the
+                # backward hook inside of FSDP...
+                layer._magma_checkpointed = True  # type: ignore
+                assert layer.layer_drop == 0., "Need further checking"  # type: ignore
+
+    def _apply_layer(self, layer, *args, **kwargs):
+        method = self.checkpointing
+        if method == 'none':
+            return layer(*args, **kwargs)
+        elif method == 'torch':
+            return torch_checkpoint(layer, *args, use_reentrant=False, **kwargs)
+        elif method.startswith('xformers'):
+            from xformers.checkpoint_fairinternal import checkpoint, _get_default_policy
+            if method == 'xformers_default':
+                # those operations will be saved, and not recomputed.
+                # According to Francisco we can get smarter policies but this is a good start.
+                allow_list = [
+                    "xformers.efficient_attention_forward_cutlass.default",
+                    "xformers_flash.flash_fwd.default",
+                    "aten.addmm.default",
+                    "aten.mm.default",
+                ]
+            elif method == 'xformers_mm':
+                # those operations will be saved, and not recomputed.
+                # According to Francisco we can get smarter policies but this is a good start.
+                allow_list = [
+                    "aten.addmm.default",
+                    "aten.mm.default",
+                ]
+            else:
+                raise ValueError(f"xformers checkpointing xformers policy {method} is not known.")
+            policy_fn = _get_default_policy(allow_list)
+            return checkpoint(layer, *args, policy_fn=policy_fn, **kwargs)
+        else:
+            raise ValueError(f"Checkpointing method {method} is unknown.")
+
+    def forward(self, x: torch.Tensor, *args, **kwargs):
+        B, T, C = x.shape
+
+        if 'offsets' in self._streaming_state:
+            offsets = self._streaming_state['offsets']
+        else:
+            offsets = torch.zeros(B, dtype=torch.long, device=x.device)
+
+        if self.positional_embedding in ['sin', 'sin_rope']:
+            positions = torch.arange(T, device=x.device).view(1, -1, 1)
+            positions = positions + offsets.view(-1, 1, 1)
+            pos_emb = create_sin_embedding(positions, C, max_period=self.max_period, dtype=x.dtype)
+            x = x + self.positional_scale * pos_emb
+
+        for layer in self.layers:
+            x = self._apply_layer(layer, x, *args, **kwargs)
+
+        if self._is_streaming:
+            self._streaming_state['offsets'] = offsets + T
+
+        return x
+
+    def make_optim_group(self):
+        group = {"params": list(self.parameters())}
+        if self.lr is not None:
+            group["lr"] = self.lr
+        if self.weight_decay is not None:
+            group["weight_decay"] = self.weight_decay
+        return group
+
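+
+Example (an illustrative sketch of a small causal model with the default 'sin' positional embedding; no xformers required with the default settings):
+
+# Hypothetical usage of StreamingTransformer.
+import torch
+from audiocraft.modules.transformer import StreamingTransformer
+
+model = StreamingTransformer(d_model=256, num_heads=4, num_layers=2, causal=True)
+x = torch.randn(2, 50, 256)              # [B, T, C]
+y = model(x)
+assert y.shape == x.shape
+optim_group = model.make_optim_group()   # honours the lr / weight_decay overrides, if set
+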
+

Ancestors

+ +

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def make_optim_group(self) +
+
+
+
+ +Expand source code + +
def make_optim_group(self):
+    group = {"params": list(self.parameters())}
+    if self.lr is not None:
+        group["lr"] = self.lr
+    if self.weight_decay is not None:
+        group["weight_decay"] = self.weight_decay
+    return group
+
+
+
+

Inherited members

+ +
+
+class StreamingTransformerLayer +(d_model: int, num_heads: int, dim_feedforward: int = 2048, dropout: float = 0.1, bias_ff: bool = True, bias_attn: bool = True, causal: bool = False, past_context: Optional[int] = None, custom: bool = False, memory_efficient: bool = False, attention_as_float32: bool = False, qk_layer_norm: bool = False, qk_layer_norm_cross: bool = False, cross_attention: bool = False, layer_scale: Optional[float] = None, rope: Optional[RotaryEmbedding] = None, attention_dropout: Optional[float] = None, kv_repeat: int = 1, norm: str = 'layer_norm', device=None, dtype=None, **kwargs) +
+
+

TransformerLayer with Streaming / Causal support. +This also integrates cross_attention, when passing cross_attention=True, +rather than having two separate classes like in PyTorch.

+

Args

+
+
d_model : int
+
Dimension of the data.
+
num_heads : int
+
Number of heads.
+
dim_feedforward : int
+
Intermediate dimension of FF module.
+
dropout : float
+
Dropout both for MHA and FF.
+
bias_ff : bool
+
Use bias for FF.
+
bias_attn : bool
+
Use bias for MHA.
+
causal : bool
+
Causal mask applied automatically.
+
past_context : int or None
+
Receptive field for the causal mask, infinite if None.
+
custom : bool
+
Use custom MHA implementation, for testing / benchmarking.
+
memory_efficient : bool
+
Use xformers based memory efficient attention.
+
attention_as_float32 : bool
+
Perform the attention as float32 +(especially important with memory_efficient as autocast won't do this automatically).
+
qk_layer_norm : bool
+
Layer normalization applied to queries and keys before dot product in attention.
+
qk_layer_norm_cross : bool
+
Same for the cross attention.
+
cross_attention : bool
+
If True, expect to get secondary input for cross-attention. +Cross attention will use the default MHA, as it typically won't require +special treatment.
+
layer_scale : float or None
+
If not None, LayerScale will be used with +the given value as initial scale.
+
rope (RotaryEmbedding or None): Rope embedding to use.
+
attention_dropout : float or None
+
If not None, use this value for the attention dropout, +separate from the dropout used in the FFN.
+
kv_repeat : int
+
If > 1, will repeat keys and values multiple times (must divide num_heads). +This will lead to faster decoding time on A100 or other GPUs with tensor cores.
+
device : torch.device or None
+
Device on which to initialize.
+
dtype : torch.dtype or None
+
dtype to use.
+
**kwargs
+
See nn.TransformerEncoderLayer.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class StreamingTransformerLayer(nn.TransformerEncoderLayer):
+    """TransformerLayer with Streaming / Causal support.
+    This also integrates cross_attention, when passing `cross_attention=True`,
+    rather than having two separate classes like in PyTorch.
+
+    Args:
+        d_model (int): Dimension of the data.
+        num_heads (int): Number of heads.
+        dim_feedforward (int): Intermediate dimension of FF module.
+        dropout (float): Dropout both for MHA and FF.
+        bias_ff (bool): Use bias for FF.
+        bias_attn (bool): Use bias for MHA.
+        causal (bool): Causal mask applied automatically.
+        past_context (int or None): Receptive field for the causal mask, infinite if None.
+        custom (bool): Use custom MHA implementation, for testing / benchmarking.
+        memory_efficient (bool): Use xformers based memory efficient attention.
+        attention_as_float32 (bool): Perform the attention as float32
+            (especially important with memory_efficient as autocast won't do this automatically).
+        qk_layer_norm (bool): Layer normalization applied to queries and keys before dot product in attention.
+        qk_layer_norm_cross (bool): Same for the cross attention.
+        cross_attention (bool): If True, expect to get secondary input for cross-attention.
+            Cross attention will use the default MHA, as it typically won't require
+            special treatment.
+        layer_scale (float or None): If not None, LayerScale will be used with
+            the given value as initial scale.
+        rope (`RotaryEmbedding` or None): Rope embedding to use.
+        attention_dropout (float or None): If not None, use this value for the attention dropout,
+            separate from the dropout used in the FFN.
+        kv_repeat (int): If > 1, will repeat keys and values multiple times (must divide num_heads).
+            This will lead to faster decoding time on A100 or other GPUs with tensor cores.
+        device (torch.device or None): Device on which to initialize.
+        dtype (torch.dtype or None): dtype to use.
+        **kwargs: See `nn.TransformerEncoderLayer`.
+    """
+    def __init__(self, d_model: int, num_heads: int, dim_feedforward: int = 2048, dropout: float = 0.1,
+                 bias_ff: bool = True, bias_attn: bool = True, causal: bool = False,
+                 past_context: tp.Optional[int] = None, custom: bool = False,
+                 memory_efficient: bool = False, attention_as_float32: bool = False,
+                 qk_layer_norm: bool = False, qk_layer_norm_cross: bool = False,
+                 cross_attention: bool = False, layer_scale: tp.Optional[float] = None,
+                 rope: tp.Optional[RotaryEmbedding] = None, attention_dropout: tp.Optional[float] = None,
+                 kv_repeat: int = 1, norm: str = 'layer_norm', device=None, dtype=None, **kwargs):
+        super().__init__(d_model, num_heads, dim_feedforward, dropout,
+                         device=device, dtype=dtype, batch_first=True, **kwargs)
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        # Redefine self_attn to our streaming multi-head attention
+        attn_kwargs: tp.Dict[str, tp.Any] = {
+            'embed_dim': d_model,
+            'num_heads': num_heads,
+            'dropout': dropout if attention_dropout is None else attention_dropout,
+            'bias': bias_attn,
+            'custom': custom,
+            'memory_efficient': memory_efficient,
+            'attention_as_float32': attention_as_float32,
+        }
+        self.self_attn: StreamingMultiheadAttention = StreamingMultiheadAttention(
+            causal=causal, past_context=past_context, rope=rope, qk_layer_norm=qk_layer_norm,
+            kv_repeat=kv_repeat, **attn_kwargs, **factory_kwargs)  # type: ignore
+        # Redefine feedforward layers to expose bias parameter
+        self.linear1 = nn.Linear(d_model, dim_feedforward, bias=bias_ff, **factory_kwargs)
+        self.linear2 = nn.Linear(dim_feedforward, d_model, bias=bias_ff, **factory_kwargs)
+
+        self.layer_scale_1: nn.Module
+        self.layer_scale_2: nn.Module
+        if layer_scale is None:
+            self.layer_scale_1 = nn.Identity()
+            self.layer_scale_2 = nn.Identity()
+        else:
+            self.layer_scale_1 = LayerScale(d_model, layer_scale, **factory_kwargs)
+            self.layer_scale_2 = LayerScale(d_model, layer_scale, **factory_kwargs)
+
+        self.cross_attention: tp.Optional[nn.Module] = None
+        if cross_attention:
+            self.cross_attention = StreamingMultiheadAttention(
+                cross_attention=True, qk_layer_norm=qk_layer_norm_cross,
+                **attn_kwargs, **factory_kwargs)
+            # Norm and dropout
+            self.dropout_cross = nn.Dropout(dropout)
+            # eps value matching that used in PyTorch reference implementation.
+            self.norm_cross = nn.LayerNorm(d_model, eps=1e-5, **factory_kwargs)
+            self.layer_scale_cross: nn.Module
+            if layer_scale is None:
+                self.layer_scale_cross = nn.Identity()
+            else:
+                self.layer_scale_cross = LayerScale(d_model, layer_scale, **factory_kwargs)
+        self.norm1 = create_norm_fn(norm, d_model, **factory_kwargs)  # type: ignore
+        self.norm2 = create_norm_fn(norm, d_model, **factory_kwargs)  # type: ignore
+
+    def _cross_attention_block(self, src: torch.Tensor,
+                               cross_attention_src: torch.Tensor) -> torch.Tensor:
+        assert self.cross_attention is not None
+        # queries are from src, keys and values from cross_attention_src.
+        x = self.cross_attention(
+            src, cross_attention_src, cross_attention_src, need_weights=False)[0]
+        return self.dropout_cross(x)  # type: ignore
+
+    def forward(self, src: torch.Tensor, src_mask: tp.Optional[torch.Tensor] = None,  # type: ignore
+                src_key_padding_mask: tp.Optional[torch.Tensor] = None,
+                cross_attention_src: tp.Optional[torch.Tensor] = None):
+        if self.cross_attention is None:
+            assert cross_attention_src is None
+        else:
+            assert cross_attention_src is not None
+        x = src
+        if self.norm_first:
+            x = x + self.layer_scale_1(
+                self._sa_block(self.norm1(x), src_mask, src_key_padding_mask))
+            if cross_attention_src is not None:
+                x = x + self.layer_scale_cross(
+                    self._cross_attention_block(
+                        self.norm_cross(x), cross_attention_src))
+            x = x + self.layer_scale_2(self._ff_block(self.norm2(x)))
+        else:
+            x = self.norm1(x + self.layer_scale_1(
+                self._sa_block(x, src_mask, src_key_padding_mask)))
+            if cross_attention_src is not None:
+                x = self.norm_cross(
+                    x + self.layer_scale_cross(
+                        self._cross_attention_block(src, cross_attention_src)))
+            x = self.norm2(x + self.layer_scale_2(self._ff_block(x)))
+        return x
+
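+
+Example (an illustrative sketch of a single layer with cross-attention enabled):
+
+# Hypothetical usage of StreamingTransformerLayer.
+import torch
+from audiocraft.modules.transformer import StreamingTransformerLayer
+
+layer = StreamingTransformerLayer(d_model=256, num_heads=4, cross_attention=True)
+src = torch.randn(2, 30, 256)        # queries
+memory = torch.randn(2, 12, 256)     # keys / values for the cross-attention
+out = layer(src, cross_attention_src=memory)
+assert out.shape == src.shape
+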
+

Ancestors

+
    +
  • torch.nn.modules.transformer.TransformerEncoderLayer
  • +
  • torch.nn.modules.module.Module
  • +
+

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def forward(self, src: torch.Tensor, src_mask: Optional[torch.Tensor] = None, src_key_padding_mask: Optional[torch.Tensor] = None, cross_attention_src: Optional[torch.Tensor] = None) ‑> Callable[..., Any] +
+
+

Pass the input through the encoder layer.

+

Args

+
+
src
+
the sequence to the encoder layer (required).
+
src_mask
+
the mask for the src sequence (optional).
+
is_causal
+
If specified, applies a causal mask as src_mask. +Default: False.
+
src_key_padding_mask
+
the mask for the src keys per batch (optional).
+
+

Shape

+

see the docs in Transformer class.

+
+ +Expand source code + +
def forward(self, src: torch.Tensor, src_mask: tp.Optional[torch.Tensor] = None,  # type: ignore
+            src_key_padding_mask: tp.Optional[torch.Tensor] = None,
+            cross_attention_src: tp.Optional[torch.Tensor] = None):
+    if self.cross_attention is None:
+        assert cross_attention_src is None
+    else:
+        assert cross_attention_src is not None
+    x = src
+    if self.norm_first:
+        x = x + self.layer_scale_1(
+            self._sa_block(self.norm1(x), src_mask, src_key_padding_mask))
+        if cross_attention_src is not None:
+            x = x + self.layer_scale_cross(
+                self._cross_attention_block(
+                    self.norm_cross(x), cross_attention_src))
+        x = x + self.layer_scale_2(self._ff_block(self.norm2(x)))
+    else:
+        x = self.norm1(x + self.layer_scale_1(
+            self._sa_block(x, src_mask, src_key_padding_mask)))
+        if cross_attention_src is not None:
+            x = self.norm_cross(
+                x + self.layer_scale_cross(
+                    self._cross_attention_block(src, cross_attention_src)))
+        x = self.norm2(x + self.layer_scale_2(self._ff_block(x)))
+    return x
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/quantization/base.html b/docs/audiocraft/quantization/base.html new file mode 100644 index 00000000..efe5b397 --- /dev/null +++ b/docs/audiocraft/quantization/base.html @@ -0,0 +1,566 @@ + + + + + + +audiocraft.quantization.base API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.quantization.base

+
+
+

Base class for all quantizers.

+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Base class for all quantizers.
+"""
+
+from dataclasses import dataclass, field
+import typing as tp
+
+import torch
+from torch import nn
+
+
+@dataclass
+class QuantizedResult:
+    x: torch.Tensor
+    codes: torch.Tensor
+    bandwidth: torch.Tensor  # bandwidth in kb/s used, per batch item.
+    penalty: tp.Optional[torch.Tensor] = None
+    metrics: dict = field(default_factory=dict)
+
+
+class BaseQuantizer(nn.Module):
+    """Base class for quantizers.
+    """
+
+    def forward(self, x: torch.Tensor, frame_rate: int) -> QuantizedResult:
+        """
+        Given input tensor x, returns first the quantized (or approximately quantized)
+        representation along with quantized codes, bandwidth, and any penalty term for the loss.
+        Finally, this returns a dict of metrics to update logging etc.
+        Frame rate must be passed so that the bandwidth is properly computed.
+        """
+        raise NotImplementedError()
+
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
+        """Encode a given input tensor with the specified sample rate at the given bandwidth.
+        """
+        raise NotImplementedError()
+
+    def decode(self, codes: torch.Tensor) -> torch.Tensor:
+        """Decode the given codes to the quantized representation.
+        """
+        raise NotImplementedError()
+
+    @property
+    def total_codebooks(self):
+        """Total number of codebooks.
+        """
+        raise NotImplementedError()
+
+    @property
+    def num_codebooks(self):
+        """Number of active codebooks.
+        """
+        raise NotImplementedError()
+
+    def set_num_codebooks(self, n: int):
+        """Set the number of active codebooks.
+        """
+        raise NotImplementedError()
+
+
+class DummyQuantizer(BaseQuantizer):
+    """Fake quantizer that actually does not perform any quantization.
+    """
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x: torch.Tensor, frame_rate: int):
+        q = x.unsqueeze(1)
+        return QuantizedResult(x, q, torch.tensor(q.numel() * 32 * frame_rate / 1000 / len(x)).to(x))
+
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
+        """Encode a given input tensor with the specified sample rate at the given bandwidth.
+        In the case of the DummyQuantizer, the codes are actually identical
+        to the input and resulting quantized representation as no quantization is done.
+        """
+        return x.unsqueeze(1)
+
+    def decode(self, codes: torch.Tensor) -> torch.Tensor:
+        """Decode the given codes to the quantized representation.
+        In the case of the DummyQuantizer, the codes are actually identical
+        to the input and resulting quantized representation as no quantization is done.
+        """
+        return codes.squeeze(1)
+
+    @property
+    def total_codebooks(self):
+        """Total number of codebooks.
+        """
+        return 1
+
+    @property
+    def num_codebooks(self):
+        """Total number of codebooks.
+        """
+        return self.total_codebooks
+
+    def set_num_codebooks(self, n: int):
+        """Set the number of active codebooks.
+        """
+        raise AttributeError("Cannot override the number of codebooks for the dummy quantizer")
+
+
+
+
+
+
+
+
+
+

Classes

+
+
+class BaseQuantizer +(*args, **kwargs) +
+
+

Base class for quantizers.

+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class BaseQuantizer(nn.Module):
+    """Base class for quantizers.
+    """
+
+    def forward(self, x: torch.Tensor, frame_rate: int) -> QuantizedResult:
+        """
+        Given input tensor x, returns first the quantized (or approximately quantized)
+        representation along with quantized codes, bandwidth, and any penalty term for the loss.
+        Finally, this returns a dict of metrics to update logging etc.
+        Frame rate must be passed so that the bandwidth is properly computed.
+        """
+        raise NotImplementedError()
+
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
+        """Encode a given input tensor with the specified sample rate at the given bandwidth.
+        """
+        raise NotImplementedError()
+
+    def decode(self, codes: torch.Tensor) -> torch.Tensor:
+        """Decode the given codes to the quantized representation.
+        """
+        raise NotImplementedError()
+
+    @property
+    def total_codebooks(self):
+        """Total number of codebooks.
+        """
+        raise NotImplementedError()
+
+    @property
+    def num_codebooks(self):
+        """Number of active codebooks.
+        """
+        raise NotImplementedError()
+
+    def set_num_codebooks(self, n: int):
+        """Set the number of active codebooks.
+        """
+        raise NotImplementedError()
+
+
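+
+Example (an illustrative sketch of the contract a concrete subclass is expected to honour; `quantizer` stands for any BaseQuantizer subclass instance):
+
+# Hypothetical round-trip through a concrete quantizer.
+import torch
+
+def quantize_round_trip(quantizer, x: torch.Tensor, frame_rate: int):
+    result = quantizer(x, frame_rate)             # QuantizedResult: x, codes, bandwidth, penalty, metrics
+    decoded = quantizer.decode(quantizer.encode(x))   # codes back to the quantized representation
+    return result, decoded
+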

Ancestors

+
    +
  • torch.nn.modules.module.Module
  • +
+

Subclasses

+ +

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Instance variables

+
+
var num_codebooks
+
+

Number of active codebooks.

+
+ +Expand source code + +
@property
+def num_codebooks(self):
+    """Number of active codebooks.
+    """
+    raise NotImplementedError()
+
+
+
var total_codebooks
+
+

Total number of codebooks.

+
+ +Expand source code + +
@property
+def total_codebooks(self):
+    """Total number of codebooks.
+    """
+    raise NotImplementedError()
+
+
+
+

Methods

+
+
+def decode(self, codes: torch.Tensor) ‑> torch.Tensor +
+
+

Decode the given codes to the quantized representation.

+
+ +Expand source code + +
def decode(self, codes: torch.Tensor) -> torch.Tensor:
+    """Decode the given codes to the quantized representation.
+    """
+    raise NotImplementedError()
+
+
+
+def encode(self, x: torch.Tensor) ‑> torch.Tensor +
+
+

Encode a given input tensor with the specified sample rate at the given bandwidth.

+
+ +Expand source code + +
def encode(self, x: torch.Tensor) -> torch.Tensor:
+    """Encode a given input tensor with the specified sample rate at the given bandwidth.
+    """
+    raise NotImplementedError()
+
+
+
+def forward(self, x: torch.Tensor, frame_rate: int) ‑> QuantizedResult +
+
+

Given input tensor x, returns first the quantized (or approximately quantized) +representation along with quantized codes, bandwidth, and any penalty term for the loss. +Finally, this returns a dict of metrics to update logging etc. +Frame rate must be passed so that the bandwidth is properly computed.

+
+ +Expand source code + +
def forward(self, x: torch.Tensor, frame_rate: int) -> QuantizedResult:
+    """
+    Given input tensor x, returns first the quantized (or approximately quantized)
+    representation along with quantized codes, bandwidth, and any penalty term for the loss.
+    Finally, this returns a dict of metrics to update logging etc.
+    Frame rate must be passed so that the bandwidth is properly computed.
+    """
+    raise NotImplementedError()
+
+
+
+def set_num_codebooks(self, n: int) +
+
+

Set the number of active codebooks.

+
+ +Expand source code + +
def set_num_codebooks(self, n: int):
+    """Set the number of active codebooks.
+    """
+    raise NotImplementedError()
+
+
+
+
+
+class DummyQuantizer +
+
+

Fake quantizer that actually does not perform any quantization.

+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class DummyQuantizer(BaseQuantizer):
+    """Fake quantizer that actually does not perform any quantization.
+    """
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x: torch.Tensor, frame_rate: int):
+        q = x.unsqueeze(1)
+        return QuantizedResult(x, q, torch.tensor(q.numel() * 32 * frame_rate / 1000 / len(x)).to(x))
+
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
+        """Encode a given input tensor with the specified sample rate at the given bandwidth.
+        In the case of the DummyQuantizer, the codes are actually identical
+        to the input and resulting quantized representation as no quantization is done.
+        """
+        return x.unsqueeze(1)
+
+    def decode(self, codes: torch.Tensor) -> torch.Tensor:
+        """Decode the given codes to the quantized representation.
+        In the case of the DummyQuantizer, the codes are actually identical
+        to the input and resulting quantized representation as no quantization is done.
+        """
+        return codes.squeeze(1)
+
+    @property
+    def total_codebooks(self):
+        """Total number of codebooks.
+        """
+        return 1
+
+    @property
+    def num_codebooks(self):
+        """Total number of codebooks.
+        """
+        return self.total_codebooks
+
+    def set_num_codebooks(self, n: int):
+        """Set the number of active codebooks.
+        """
+        raise AttributeError("Cannot override the number of codebooks for the dummy quantizer")
+
+
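A minimal usage sketch (not part of the generated docs; it assumes audiocraft is importable and uses made-up tensor shapes) showing that DummyQuantizer simply passes latents through and wraps them in a QuantizedResult:
import torch
from audiocraft.quantization import DummyQuantizer  # re-exported from audiocraft.quantization.base

quantizer = DummyQuantizer()
x = torch.randn(2, 128, 50)                     # hypothetical [B, D, T] latent frames
codes = quantizer.encode(x)                     # same values as x, with a codebook axis added: [B, 1, D, T]
assert torch.equal(quantizer.decode(codes), x)  # decode undoes the unsqueeze exactly

result = quantizer(x, frame_rate=50)            # QuantizedResult(x, codes, bandwidth)
print(result.codes.shape, result.bandwidth)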

Ancestors

+ +

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Instance variables

+
+
var num_codebooks
+
+

Total number of codebooks.

+
+ +Expand source code + +
@property
+def num_codebooks(self):
+    """Total number of codebooks.
+    """
+    return self.total_codebooks
+
+
+
+

Methods

+
+
+def decode(self, codes: torch.Tensor) ‑> torch.Tensor +
+
+

Decode the given codes to the quantized representation. +In the case of the DummyQuantizer, the codes are actually identical +to the input and resulting quantized representation as no quantization is done.

+
+ +Expand source code + +
def decode(self, codes: torch.Tensor) -> torch.Tensor:
+    """Decode the given codes to the quantized representation.
+    In the case of the DummyQuantizer, the codes are actually identical
+    to the input and resulting quantized representation as no quantization is done.
+    """
+    return codes.squeeze(1)
+
+
+
+def encode(self, x: torch.Tensor) ‑> torch.Tensor +
+
+

Encode a given input tensor with the specified sample rate at the given bandwidth. +In the case of the DummyQuantizer, the codes are actually identical +to the input and resulting quantized representation as no quantization is done.

+
+ +Expand source code + +
def encode(self, x: torch.Tensor) -> torch.Tensor:
+    """Encode a given input tensor with the specified sample rate at the given bandwidth.
+    In the case of the DummyQuantizer, the codes are actually identical
+    to the input and resulting quantized representation as no quantization is done.
+    """
+    return x.unsqueeze(1)
+
+
+
+

Inherited members

+ +
+
+class QuantizedResult +(x: torch.Tensor, codes: torch.Tensor, bandwidth: torch.Tensor, penalty: Optional[torch.Tensor] = None, metrics: dict = <factory>) +
+
+

QuantizedResult(x: torch.Tensor, codes: torch.Tensor, bandwidth: torch.Tensor, penalty: Union[torch.Tensor, NoneType] = None, metrics: dict = <factory>)

+
+ +Expand source code + +
@dataclass
+class QuantizedResult:
+    x: torch.Tensor
+    codes: torch.Tensor
+    bandwidth: torch.Tensor  # bandwidth in kb/s used, per batch item.
+    penalty: tp.Optional[torch.Tensor] = None
+    metrics: dict = field(default_factory=dict)
+
+

Class variables

+
+
var bandwidth : torch.Tensor
+
+
+
+
var codes : torch.Tensor
+
+
+
+
var metrics : dict
+
+
+
+
var penalty : Optional[torch.Tensor]
+
+
+
+
var x : torch.Tensor
+
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/quantization/core_vq.html b/docs/audiocraft/quantization/core_vq.html new file mode 100644 index 00000000..99610654 --- /dev/null +++ b/docs/audiocraft/quantization/core_vq.html @@ -0,0 +1,1538 @@ + + + + + + +audiocraft.quantization.core_vq API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.quantization.core_vq

+
+
+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import typing as tp
+
+from einops import rearrange, repeat
+import flashy
+import torch
+from torch import nn, einsum
+import torch.nn.functional as F
+
+
+def exists(val: tp.Optional[tp.Any]) -> bool:
+    return val is not None
+
+
+def default(val: tp.Any, d: tp.Any) -> tp.Any:
+    return val if exists(val) else d
+
+
+def l2norm(t):
+    return F.normalize(t, p=2, dim=-1)
+
+
+def ema_inplace(moving_avg, new, decay: float):
+    moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
+
+
+def laplace_smoothing(x, n_categories: int, epsilon: float = 1e-5):
+    return (x + epsilon) / (x.sum() + n_categories * epsilon)
+
+
+def uniform_init(*shape: int):
+    t = torch.empty(shape)
+    nn.init.kaiming_uniform_(t)
+    return t
+
+
+def sample_vectors(samples, num: int):
+    num_samples, device = samples.shape[0], samples.device
+
+    if num_samples >= num:
+        indices = torch.randperm(num_samples, device=device)[:num]
+    else:
+        indices = torch.randint(0, num_samples, (num,), device=device)
+
+    return samples[indices]
+
+
+def kmeans(samples, num_clusters: int, num_iters: int = 10):
+    dim, dtype = samples.shape[-1], samples.dtype
+
+    means = sample_vectors(samples, num_clusters)
+
+    for _ in range(num_iters):
+        diffs = rearrange(samples, "n d -> n () d") - rearrange(
+            means, "c d -> () c d"
+        )
+        dists = -(diffs ** 2).sum(dim=-1)
+
+        buckets = dists.max(dim=-1).indices
+        bins = torch.bincount(buckets, minlength=num_clusters)
+        zero_mask = bins == 0
+        bins_min_clamped = bins.masked_fill(zero_mask, 1)
+
+        new_means = buckets.new_zeros(num_clusters, dim, dtype=dtype)
+        new_means.scatter_add_(0, repeat(buckets, "n -> n d", d=dim), samples)
+        new_means = new_means / bins_min_clamped[..., None]
+
+        means = torch.where(zero_mask[..., None], means, new_means)
+
+    return means, bins
+
+
+def orthgonal_loss_fn(t):
+    # eq (2) from https://arxiv.org/abs/2112.00384
+    n = t.shape[0]
+    normed_codes = l2norm(t)
+    identity = torch.eye(n, device=t.device)
+    cosine_sim = einsum("i d, j d -> i j", normed_codes, normed_codes)
+    return ((cosine_sim - identity) ** 2).sum() / (n ** 2)
+
+
+class EuclideanCodebook(nn.Module):
+    """Codebook with Euclidean distance.
+
+    Args:
+        dim (int): Dimension.
+        codebook_size (int): Codebook size.
+        kmeans_init (bool): Whether to use k-means to initialize the codebooks.
+            If set to true, run the k-means algorithm on the first training batch and use
+            the learned centroids as initialization.
+        kmeans_iters (int): Number of iterations used for k-means algorithm at initialization.
+        decay (float): Decay for exponential moving average over the codebooks.
+        epsilon (float): Epsilon value for numerical stability.
+        threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
+            that have an exponential moving average cluster size less than the specified threshold with
+            randomly selected vector from the current batch.
+    """
+    def __init__(
+        self,
+        dim: int,
+        codebook_size: int,
+        kmeans_init: int = False,
+        kmeans_iters: int = 10,
+        decay: float = 0.8,
+        epsilon: float = 1e-5,
+        threshold_ema_dead_code: int = 2,
+    ):
+        super().__init__()
+        self.decay = decay
+        init_fn: tp.Union[tp.Callable[..., torch.Tensor], tp.Any] = uniform_init if not kmeans_init else torch.zeros
+        embed = init_fn(codebook_size, dim)
+
+        self.codebook_size = codebook_size
+
+        self.kmeans_iters = kmeans_iters
+        self.epsilon = epsilon
+        self.threshold_ema_dead_code = threshold_ema_dead_code
+
+        self.register_buffer("inited", torch.Tensor([not kmeans_init]))
+        self.register_buffer("cluster_size", torch.zeros(codebook_size))
+        self.register_buffer("embed", embed)
+        self.register_buffer("embed_avg", embed.clone())
+
+    @torch.jit.ignore
+    def init_embed_(self, data):
+        if self.inited:
+            return
+
+        embed, cluster_size = kmeans(data, self.codebook_size, self.kmeans_iters)
+        self.embed.data.copy_(embed)
+        self.embed_avg.data.copy_(embed.clone())
+        self.cluster_size.data.copy_(cluster_size)
+        self.inited.data.copy_(torch.Tensor([True]))
+        # Make sure all buffers across workers are in sync after initialization
+        flashy.distrib.broadcast_tensors(self.buffers())
+
+    def replace_(self, samples, mask):
+        modified_codebook = torch.where(
+            mask[..., None], sample_vectors(samples, self.codebook_size), self.embed
+        )
+        self.embed.data.copy_(modified_codebook)
+
+    def expire_codes_(self, batch_samples):
+        if self.threshold_ema_dead_code == 0:
+            return
+
+        expired_codes = self.cluster_size < self.threshold_ema_dead_code
+        if not torch.any(expired_codes):
+            return
+
+        batch_samples = rearrange(batch_samples, "... d -> (...) d")
+        self.replace_(batch_samples, mask=expired_codes)
+        flashy.distrib.broadcast_tensors(self.buffers())
+
+    def preprocess(self, x):
+        x = rearrange(x, "... d -> (...) d")
+        return x
+
+    def quantize(self, x):
+        embed = self.embed.t()
+        dist = -(
+            x.pow(2).sum(1, keepdim=True)
+            - 2 * x @ embed
+            + embed.pow(2).sum(0, keepdim=True)
+        )
+        embed_ind = dist.max(dim=-1).indices
+        return embed_ind
+
+    def postprocess_emb(self, embed_ind, shape):
+        return embed_ind.view(*shape[:-1])
+
+    def dequantize(self, embed_ind):
+        quantize = F.embedding(embed_ind, self.embed)
+        return quantize
+
+    def encode(self, x):
+        shape = x.shape
+        # pre-process
+        x = self.preprocess(x)
+        # quantize
+        embed_ind = self.quantize(x)
+        # post-process
+        embed_ind = self.postprocess_emb(embed_ind, shape)
+        return embed_ind
+
+    def decode(self, embed_ind):
+        quantize = self.dequantize(embed_ind)
+        return quantize
+
+    def forward(self, x):
+        shape, dtype = x.shape, x.dtype
+        x = self.preprocess(x)
+        self.init_embed_(x)
+
+        embed_ind = self.quantize(x)
+        embed_onehot = F.one_hot(embed_ind, self.codebook_size).type(dtype)
+        embed_ind = self.postprocess_emb(embed_ind, shape)
+        quantize = self.dequantize(embed_ind)
+
+        if self.training:
+            # We do the expiry of code at that point as buffers are in sync
+            # and all the workers will take the same decision.
+            self.expire_codes_(x)
+            ema_inplace(self.cluster_size, embed_onehot.sum(0), self.decay)
+            embed_sum = x.t() @ embed_onehot
+            ema_inplace(self.embed_avg, embed_sum.t(), self.decay)
+            cluster_size = (
+                laplace_smoothing(self.cluster_size, self.codebook_size, self.epsilon)
+                * self.cluster_size.sum()
+            )
+            embed_normalized = self.embed_avg / cluster_size.unsqueeze(1)
+            self.embed.data.copy_(embed_normalized)
+
+        return quantize, embed_ind
+
+
+class VectorQuantization(nn.Module):
+    """Vector quantization implementation.
+    Currently supports only euclidean distance.
+
+    Args:
+        dim (int): Dimension
+        codebook_size (int): Codebook size
+        codebook_dim (int): Codebook dimension. If not defined, uses the specified dimension in dim.
+        decay (float): Decay for exponential moving average over the codebooks.
+        epsilon (float): Epsilon value for numerical stability.
+        kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
+        kmeans_iters (int): Number of iterations used for kmeans initialization.
+        channels_last (bool): Channels are the last dimension in the input tensors.
+        commitment_weight (float): Weight for commitment loss.
+        orthogonal_reg_weight (float): Orthogonal regularization weights.
+        orthogonal_reg_active_codes_only (bool): Apply orthogonal regularization only on active codes.
+        orthogonal_reg_max_codes (optional int): Maximum number of codes to consider
+            for orthogonal regularization.
+        threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
+            that have an exponential moving average cluster size less than the specified threshold with
+            randomly selected vector from the current batch.
+    """
+    def __init__(
+        self,
+        dim: int,
+        codebook_size: int,
+        codebook_dim: tp.Optional[int] = None,
+        decay: float = 0.8,
+        epsilon: float = 1e-5,
+        kmeans_init: bool = False,
+        kmeans_iters: int = 10,
+        threshold_ema_dead_code: int = 2,
+        channels_last: bool = False,
+        commitment_weight: float = 1.,
+        orthogonal_reg_weight: float = 0.0,
+        orthogonal_reg_active_codes_only: bool = False,
+        orthogonal_reg_max_codes: tp.Optional[int] = None,
+    ):
+        super().__init__()
+        _codebook_dim: int = default(codebook_dim, dim)
+
+        requires_projection = _codebook_dim != dim
+        self.project_in = (nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity())
+        self.project_out = (nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity())
+
+        self.epsilon = epsilon
+        self.commitment_weight = commitment_weight
+
+        self.orthogonal_reg_weight = orthogonal_reg_weight
+        self.orthogonal_reg_active_codes_only = orthogonal_reg_active_codes_only
+        self.orthogonal_reg_max_codes = orthogonal_reg_max_codes
+
+        self._codebook = EuclideanCodebook(dim=_codebook_dim, codebook_size=codebook_size,
+                                           kmeans_init=kmeans_init, kmeans_iters=kmeans_iters,
+                                           decay=decay, epsilon=epsilon,
+                                           threshold_ema_dead_code=threshold_ema_dead_code)
+        self.codebook_size = codebook_size
+
+        self.channels_last = channels_last
+
+    @property
+    def codebook(self):
+        return self._codebook.embed
+
+    @property
+    def inited(self):
+        return self._codebook.inited
+
+    def _preprocess(self, x):
+        if not self.channels_last:
+            x = rearrange(x, "b d n -> b n d")
+        return x
+
+    def _postprocess(self, quantize):
+        if not self.channels_last:
+            quantize = rearrange(quantize, "b n d -> b d n")
+        return quantize
+
+    def encode(self, x):
+        x = self._preprocess(x)
+        x = self.project_in(x)
+        embed_in = self._codebook.encode(x)
+        return embed_in
+
+    def decode(self, embed_ind):
+        quantize = self._codebook.decode(embed_ind)
+        quantize = self.project_out(quantize)
+        quantize = self._postprocess(quantize)
+        return quantize
+
+    def forward(self, x):
+        device = x.device
+        x = self._preprocess(x)
+
+        x = self.project_in(x)
+        quantize, embed_ind = self._codebook(x)
+
+        if self.training:
+            quantize = x + (quantize - x).detach()
+
+        loss = torch.tensor([0.0], device=device, requires_grad=self.training)
+
+        if self.training:
+            if self.commitment_weight > 0:
+                commit_loss = F.mse_loss(quantize.detach(), x)
+                loss = loss + commit_loss * self.commitment_weight
+
+            if self.orthogonal_reg_weight > 0:
+                codebook = self.codebook
+
+                if self.orthogonal_reg_active_codes_only:
+                    # only calculate orthogonal loss for the activated codes for this batch
+                    unique_code_ids = torch.unique(embed_ind)
+                    codebook = codebook[unique_code_ids]
+
+                num_codes = codebook.shape[0]
+                if exists(self.orthogonal_reg_max_codes) and num_codes > self.orthogonal_reg_max_codes:
+                    rand_ids = torch.randperm(num_codes, device=device)[:self.orthogonal_reg_max_codes]
+                    codebook = codebook[rand_ids]
+
+                orthogonal_reg_loss = orthgonal_loss_fn(codebook)
+                loss = loss + orthogonal_reg_loss * self.orthogonal_reg_weight
+
+        quantize = self.project_out(quantize)
+        quantize = self._postprocess(quantize)
+
+        return quantize, embed_ind, loss
+
+
+class ResidualVectorQuantization(nn.Module):
+    """Residual vector quantization implementation.
+
+    Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf
+    """
+    def __init__(self, *, num_quantizers, **kwargs):
+        super().__init__()
+        self.layers = nn.ModuleList(
+            [VectorQuantization(**kwargs) for _ in range(num_quantizers)]
+        )
+
+    def forward(self, x, n_q: tp.Optional[int] = None):
+        quantized_out = 0.0
+        residual = x
+
+        all_losses = []
+        all_indices = []
+
+        n_q = n_q or len(self.layers)
+
+        for i, layer in enumerate(self.layers[:n_q]):
+            quantized, indices, loss = layer(residual)
+            residual = residual - quantized
+            quantized_out = quantized_out + quantized
+            all_indices.append(indices)
+            all_losses.append(loss)
+
+        out_losses, out_indices = map(torch.stack, (all_losses, all_indices))
+        return quantized_out, out_indices, out_losses
+
+    def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None) -> torch.Tensor:
+        residual = x
+        all_indices = []
+        n_q = n_q or len(self.layers)
+        for layer in self.layers[:n_q]:
+            indices = layer.encode(residual)
+            quantized = layer.decode(indices)
+            residual = residual - quantized
+            all_indices.append(indices)
+        out_indices = torch.stack(all_indices)
+        return out_indices
+
+    def decode(self, q_indices: torch.Tensor) -> torch.Tensor:
+        quantized_out = torch.tensor(0.0, device=q_indices.device)
+        for i, indices in enumerate(q_indices):
+            layer = self.layers[i]
+            quantized = layer.decode(indices)
+            quantized_out = quantized_out + quantized
+        return quantized_out
+
+
+
+
+
+
+
+

Functions

+
+
+def default(val: Any, d: Any) ‑> Any +
+
+
+
+ +Expand source code + +
def default(val: tp.Any, d: tp.Any) -> tp.Any:
+    return val if exists(val) else d
+
+
+
+def ema_inplace(moving_avg, new, decay: float) +
+
+
+
+ +Expand source code + +
def ema_inplace(moving_avg, new, decay: float):
+    moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
+
+
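A tiny numeric illustration (not from the source) of the in-place EMA update used throughout this module: with decay 0.8 the buffer keeps 80% of its old value and takes 20% of the new one.
import torch
from audiocraft.quantization.core_vq import ema_inplace  # assuming audiocraft is importable

avg = torch.tensor([1.0])
ema_inplace(avg, torch.tensor([2.0]), decay=0.8)
print(avg)  # tensor([1.2000]) == 0.8 * 1.0 + 0.2 * 2.0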
+
+def exists(val: Optional[Any]) ‑> bool +
+
+
+
+ +Expand source code + +
def exists(val: tp.Optional[tp.Any]) -> bool:
+    return val is not None
+
+
+
+def kmeans(samples, num_clusters: int, num_iters: int = 10) +
+
+
+
+ +Expand source code + +
def kmeans(samples, num_clusters: int, num_iters: int = 10):
+    dim, dtype = samples.shape[-1], samples.dtype
+
+    means = sample_vectors(samples, num_clusters)
+
+    for _ in range(num_iters):
+        diffs = rearrange(samples, "n d -> n () d") - rearrange(
+            means, "c d -> () c d"
+        )
+        dists = -(diffs ** 2).sum(dim=-1)
+
+        buckets = dists.max(dim=-1).indices
+        bins = torch.bincount(buckets, minlength=num_clusters)
+        zero_mask = bins == 0
+        bins_min_clamped = bins.masked_fill(zero_mask, 1)
+
+        new_means = buckets.new_zeros(num_clusters, dim, dtype=dtype)
+        new_means.scatter_add_(0, repeat(buckets, "n -> n d", d=dim), samples)
+        new_means = new_means / bins_min_clamped[..., None]
+
+        means = torch.where(zero_mask[..., None], means, new_means)
+
+    return means, bins
+
+
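A minimal sketch (random data, assumed shapes) of the k-means helper above: it returns the centroids and the per-cluster counts from the last assignment step.
import torch
from audiocraft.quantization.core_vq import kmeans  # assuming audiocraft is importable

samples = torch.randn(100, 2)                 # 100 two-dimensional points
means, bins = kmeans(samples, num_clusters=3, num_iters=10)
print(means.shape)                            # torch.Size([3, 2]) centroids
print(bins)                                   # number of points assigned to each centroid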
+
+def l2norm(t) +
+
+
+
+ +Expand source code + +
def l2norm(t):
+    return F.normalize(t, p=2, dim=-1)
+
+
+
+def laplace_smoothing(x, n_categories: int, epsilon: float = 1e-05) +
+
+
+
+ +Expand source code + +
def laplace_smoothing(x, n_categories: int, epsilon: float = 1e-5):
+    return (x + epsilon) / (x.sum() + n_categories * epsilon)
+
+
+
+def orthgonal_loss_fn(t) +
+
+
+
+ +Expand source code + +
def orthgonal_loss_fn(t):
+    # eq (2) from https://arxiv.org/abs/2112.00384
+    n = t.shape[0]
+    normed_codes = l2norm(t)
+    identity = torch.eye(n, device=t.device)
+    cosine_sim = einsum("i d, j d -> i j", normed_codes, normed_codes)
+    return ((cosine_sim - identity) ** 2).sum() / (n ** 2)
+
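A small sanity check (illustrative only) of the orthogonality penalty above: mutually orthogonal unit vectors give a cosine-similarity matrix equal to the identity, hence zero loss.
import torch
from audiocraft.quantization.core_vq import orthgonal_loss_fn  # assuming audiocraft is importable

codes = torch.eye(4)               # 4 mutually orthogonal unit vectors
print(orthgonal_loss_fn(codes))    # tensor(0.)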
+
+
+def sample_vectors(samples, num: int) +
+
+
+
+ +Expand source code + +
def sample_vectors(samples, num: int):
+    num_samples, device = samples.shape[0], samples.device
+
+    if num_samples >= num:
+        indices = torch.randperm(num_samples, device=device)[:num]
+    else:
+        indices = torch.randint(0, num_samples, (num,), device=device)
+
+    return samples[indices]
+
+
+
+def uniform_init(*shape: int) +
+
+
+
+ +Expand source code + +
def uniform_init(*shape: int):
+    t = torch.empty(shape)
+    nn.init.kaiming_uniform_(t)
+    return t
+
+
+
+
+
+

Classes

+
+
+class EuclideanCodebook +(dim: int, codebook_size: int, kmeans_init: int = False, kmeans_iters: int = 10, decay: float = 0.8, epsilon: float = 1e-05, threshold_ema_dead_code: int = 2) +
+
+

Codebook with Euclidean distance.

+

Args

+
+
dim : int
+
Dimension.
+
codebook_size : int
+
Codebook size.
+
kmeans_init : bool
+
Whether to use k-means to initialize the codebooks. +If set to true, run the k-means algorithm on the first training batch and use +the learned centroids as initialization.
+
kmeans_iters : int
+
Number of iterations used for k-means algorithm at initialization.
+
decay : float
+
Decay for exponential moving average over the codebooks.
+
epsilon : float
+
Epsilon value for numerical stability.
+
threshold_ema_dead_code : int
+
Threshold for dead code expiration. Replace any codes +that have an exponential moving average cluster size less than the specified threshold with +randomly selected vector from the current batch.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class EuclideanCodebook(nn.Module):
+    """Codebook with Euclidean distance.
+
+    Args:
+        dim (int): Dimension.
+        codebook_size (int): Codebook size.
+        kmeans_init (bool): Whether to use k-means to initialize the codebooks.
+            If set to true, run the k-means algorithm on the first training batch and use
+            the learned centroids as initialization.
+        kmeans_iters (int): Number of iterations used for k-means algorithm at initialization.
+        decay (float): Decay for exponential moving average over the codebooks.
+        epsilon (float): Epsilon value for numerical stability.
+        threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
+            that have an exponential moving average cluster size less than the specified threshold with
+            randomly selected vector from the current batch.
+    """
+    def __init__(
+        self,
+        dim: int,
+        codebook_size: int,
+        kmeans_init: int = False,
+        kmeans_iters: int = 10,
+        decay: float = 0.8,
+        epsilon: float = 1e-5,
+        threshold_ema_dead_code: int = 2,
+    ):
+        super().__init__()
+        self.decay = decay
+        init_fn: tp.Union[tp.Callable[..., torch.Tensor], tp.Any] = uniform_init if not kmeans_init else torch.zeros
+        embed = init_fn(codebook_size, dim)
+
+        self.codebook_size = codebook_size
+
+        self.kmeans_iters = kmeans_iters
+        self.epsilon = epsilon
+        self.threshold_ema_dead_code = threshold_ema_dead_code
+
+        self.register_buffer("inited", torch.Tensor([not kmeans_init]))
+        self.register_buffer("cluster_size", torch.zeros(codebook_size))
+        self.register_buffer("embed", embed)
+        self.register_buffer("embed_avg", embed.clone())
+
+    @torch.jit.ignore
+    def init_embed_(self, data):
+        if self.inited:
+            return
+
+        embed, cluster_size = kmeans(data, self.codebook_size, self.kmeans_iters)
+        self.embed.data.copy_(embed)
+        self.embed_avg.data.copy_(embed.clone())
+        self.cluster_size.data.copy_(cluster_size)
+        self.inited.data.copy_(torch.Tensor([True]))
+        # Make sure all buffers across workers are in sync after initialization
+        flashy.distrib.broadcast_tensors(self.buffers())
+
+    def replace_(self, samples, mask):
+        modified_codebook = torch.where(
+            mask[..., None], sample_vectors(samples, self.codebook_size), self.embed
+        )
+        self.embed.data.copy_(modified_codebook)
+
+    def expire_codes_(self, batch_samples):
+        if self.threshold_ema_dead_code == 0:
+            return
+
+        expired_codes = self.cluster_size < self.threshold_ema_dead_code
+        if not torch.any(expired_codes):
+            return
+
+        batch_samples = rearrange(batch_samples, "... d -> (...) d")
+        self.replace_(batch_samples, mask=expired_codes)
+        flashy.distrib.broadcast_tensors(self.buffers())
+
+    def preprocess(self, x):
+        x = rearrange(x, "... d -> (...) d")
+        return x
+
+    def quantize(self, x):
+        embed = self.embed.t()
+        dist = -(
+            x.pow(2).sum(1, keepdim=True)
+            - 2 * x @ embed
+            + embed.pow(2).sum(0, keepdim=True)
+        )
+        embed_ind = dist.max(dim=-1).indices
+        return embed_ind
+
+    def postprocess_emb(self, embed_ind, shape):
+        return embed_ind.view(*shape[:-1])
+
+    def dequantize(self, embed_ind):
+        quantize = F.embedding(embed_ind, self.embed)
+        return quantize
+
+    def encode(self, x):
+        shape = x.shape
+        # pre-process
+        x = self.preprocess(x)
+        # quantize
+        embed_ind = self.quantize(x)
+        # post-process
+        embed_ind = self.postprocess_emb(embed_ind, shape)
+        return embed_ind
+
+    def decode(self, embed_ind):
+        quantize = self.dequantize(embed_ind)
+        return quantize
+
+    def forward(self, x):
+        shape, dtype = x.shape, x.dtype
+        x = self.preprocess(x)
+        self.init_embed_(x)
+
+        embed_ind = self.quantize(x)
+        embed_onehot = F.one_hot(embed_ind, self.codebook_size).type(dtype)
+        embed_ind = self.postprocess_emb(embed_ind, shape)
+        quantize = self.dequantize(embed_ind)
+
+        if self.training:
+            # We do the expiry of code at that point as buffers are in sync
+            # and all the workers will take the same decision.
+            self.expire_codes_(x)
+            ema_inplace(self.cluster_size, embed_onehot.sum(0), self.decay)
+            embed_sum = x.t() @ embed_onehot
+            ema_inplace(self.embed_avg, embed_sum.t(), self.decay)
+            cluster_size = (
+                laplace_smoothing(self.cluster_size, self.codebook_size, self.epsilon)
+                * self.cluster_size.sum()
+            )
+            embed_normalized = self.embed_avg / cluster_size.unsqueeze(1)
+            self.embed.data.copy_(embed_normalized)
+
+        return quantize, embed_ind
+
+
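A minimal usage sketch for this codebook (hypothetical dimensions; default kmeans_init=False so no k-means pass or distributed sync is triggered): encode maps features to nearest-code indices, decode maps indices back to codebook vectors.
import torch
from audiocraft.quantization.core_vq import EuclideanCodebook  # assuming audiocraft is importable

codebook = EuclideanCodebook(dim=8, codebook_size=16)
x = torch.randn(4, 10, 8)                 # [B, T, D] features
indices = codebook.encode(x)              # [B, T] nearest-code indices
recon = codebook.decode(indices)          # [B, T, D] corresponding codebook vectors
print(indices.shape, recon.shape)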

Ancestors

+
    +
  • torch.nn.modules.module.Module
  • +
+

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def decode(self, embed_ind) +
+
+
+
+ +Expand source code + +
def decode(self, embed_ind):
+    quantize = self.dequantize(embed_ind)
+    return quantize
+
+
+
+def dequantize(self, embed_ind) +
+
+
+
+ +Expand source code + +
def dequantize(self, embed_ind):
+    quantize = F.embedding(embed_ind, self.embed)
+    return quantize
+
+
+
+def encode(self, x) +
+
+
+
+ +Expand source code + +
def encode(self, x):
+    shape = x.shape
+    # pre-process
+    x = self.preprocess(x)
+    # quantize
+    embed_ind = self.quantize(x)
+    # post-process
+    embed_ind = self.postprocess_emb(embed_ind, shape)
+    return embed_ind
+
+
+
+def expire_codes_(self, batch_samples) +
+
+
+
+ +Expand source code + +
def expire_codes_(self, batch_samples):
+    if self.threshold_ema_dead_code == 0:
+        return
+
+    expired_codes = self.cluster_size < self.threshold_ema_dead_code
+    if not torch.any(expired_codes):
+        return
+
+    batch_samples = rearrange(batch_samples, "... d -> (...) d")
+    self.replace_(batch_samples, mask=expired_codes)
+    flashy.distrib.broadcast_tensors(self.buffers())
+
+
+
+def forward(self, x) ‑> Callable[..., Any] +
+
+

Defines the computation performed at every call.

+

Should be overridden by all subclasses.

+
+

Note

+

Although the recipe for forward pass needs to be defined within +this function, one should call the Module instance afterwards +instead of this since the former takes care of running the +registered hooks while the latter silently ignores them.

+
+
+ +Expand source code + +
def forward(self, x):
+    shape, dtype = x.shape, x.dtype
+    x = self.preprocess(x)
+    self.init_embed_(x)
+
+    embed_ind = self.quantize(x)
+    embed_onehot = F.one_hot(embed_ind, self.codebook_size).type(dtype)
+    embed_ind = self.postprocess_emb(embed_ind, shape)
+    quantize = self.dequantize(embed_ind)
+
+    if self.training:
+        # We do the expiry of code at that point as buffers are in sync
+        # and all the workers will take the same decision.
+        self.expire_codes_(x)
+        ema_inplace(self.cluster_size, embed_onehot.sum(0), self.decay)
+        embed_sum = x.t() @ embed_onehot
+        ema_inplace(self.embed_avg, embed_sum.t(), self.decay)
+        cluster_size = (
+            laplace_smoothing(self.cluster_size, self.codebook_size, self.epsilon)
+            * self.cluster_size.sum()
+        )
+        embed_normalized = self.embed_avg / cluster_size.unsqueeze(1)
+        self.embed.data.copy_(embed_normalized)
+
+    return quantize, embed_ind
+
+
+
+def init_embed_(self, data) +
+
+
+
+ +Expand source code + +
@torch.jit.ignore
+def init_embed_(self, data):
+    if self.inited:
+        return
+
+    embed, cluster_size = kmeans(data, self.codebook_size, self.kmeans_iters)
+    self.embed.data.copy_(embed)
+    self.embed_avg.data.copy_(embed.clone())
+    self.cluster_size.data.copy_(cluster_size)
+    self.inited.data.copy_(torch.Tensor([True]))
+    # Make sure all buffers across workers are in sync after initialization
+    flashy.distrib.broadcast_tensors(self.buffers())
+
+
+
+def postprocess_emb(self, embed_ind, shape) +
+
+
+
+ +Expand source code + +
def postprocess_emb(self, embed_ind, shape):
+    return embed_ind.view(*shape[:-1])
+
+
+
+def preprocess(self, x) +
+
+
+
+ +Expand source code + +
def preprocess(self, x):
+    x = rearrange(x, "... d -> (...) d")
+    return x
+
+
+
+def quantize(self, x) +
+
+
+
+ +Expand source code + +
def quantize(self, x):
+    embed = self.embed.t()
+    dist = -(
+        x.pow(2).sum(1, keepdim=True)
+        - 2 * x @ embed
+        + embed.pow(2).sum(0, keepdim=True)
+    )
+    embed_ind = dist.max(dim=-1).indices
+    return embed_ind
+
+
+
+def replace_(self, samples, mask) +
+
+
+
+ +Expand source code + +
def replace_(self, samples, mask):
+    modified_codebook = torch.where(
+        mask[..., None], sample_vectors(samples, self.codebook_size), self.embed
+    )
+    self.embed.data.copy_(modified_codebook)
+
+
+
+
+
+class ResidualVectorQuantization +(*, num_quantizers, **kwargs) +
+
+

Residual vector quantization implementation.

+

Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf

+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class ResidualVectorQuantization(nn.Module):
+    """Residual vector quantization implementation.
+
+    Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf
+    """
+    def __init__(self, *, num_quantizers, **kwargs):
+        super().__init__()
+        self.layers = nn.ModuleList(
+            [VectorQuantization(**kwargs) for _ in range(num_quantizers)]
+        )
+
+    def forward(self, x, n_q: tp.Optional[int] = None):
+        quantized_out = 0.0
+        residual = x
+
+        all_losses = []
+        all_indices = []
+
+        n_q = n_q or len(self.layers)
+
+        for i, layer in enumerate(self.layers[:n_q]):
+            quantized, indices, loss = layer(residual)
+            residual = residual - quantized
+            quantized_out = quantized_out + quantized
+            all_indices.append(indices)
+            all_losses.append(loss)
+
+        out_losses, out_indices = map(torch.stack, (all_losses, all_indices))
+        return quantized_out, out_indices, out_losses
+
+    def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None) -> torch.Tensor:
+        residual = x
+        all_indices = []
+        n_q = n_q or len(self.layers)
+        for layer in self.layers[:n_q]:
+            indices = layer.encode(residual)
+            quantized = layer.decode(indices)
+            residual = residual - quantized
+            all_indices.append(indices)
+        out_indices = torch.stack(all_indices)
+        return out_indices
+
+    def decode(self, q_indices: torch.Tensor) -> torch.Tensor:
+        quantized_out = torch.tensor(0.0, device=q_indices.device)
+        for i, indices in enumerate(q_indices):
+            layer = self.layers[i]
+            quantized = layer.decode(indices)
+            quantized_out = quantized_out + quantized
+        return quantized_out
+
+
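A minimal usage sketch (hypothetical sizes; eval mode so the training-only straight-through/EMA logic is skipped): each layer quantizes the residual left by the previous one, and decode sums the per-layer reconstructions.
import torch
from audiocraft.quantization.core_vq import ResidualVectorQuantization  # assuming audiocraft is importable

rvq = ResidualVectorQuantization(num_quantizers=4, dim=16, codebook_size=32).eval()
x = torch.randn(2, 16, 20)          # [B, D, T] (channels_last=False by default)
quantized, indices, losses = rvq(x, n_q=4)
print(quantized.shape)              # [B, D, T]
print(indices.shape)                # [n_q, B, T]: one index map per quantizer
codes = rvq.encode(x, n_q=4)        # [n_q, B, T]
recon = rvq.decode(codes)           # [B, D, T]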

Ancestors

+
    +
  • torch.nn.modules.module.Module
  • +
+

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Methods

+
+
+def decode(self, q_indices: torch.Tensor) ‑> torch.Tensor +
+
+
+
+ +Expand source code + +
def decode(self, q_indices: torch.Tensor) -> torch.Tensor:
+    quantized_out = torch.tensor(0.0, device=q_indices.device)
+    for i, indices in enumerate(q_indices):
+        layer = self.layers[i]
+        quantized = layer.decode(indices)
+        quantized_out = quantized_out + quantized
+    return quantized_out
+
+
+
+def encode(self, x: torch.Tensor, n_q: Optional[int] = None) ‑> torch.Tensor +
+
+
+
+ +Expand source code + +
def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None) -> torch.Tensor:
+    residual = x
+    all_indices = []
+    n_q = n_q or len(self.layers)
+    for layer in self.layers[:n_q]:
+        indices = layer.encode(residual)
+        quantized = layer.decode(indices)
+        residual = residual - quantized
+        all_indices.append(indices)
+    out_indices = torch.stack(all_indices)
+    return out_indices
+
+
+
+def forward(self, x, n_q: Optional[int] = None) ‑> Callable[..., Any] +
+
+

Defines the computation performed at every call.

+

Should be overridden by all subclasses.

+
+

Note

+

Although the recipe for forward pass needs to be defined within +this function, one should call the Module instance afterwards +instead of this since the former takes care of running the +registered hooks while the latter silently ignores them.

+
+
+ +Expand source code + +
def forward(self, x, n_q: tp.Optional[int] = None):
+    quantized_out = 0.0
+    residual = x
+
+    all_losses = []
+    all_indices = []
+
+    n_q = n_q or len(self.layers)
+
+    for i, layer in enumerate(self.layers[:n_q]):
+        quantized, indices, loss = layer(residual)
+        residual = residual - quantized
+        quantized_out = quantized_out + quantized
+        all_indices.append(indices)
+        all_losses.append(loss)
+
+    out_losses, out_indices = map(torch.stack, (all_losses, all_indices))
+    return quantized_out, out_indices, out_losses
+
+
+
+
+
+class VectorQuantization +(dim: int, codebook_size: int, codebook_dim: Optional[int] = None, decay: float = 0.8, epsilon: float = 1e-05, kmeans_init: bool = False, kmeans_iters: int = 10, threshold_ema_dead_code: int = 2, channels_last: bool = False, commitment_weight: float = 1.0, orthogonal_reg_weight: float = 0.0, orthogonal_reg_active_codes_only: bool = False, orthogonal_reg_max_codes: Optional[int] = None) +
+
+

Vector quantization implementation. +Currently supports only euclidean distance.

+

Args

+
+
dim : int
+
Dimension
+
codebook_size : int
+
Codebook size
+
codebook_dim : int
+
Codebook dimension. If not defined, uses the specified dimension in dim.
+
decay : float
+
Decay for exponential moving average over the codebooks.
+
epsilon : float
+
Epsilon value for numerical stability.
+
kmeans_init : bool
+
Whether to use kmeans to initialize the codebooks.
+
kmeans_iters : int
+
Number of iterations used for kmeans initialization.
+
channels_last : bool
+
Channels are the last dimension in the input tensors.
+
commitment_weight : float
+
Weight for commitment loss.
+
orthogonal_reg_weight : float
+
Orthogonal regularization weights.
+
orthogonal_reg_active_codes_only : bool
+
Apply orthogonal regularization only on active codes.
+
orthogonal_reg_max_codes : optional int
+
Maximum number of codes to consider +for orthogonal regularization.
+
threshold_ema_dead_code : int
+
Threshold for dead code expiration. Replace any codes +that have an exponential moving average cluster size less than the specified threshold with +randomly selected vector from the current batch.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class VectorQuantization(nn.Module):
+    """Vector quantization implementation.
+    Currently supports only euclidean distance.
+
+    Args:
+        dim (int): Dimension
+        codebook_size (int): Codebook size
+        codebook_dim (int): Codebook dimension. If not defined, uses the specified dimension in dim.
+        decay (float): Decay for exponential moving average over the codebooks.
+        epsilon (float): Epsilon value for numerical stability.
+        kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
+        kmeans_iters (int): Number of iterations used for kmeans initialization.
+        channels_last (bool): Channels are the last dimension in the input tensors.
+        commitment_weight (float): Weight for commitment loss.
+        orthogonal_reg_weight (float): Orthogonal regularization weights.
+        orthogonal_reg_active_codes_only (bool): Apply orthogonal regularization only on active codes.
+        orthogonal_reg_max_codes (optional int): Maximum number of codes to consider
+            for orthogonal regularization.
+        threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
+            that have an exponential moving average cluster size less than the specified threshold with
+            randomly selected vector from the current batch.
+    """
+    def __init__(
+        self,
+        dim: int,
+        codebook_size: int,
+        codebook_dim: tp.Optional[int] = None,
+        decay: float = 0.8,
+        epsilon: float = 1e-5,
+        kmeans_init: bool = False,
+        kmeans_iters: int = 10,
+        threshold_ema_dead_code: int = 2,
+        channels_last: bool = False,
+        commitment_weight: float = 1.,
+        orthogonal_reg_weight: float = 0.0,
+        orthogonal_reg_active_codes_only: bool = False,
+        orthogonal_reg_max_codes: tp.Optional[int] = None,
+    ):
+        super().__init__()
+        _codebook_dim: int = default(codebook_dim, dim)
+
+        requires_projection = _codebook_dim != dim
+        self.project_in = (nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity())
+        self.project_out = (nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity())
+
+        self.epsilon = epsilon
+        self.commitment_weight = commitment_weight
+
+        self.orthogonal_reg_weight = orthogonal_reg_weight
+        self.orthogonal_reg_active_codes_only = orthogonal_reg_active_codes_only
+        self.orthogonal_reg_max_codes = orthogonal_reg_max_codes
+
+        self._codebook = EuclideanCodebook(dim=_codebook_dim, codebook_size=codebook_size,
+                                           kmeans_init=kmeans_init, kmeans_iters=kmeans_iters,
+                                           decay=decay, epsilon=epsilon,
+                                           threshold_ema_dead_code=threshold_ema_dead_code)
+        self.codebook_size = codebook_size
+
+        self.channels_last = channels_last
+
+    @property
+    def codebook(self):
+        return self._codebook.embed
+
+    @property
+    def inited(self):
+        return self._codebook.inited
+
+    def _preprocess(self, x):
+        if not self.channels_last:
+            x = rearrange(x, "b d n -> b n d")
+        return x
+
+    def _postprocess(self, quantize):
+        if not self.channels_last:
+            quantize = rearrange(quantize, "b n d -> b d n")
+        return quantize
+
+    def encode(self, x):
+        x = self._preprocess(x)
+        x = self.project_in(x)
+        embed_in = self._codebook.encode(x)
+        return embed_in
+
+    def decode(self, embed_ind):
+        quantize = self._codebook.decode(embed_ind)
+        quantize = self.project_out(quantize)
+        quantize = self._postprocess(quantize)
+        return quantize
+
+    def forward(self, x):
+        device = x.device
+        x = self._preprocess(x)
+
+        x = self.project_in(x)
+        quantize, embed_ind = self._codebook(x)
+
+        if self.training:
+            quantize = x + (quantize - x).detach()
+
+        loss = torch.tensor([0.0], device=device, requires_grad=self.training)
+
+        if self.training:
+            if self.commitment_weight > 0:
+                commit_loss = F.mse_loss(quantize.detach(), x)
+                loss = loss + commit_loss * self.commitment_weight
+
+            if self.orthogonal_reg_weight > 0:
+                codebook = self.codebook
+
+                if self.orthogonal_reg_active_codes_only:
+                    # only calculate orthogonal loss for the activated codes for this batch
+                    unique_code_ids = torch.unique(embed_ind)
+                    codebook = codebook[unique_code_ids]
+
+                num_codes = codebook.shape[0]
+                if exists(self.orthogonal_reg_max_codes) and num_codes > self.orthogonal_reg_max_codes:
+                    rand_ids = torch.randperm(num_codes, device=device)[:self.orthogonal_reg_max_codes]
+                    codebook = codebook[rand_ids]
+
+                orthogonal_reg_loss = orthgonal_loss_fn(codebook)
+                loss = loss + orthogonal_reg_loss * self.orthogonal_reg_weight
+
+        quantize = self.project_out(quantize)
+        quantize = self._postprocess(quantize)
+
+        return quantize, embed_ind, loss
+
+
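A minimal usage sketch for a single quantizer (hypothetical sizes; eval mode, so the returned loss is a zero placeholder, while in train() mode it carries the commitment and optional orthogonal terms):
import torch
from audiocraft.quantization.core_vq import VectorQuantization  # assuming audiocraft is importable

vq = VectorQuantization(dim=32, codebook_size=64).eval()
x = torch.randn(2, 32, 100)         # [B, D, T] (channels_last=False by default)
quantized, indices, loss = vq(x)    # quantized: [B, D, T], indices: [B, T]
codes = vq.encode(x)                # [B, T] codebook indices
recon = vq.decode(codes)            # [B, D, T]
print(quantized.shape, indices.shape, loss)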

Ancestors

+
    +
  • torch.nn.modules.module.Module
  • +
+

Class variables

+
+
var call_super_init : bool
+
+
+
+
var dump_patches : bool
+
+
+
+
var training : bool
+
+
+
+
+

Instance variables

+
+
var codebook
+
+
+
+ +Expand source code + +
@property
+def codebook(self):
+    return self._codebook.embed
+
+
+
var inited
+
+
+
+ +Expand source code + +
@property
+def inited(self):
+    return self._codebook.inited
+
+
+
+

Methods

+
+
+def decode(self, embed_ind) +
+
+
+
+ +Expand source code + +
def decode(self, embed_ind):
+    quantize = self._codebook.decode(embed_ind)
+    quantize = self.project_out(quantize)
+    quantize = self._postprocess(quantize)
+    return quantize
+
+
+
+def encode(self, x) +
+
+
+
+ +Expand source code + +
def encode(self, x):
+    x = self._preprocess(x)
+    x = self.project_in(x)
+    embed_in = self._codebook.encode(x)
+    return embed_in
+
+
+
+def forward(self, x) ‑> Callable[..., Any] +
+
+

Defines the computation performed at every call.

+

Should be overridden by all subclasses.

+
+

Note

+

Although the recipe for forward pass needs to be defined within +this function, one should call the Module instance afterwards +instead of this since the former takes care of running the +registered hooks while the latter silently ignores them.

+
+
+ +Expand source code + +
def forward(self, x):
+    device = x.device
+    x = self._preprocess(x)
+
+    x = self.project_in(x)
+    quantize, embed_ind = self._codebook(x)
+
+    if self.training:
+        quantize = x + (quantize - x).detach()
+
+    loss = torch.tensor([0.0], device=device, requires_grad=self.training)
+
+    if self.training:
+        if self.commitment_weight > 0:
+            commit_loss = F.mse_loss(quantize.detach(), x)
+            loss = loss + commit_loss * self.commitment_weight
+
+        if self.orthogonal_reg_weight > 0:
+            codebook = self.codebook
+
+            if self.orthogonal_reg_active_codes_only:
+                # only calculate orthogonal loss for the activated codes for this batch
+                unique_code_ids = torch.unique(embed_ind)
+                codebook = codebook[unique_code_ids]
+
+            num_codes = codebook.shape[0]
+            if exists(self.orthogonal_reg_max_codes) and num_codes > self.orthogonal_reg_max_codes:
+                rand_ids = torch.randperm(num_codes, device=device)[:self.orthogonal_reg_max_codes]
+                codebook = codebook[rand_ids]
+
+            orthogonal_reg_loss = orthgonal_loss_fn(codebook)
+            loss = loss + orthogonal_reg_loss * self.orthogonal_reg_weight
+
+    quantize = self.project_out(quantize)
+    quantize = self._postprocess(quantize)
+
+    return quantize, embed_ind, loss
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/quantization/index.html b/docs/audiocraft/quantization/index.html new file mode 100644 index 00000000..3224d9aa --- /dev/null +++ b/docs/audiocraft/quantization/index.html @@ -0,0 +1,89 @@ + + + + + + +audiocraft.quantization API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.quantization

+
+
+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# flake8: noqa
+from .vq import ResidualVectorQuantizer
+from .base import BaseQuantizer, DummyQuantizer, QuantizedResult
+
+
+
+

Sub-modules

+
+
audiocraft.quantization.base
+
+

Base class for all quantizers.

+
+
audiocraft.quantization.core_vq
+
+
+
+
audiocraft.quantization.vq
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/quantization/vq.html b/docs/audiocraft/quantization/vq.html new file mode 100644 index 00000000..9bd7694e --- /dev/null +++ b/docs/audiocraft/quantization/vq.html @@ -0,0 +1,390 @@ + + + + + + +audiocraft.quantization.vq API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.quantization.vq

+
+
+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import typing as tp
+
+import torch
+
+from .base import BaseQuantizer, QuantizedResult
+from .core_vq import ResidualVectorQuantization
+
+
+class ResidualVectorQuantizer(BaseQuantizer):
+    """Residual Vector Quantizer.
+
+    Args:
+        dimension (int): Dimension of the codebooks.
+        n_q (int): Number of residual vector quantizers used.
+        q_dropout (bool): Random quantizer drop out at train time.
+        bins (int): Codebook size.
+        decay (float): Decay for exponential moving average over the codebooks.
+        kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
+        kmeans_iters (int): Number of iterations used for kmeans initialization.
+        threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
+            that have an exponential moving average cluster size less than the specified threshold with
+            randomly selected vector from the current batch.
+        orthogonal_reg_weight (float): Orthogonal regularization weights.
+        orthogonal_reg_active_codes_only (bool): Apply orthogonal regularization only on active codes.
+        orthogonal_reg_max_codes (optional int): Maximum number of codes to consider
+            for orthogonal regularization.
+    """
+    def __init__(
+        self,
+        dimension: int = 256,
+        n_q: int = 8,
+        q_dropout: bool = False,
+        bins: int = 1024,
+        decay: float = 0.99,
+        kmeans_init: bool = True,
+        kmeans_iters: int = 10,
+        threshold_ema_dead_code: int = 2,
+        orthogonal_reg_weight: float = 0.0,
+        orthogonal_reg_active_codes_only: bool = False,
+        orthogonal_reg_max_codes: tp.Optional[int] = None,
+    ):
+        super().__init__()
+        self.max_n_q = n_q
+        self.n_q = n_q
+        self.q_dropout = q_dropout
+        self.dimension = dimension
+        self.bins = bins
+        self.decay = decay
+        self.kmeans_init = kmeans_init
+        self.kmeans_iters = kmeans_iters
+        self.threshold_ema_dead_code = threshold_ema_dead_code
+        self.orthogonal_reg_weight = orthogonal_reg_weight
+        self.orthogonal_reg_active_codes_only = orthogonal_reg_active_codes_only
+        self.orthogonal_reg_max_codes = orthogonal_reg_max_codes
+        self.vq = ResidualVectorQuantization(
+            dim=self.dimension,
+            codebook_size=self.bins,
+            num_quantizers=self.n_q,
+            decay=self.decay,
+            kmeans_init=self.kmeans_init,
+            kmeans_iters=self.kmeans_iters,
+            threshold_ema_dead_code=self.threshold_ema_dead_code,
+            orthogonal_reg_weight=self.orthogonal_reg_weight,
+            orthogonal_reg_active_codes_only=self.orthogonal_reg_active_codes_only,
+            orthogonal_reg_max_codes=self.orthogonal_reg_max_codes,
+            channels_last=False
+        )
+
+    def forward(self, x: torch.Tensor, frame_rate: int):
+        n_q = self.n_q
+        if self.training and self.q_dropout:
+            n_q = int(torch.randint(1, self.n_q + 1, (1,)).item())
+        bw_per_q = math.log2(self.bins) * frame_rate / 1000
+        quantized, codes, commit_loss = self.vq(x, n_q=n_q)
+        codes = codes.transpose(0, 1)
+        # codes is [B, K, T], with T frames, K nb of codebooks.
+        bw = torch.tensor(n_q * bw_per_q).to(x)
+        return QuantizedResult(quantized, codes, bw, penalty=torch.mean(commit_loss))
+
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
+        """Encode a given input tensor with the specified frame rate at the given bandwidth.
+        The RVQ encode method sets the appropriate number of quantizers to use
+        and returns indices for each quantizer.
+        """
+        n_q = self.n_q
+        codes = self.vq.encode(x, n_q=n_q)
+        codes = codes.transpose(0, 1)
+        # codes is [B, K, T], with T frames, K nb of codebooks.
+        return codes
+
+    def decode(self, codes: torch.Tensor) -> torch.Tensor:
+        """Decode the given codes to the quantized representation.
+        """
+        # codes is [B, K, T], with T frames, K nb of codebooks, vq.decode expects [K, B, T].
+        codes = codes.transpose(0, 1)
+        quantized = self.vq.decode(codes)
+        return quantized
+
+    @property
+    def total_codebooks(self):
+        return self.max_n_q
+
+    @property
+    def num_codebooks(self):
+        return self.n_q
+
+    def set_num_codebooks(self, n: int):
+        assert n > 0 and n <= self.max_n_q
+        self.n_q = n
+
+
+
+
+
+
+
+
+
+

Classes

+
+
+class ResidualVectorQuantizer +(dimension: int = 256, n_q: int = 8, q_dropout: bool = False, bins: int = 1024, decay: float = 0.99, kmeans_init: bool = True, kmeans_iters: int = 10, threshold_ema_dead_code: int = 2, orthogonal_reg_weight: float = 0.0, orthogonal_reg_active_codes_only: bool = False, orthogonal_reg_max_codes: Optional[int] = None) +
+
+

Residual Vector Quantizer.

+

Args

+
+
dimension : int
+
Dimension of the codebooks.
+
n_q : int
+
Number of residual vector quantizers used.
+
q_dropout : bool
+
Random quantizer drop out at train time.
+
bins : int
+
Codebook size.
+
decay : float
+
Decay for exponential moving average over the codebooks.
+
kmeans_init : bool
+
Whether to use kmeans to initialize the codebooks.
+
kmeans_iters : int
+
Number of iterations used for kmeans initialization.
+
threshold_ema_dead_code : int
+
Threshold for dead code expiration. Replace any codes +that have an exponential moving average cluster size less than the specified threshold with +randomly selected vector from the current batch.
+
orthogonal_reg_weight : float
+
Orthogonal regularization weights.
+
orthogonal_reg_active_codes_only : bool
+
Apply orthogonal regularization only on active codes.
+
orthogonal_reg_max_codes : optional int
+
Maximum number of codes to consider +for orthogonal regularization.
+
+

Initializes internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class ResidualVectorQuantizer(BaseQuantizer):
+    """Residual Vector Quantizer.
+
+    Args:
+        dimension (int): Dimension of the codebooks.
+        n_q (int): Number of residual vector quantizers used.
+        q_dropout (bool): Random quantizer drop out at train time.
+        bins (int): Codebook size.
+        decay (float): Decay for exponential moving average over the codebooks.
+        kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
+        kmeans_iters (int): Number of iterations used for kmeans initialization.
+        threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
+            that have an exponential moving average cluster size less than the specified threshold with
+            randomly selected vector from the current batch.
+        orthogonal_reg_weight (float): Orthogonal regularization weights.
+        orthogonal_reg_active_codes_only (bool): Apply orthogonal regularization only on active codes.
+        orthogonal_reg_max_codes (optional int): Maximum number of codes to consider
+            for orthogonal regularization.
+    """
+    def __init__(
+        self,
+        dimension: int = 256,
+        n_q: int = 8,
+        q_dropout: bool = False,
+        bins: int = 1024,
+        decay: float = 0.99,
+        kmeans_init: bool = True,
+        kmeans_iters: int = 10,
+        threshold_ema_dead_code: int = 2,
+        orthogonal_reg_weight: float = 0.0,
+        orthogonal_reg_active_codes_only: bool = False,
+        orthogonal_reg_max_codes: tp.Optional[int] = None,
+    ):
+        super().__init__()
+        self.max_n_q = n_q
+        self.n_q = n_q
+        self.q_dropout = q_dropout
+        self.dimension = dimension
+        self.bins = bins
+        self.decay = decay
+        self.kmeans_init = kmeans_init
+        self.kmeans_iters = kmeans_iters
+        self.threshold_ema_dead_code = threshold_ema_dead_code
+        self.orthogonal_reg_weight = orthogonal_reg_weight
+        self.orthogonal_reg_active_codes_only = orthogonal_reg_active_codes_only
+        self.orthogonal_reg_max_codes = orthogonal_reg_max_codes
+        self.vq = ResidualVectorQuantization(
+            dim=self.dimension,
+            codebook_size=self.bins,
+            num_quantizers=self.n_q,
+            decay=self.decay,
+            kmeans_init=self.kmeans_init,
+            kmeans_iters=self.kmeans_iters,
+            threshold_ema_dead_code=self.threshold_ema_dead_code,
+            orthogonal_reg_weight=self.orthogonal_reg_weight,
+            orthogonal_reg_active_codes_only=self.orthogonal_reg_active_codes_only,
+            orthogonal_reg_max_codes=self.orthogonal_reg_max_codes,
+            channels_last=False
+        )
+
+    def forward(self, x: torch.Tensor, frame_rate: int):
+        n_q = self.n_q
+        if self.training and self.q_dropout:
+            n_q = int(torch.randint(1, self.n_q + 1, (1,)).item())
+        bw_per_q = math.log2(self.bins) * frame_rate / 1000
+        quantized, codes, commit_loss = self.vq(x, n_q=n_q)
+        codes = codes.transpose(0, 1)
+        # codes is [B, K, T], with T frames, K nb of codebooks.
+        bw = torch.tensor(n_q * bw_per_q).to(x)
+        return QuantizedResult(quantized, codes, bw, penalty=torch.mean(commit_loss))
+
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
+        """Encode a given input tensor with the specified frame rate at the given bandwidth.
+        The RVQ encode method sets the appropriate number of quantizers to use
+        and returns indices for each quantizer.
+        """
+        n_q = self.n_q
+        codes = self.vq.encode(x, n_q=n_q)
+        codes = codes.transpose(0, 1)
+        # codes is [B, K, T], with T frames, K nb of codebooks.
+        return codes
+
+    def decode(self, codes: torch.Tensor) -> torch.Tensor:
+        """Decode the given codes to the quantized representation.
+        """
+        # codes is [B, K, T], with T frames, K nb of codebooks, vq.decode expects [K, B, T].
+        codes = codes.transpose(0, 1)
+        quantized = self.vq.decode(codes)
+        return quantized
+
+    @property
+    def total_codebooks(self):
+        return self.max_n_q
+
+    @property
+    def num_codebooks(self):
+        return self.n_q
+
+    def set_num_codebooks(self, n: int):
+        assert n > 0 and n <= self.max_n_q
+        self.n_q = n
+
+
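A minimal usage sketch of the quantizer above, assuming it is exposed as audiocraft.quantization.ResidualVectorQuantizer and that QuantizedResult carries codes and bandwidth fields (as in audiocraft.quantization.base); adjust the names to your checkout if they differ.

import torch
from audiocraft.quantization import ResidualVectorQuantizer  # assumed import path

rvq = ResidualVectorQuantizer(dimension=128, n_q=4, bins=1024)
rvq.eval()                          # inference-style call: no codebook updates in this sketch
x = torch.randn(2, 128, 50)         # [B, D, T] latent frames, channels first
out = rvq(x, frame_rate=50)         # QuantizedResult
print(out.codes.shape)              # [2, 4, 50], i.e. [B, K, T]
print(out.bandwidth)                # 4 * log2(1024) * 50 / 1000 = 2.0 kbps

codes = rvq.encode(x)               # indices only, [B, K, T]
recon = rvq.decode(codes)           # quantized latent, [B, D, T]

The reported bandwidth follows the formula used in forward(), n_q * log2(bins) * frame_rate / 1000, i.e. the bitrate needed to transmit every codebook index at the model frame rate.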


Methods

+
+
+def encode(self, x: torch.Tensor) ‑> torch.Tensor +
+
+

Encode a given input tensor with the specified frame rate at the given bandwidth. +The RVQ encode method sets the appropriate number of quantizers to use +and returns indices for each quantizer.

+
+ +Expand source code + +
def encode(self, x: torch.Tensor) -> torch.Tensor:
+    """Encode a given input tensor with the specified frame rate at the given bandwidth.
+    The RVQ encode method sets the appropriate number of quantizers to use
+    and returns indices for each quantizer.
+    """
+    n_q = self.n_q
+    codes = self.vq.encode(x, n_q=n_q)
+    codes = codes.transpose(0, 1)
+    # codes is [B, K, T], with T frames, K nb of codebooks.
+    return codes
+
+
+
+

+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/utils/autocast.html b/docs/audiocraft/utils/autocast.html new file mode 100644 index 00000000..bbf4554e --- /dev/null +++ b/docs/audiocraft/utils/autocast.html @@ -0,0 +1,163 @@ + + + + + + +audiocraft.utils.autocast API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.utils.autocast

+
+
+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+
+class TorchAutocast:
+    """TorchAutocast utility class.
+    Allows you to enable and disable autocast. This is especially useful
+    when dealing with different architectures and clusters with different
+    levels of support.
+
+    Args:
+        enabled (bool): Whether to enable torch.autocast or not.
+        args: Additional args for torch.autocast.
+        kwargs: Additional kwargs for torch.autocast
+    """
+    def __init__(self, enabled: bool, *args, **kwargs):
+        self.autocast = torch.autocast(*args, **kwargs) if enabled else None
+
+    def __enter__(self):
+        if self.autocast is None:
+            return
+        try:
+            self.autocast.__enter__()
+        except RuntimeError:
+            device = self.autocast.device
+            dtype = self.autocast.fast_dtype
+            raise RuntimeError(
+                f"There was an error autocasting with dtype={dtype} device={device}\n"
+                "If you are on the FAIR Cluster, you might need to use autocast_dtype=float16"
+            )
+
+    def __exit__(self, *args, **kwargs):
+        if self.autocast is None:
+            return
+        self.autocast.__exit__(*args, **kwargs)
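A short usage sketch: because the wrapper is a no-op when enabled=False, the same block of code runs unchanged on CPU-only machines and under CUDA float16 autocast.

import torch
from audiocraft.utils.autocast import TorchAutocast

use_cuda = torch.cuda.is_available()
amp = TorchAutocast(enabled=use_cuda, device_type='cuda', dtype=torch.float16)

device = 'cuda' if use_cuda else 'cpu'
layer = torch.nn.Linear(16, 4).to(device)
x = torch.randn(8, 16, device=device)
with amp:
    y = layer(x)  # float16 matmul under autocast when enabled, plain float32 otherwise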
+
+
+
+
+
+
+
+
+
+

Classes

+
+
+class TorchAutocast +(enabled: bool, *args, **kwargs) +
+
+

TorchAutocast utility class. +Allows you to enable and disable autocast. This is especially useful +when dealing with different architectures and clusters with different +levels of support.

+

Args

+
+
enabled : bool
+
Whether to enable torch.autocast or not.
+
args
+
Additional args for torch.autocast.
+
kwargs
+
Additional kwargs for torch.autocast
+
+
+ +Expand source code + +
class TorchAutocast:
+    """TorchAutocast utility class.
+    Allows you to enable and disable autocast. This is especially useful
+    when dealing with different architectures and clusters with different
+    levels of support.
+
+    Args:
+        enabled (bool): Whether to enable torch.autocast or not.
+        args: Additional args for torch.autocast.
+        kwargs: Additional kwargs for torch.autocast
+    """
+    def __init__(self, enabled: bool, *args, **kwargs):
+        self.autocast = torch.autocast(*args, **kwargs) if enabled else None
+
+    def __enter__(self):
+        if self.autocast is None:
+            return
+        try:
+            self.autocast.__enter__()
+        except RuntimeError:
+            device = self.autocast.device
+            dtype = self.autocast.fast_dtype
+            raise RuntimeError(
+                f"There was an error autocasting with dtype={dtype} device={device}\n"
+                "If you are on the FAIR Cluster, you might need to use autocast_dtype=float16"
+            )
+
+    def __exit__(self, *args, **kwargs):
+        if self.autocast is None:
+            return
+        self.autocast.__exit__(*args, **kwargs)
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/utils/export.html b/docs/audiocraft/utils/export.html new file mode 100644 index 00000000..70e932e5 --- /dev/null +++ b/docs/audiocraft/utils/export.html @@ -0,0 +1,168 @@ + + + + + + +audiocraft.utils.export API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.utils.export

+
+
+

Utility to export a training checkpoint to a lightweight release checkpoint.

+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Utility to export a training checkpoint to a lightweight release checkpoint.
+"""
+
+from pathlib import Path
+import typing as tp
+
+from omegaconf import OmegaConf, DictConfig
+import torch
+
+
+def _clean_lm_cfg(cfg: DictConfig):
+    OmegaConf.set_struct(cfg, False)
+    # This used to be set automatically in the LM solver, need a more robust solution
+    # for the future.
+    cfg['transformer_lm']['card'] = 2048
+    cfg['transformer_lm']['n_q'] = 4
+    # Experimental params no longer supported.
+    bad_params = ['spectral_norm_attn_iters', 'spectral_norm_ff_iters',
+                  'residual_balancer_attn', 'residual_balancer_ff', 'layer_drop']
+    for name in bad_params:
+        del cfg['transformer_lm'][name]
+    OmegaConf.set_struct(cfg, True)
+    return cfg
+
+
+def export_encodec(checkpoint_path: tp.Union[Path, str], out_folder: tp.Union[Path, str]):
+    sig = Path(checkpoint_path).parent.name
+    assert len(sig) == 8, "Not a valid Dora signature"
+    pkg = torch.load(checkpoint_path, 'cpu')
+    new_pkg = {
+        'best_state': pkg['ema']['state']['model'],
+        'xp.cfg': OmegaConf.to_yaml(pkg['xp.cfg']),
+    }
+    out_file = Path(out_folder) / f'{sig}.th'
+    torch.save(new_pkg, out_file)
+    return out_file
+
+
+def export_lm(checkpoint_path: tp.Union[Path, str], out_folder: tp.Union[Path, str]):
+    sig = Path(checkpoint_path).parent.name
+    assert len(sig) == 8, "Not a valid Dora signature"
+    pkg = torch.load(checkpoint_path, 'cpu')
+    new_pkg = {
+        'best_state': pkg['fsdp_best_state']['model'],
+        'xp.cfg': OmegaConf.to_yaml(_clean_lm_cfg(pkg['xp.cfg']))
+    }
+    out_file = Path(out_folder) / f'{sig}.th'
+    torch.save(new_pkg, out_file)
+    return out_file
+
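A hedged sketch of how these helpers are meant to be invoked; the paths below are made up for illustration, and only the checkpoint's parent-directory name (the 8-character Dora signature) determines the output file name.

from pathlib import Path
from audiocraft.utils.export import export_encodec

ckpt = Path('/checkpoints/0a1b2c3d/checkpoint.th')  # hypothetical Dora experiment folder
out_dir = Path('/releases')
out_dir.mkdir(parents=True, exist_ok=True)

released = export_encodec(ckpt, out_dir)  # writes /releases/0a1b2c3d.th
print(released)

export_lm is called the same way, but it takes the weights from the FSDP best state and strips the no-longer-supported experimental keys via _clean_lm_cfg.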
+
+
+
+
+
+
+

Functions

+
+
+def export_encodec(checkpoint_path: Union[str, pathlib.Path], out_folder: Union[str, pathlib.Path]) +
+
+
+
+ +Expand source code + +
def export_encodec(checkpoint_path: tp.Union[Path, str], out_folder: tp.Union[Path, str]):
+    sig = Path(checkpoint_path).parent.name
+    assert len(sig) == 8, "Not a valid Dora signature"
+    pkg = torch.load(checkpoint_path, 'cpu')
+    new_pkg = {
+        'best_state': pkg['ema']['state']['model'],
+        'xp.cfg': OmegaConf.to_yaml(pkg['xp.cfg']),
+    }
+    out_file = Path(out_folder) / f'{sig}.th'
+    torch.save(new_pkg, out_file)
+    return out_file
+
+
+
+def export_lm(checkpoint_path: Union[str, pathlib.Path], out_folder: Union[str, pathlib.Path]) +
+
+
+
+ +Expand source code + +
def export_lm(checkpoint_path: tp.Union[Path, str], out_folder: tp.Union[Path, str]):
+    sig = Path(checkpoint_path).parent.name
+    assert len(sig) == 8, "Not a valid Dora signature"
+    pkg = torch.load(checkpoint_path, 'cpu')
+    new_pkg = {
+        'best_state': pkg['fsdp_best_state']['model'],
+        'xp.cfg': OmegaConf.to_yaml(_clean_lm_cfg(pkg['xp.cfg']))
+    }
+    out_file = Path(out_folder) / f'{sig}.th'
+    torch.save(new_pkg, out_file)
+    return out_file
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/utils/index.html b/docs/audiocraft/utils/index.html new file mode 100644 index 00000000..f6515be6 --- /dev/null +++ b/docs/audiocraft/utils/index.html @@ -0,0 +1,90 @@ + + + + + + +audiocraft.utils API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.utils

+
+
+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+
+

Sub-modules

+
+
audiocraft.utils.autocast
+
+
+
+
audiocraft.utils.export
+
+

Utility to export a training checkpoint to a lightweight release checkpoint.

+
+
audiocraft.utils.notebook
+
+
+
+
audiocraft.utils.utils
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/utils/notebook.html b/docs/audiocraft/utils/notebook.html new file mode 100644 index 00000000..075a78d7 --- /dev/null +++ b/docs/audiocraft/utils/notebook.html @@ -0,0 +1,133 @@ + + + + + + +audiocraft.utils.notebook API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.utils.notebook

+
+
+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+try:
+    import IPython.display as ipd  # type: ignore
+except ImportError:
+    # Not in a notebook...
+    pass
+
+
+import torch
+
+
+def display_audio(samples: torch.Tensor, sample_rate: int):
+    """Renders an audio player for the given audio samples.
+
+    Args:
+        samples (torch.Tensor): a Tensor of decoded audio samples
+            with shapes [B, C, T] or [C, T]
+        sample_rate (int): sample rate audio should be displayed with.
+    """
+    assert samples.dim() == 2 or samples.dim() == 3
+
+    samples = samples.detach().cpu()
+    if samples.dim() == 2:
+        samples = samples[None, ...]
+
+    for audio in samples:
+        ipd.display(ipd.Audio(audio, rate=sample_rate))
+
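A small sketch meant to run inside a Jupyter notebook (so that IPython.display is importable): it synthesizes one second of a 440 Hz tone and renders a single audio player.

import math
import torch
from audiocraft.utils.notebook import display_audio

sample_rate = 16000
t = torch.arange(sample_rate) / sample_rate
tone = torch.sin(2 * math.pi * 440.0 * t)
display_audio(tone[None, :], sample_rate)  # [C, T] input -> one player; [B, C, T] renders one per batch item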
+
+
+
+
+
+
+

Functions

+
+
+def display_audio(samples: torch.Tensor, sample_rate: int) +
+
+

Renders an audio player for the given audio samples.

+

Args

+
+
samples : torch.Tensor
+
a Tensor of decoded audio samples +with shapes [B, C, T] or [C, T]
+
sample_rate : int
+
sample rate audio should be displayed with.
+
+
+ +Expand source code + +
def display_audio(samples: torch.Tensor, sample_rate: int):
+    """Renders an audio player for the given audio samples.
+
+    Args:
+        samples (torch.Tensor): a Tensor of decoded audio samples
+            with shapes [B, C, T] or [C, T]
+        sample_rate (int): sample rate audio should be displayed with.
+    """
+    assert samples.dim() == 2 or samples.dim() == 3
+
+    samples = samples.detach().cpu()
+    if samples.dim() == 2:
+        samples = samples[None, ...]
+
+    for audio in samples:
+        ipd.display(ipd.Audio(audio, rate=sample_rate))
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/audiocraft/utils/utils.html b/docs/audiocraft/utils/utils.html new file mode 100644 index 00000000..837913bc --- /dev/null +++ b/docs/audiocraft/utils/utils.html @@ -0,0 +1,796 @@ + + + + + + +audiocraft.utils.utils API documentation + + + + + + + + + + + +
+
+
+

Module audiocraft.utils.utils

+
+
+
+ +Expand source code + +
# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from concurrent.futures import ProcessPoolExecutor
+from functools import wraps
+import hashlib
+import logging
+import typing as tp
+
+import flashy
+import flashy.distrib
+import omegaconf
+import torch
+from torch.nn.utils.rnn import pad_sequence
+
+
+logger = logging.getLogger(__name__)
+
+
+def dict_from_config(cfg: omegaconf.DictConfig) -> dict:
+    """Convenience function to map an omegaconf configuration to a dictionary.
+
+    Args:
+        cfg (omegaconf.DictConfig): Original configuration to map to dict.
+    Returns:
+        dict: Config as dictionary object.
+    """
+    dct = omegaconf.OmegaConf.to_container(cfg, resolve=True)
+    assert isinstance(dct, dict)
+    return dct
+
+
+def random_subset(dataset, max_samples: int, seed: int = 42) -> torch.utils.data.Subset:
+    if max_samples >= len(dataset):
+        return dataset
+
+    generator = torch.Generator().manual_seed(seed)
+    perm = torch.randperm(len(dataset), generator=generator)
+    return torch.utils.data.Subset(dataset, perm[:max_samples].tolist())
+
+
+def get_loader(dataset, num_samples: tp.Optional[int], batch_size: int,
+               num_workers: int, seed: int, **kwargs) -> torch.utils.data.DataLoader:
+    """Convenience function to load dataset into a dataloader with optional subset sampling.
+
+    Args:
+        dataset: Dataset to load.
+        num_samples (Optional[int]): Number of samples to limit subset size.
+        batch_size (int): Batch size.
+        num_workers (int): Number of workers for data loading.
+        seed (int): Random seed.
+    """
+    if num_samples is not None:
+        dataset = random_subset(dataset, num_samples, seed)
+
+    dataloader = flashy.distrib.loader(
+        dataset,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        **kwargs
+    )
+    return dataloader
+
+
+def get_dataset_from_loader(dataloader):
+    dataset = dataloader.dataset
+    if isinstance(dataset, torch.utils.data.Subset):
+        return dataset.dataset
+    else:
+        return dataset
+
+
+def multinomial(input: torch.Tensor, num_samples: int, replacement=False, *, generator=None):
+    """torch.multinomial with arbitrary number of dimensions, and number of candidates on the last dimension.
+
+    Args:
+        input (torch.Tensor): The input tensor containing probabilities.
+        num_samples (int): Number of samples to draw.
+        replacement (bool): Whether to draw with replacement or not.
+    Keyword args:
+        generator (torch.Generator): A pseudorandom number generator for sampling.
+    Returns:
+        torch.Tensor: Last dimension contains num_samples indices
+            sampled from the multinomial probability distribution
+            located in the last dimension of tensor input.
+    """
+    input_ = input.reshape(-1, input.shape[-1])
+    output_ = torch.multinomial(input_, num_samples=num_samples, replacement=replacement, generator=generator)
+    output = output_.reshape(*list(input.shape[:-1]), -1)
+    return output
+
+
+def sample_top_k(probs: torch.Tensor, k: int) -> torch.Tensor:
+    """Sample next token from top K values along the last dimension of the input probs tensor.
+
+    Args:
+        probs (torch.Tensor): Input probabilities with token candidates on the last dimension.
+        k (int): The k in “top-k”.
+    Returns:
+        torch.Tensor: Sampled tokens.
+    """
+    top_k_value, _ = torch.topk(probs, k, dim=-1)
+    min_value_top_k = top_k_value[..., [-1]]
+    probs *= (probs >= min_value_top_k).float()
+    probs.div_(probs.sum(dim=-1, keepdim=True))
+    next_token = multinomial(probs, num_samples=1)
+    return next_token
+
+
+def sample_top_p(probs: torch.Tensor, p: float) -> torch.Tensor:
+    """Sample next token from top P probabilities along the last dimension of the input probs tensor.
+
+    Args:
+        probs (torch.Tensor): Input probabilities with token candidates on the last dimension.
+        p (float): The p in “top-p”.
+    Returns:
+        torch.Tensor: Sampled tokens.
+    """
+    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
+    probs_sum = torch.cumsum(probs_sort, dim=-1)
+    mask = probs_sum - probs_sort > p
+    probs_sort *= (~mask).float()
+    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
+    next_token = multinomial(probs_sort, num_samples=1)
+    next_token = torch.gather(probs_idx, -1, next_token)
+    return next_token
+
+
+class DummyPoolExecutor:
+    """Dummy pool executor to use when we actually have only 1 worker.
+    (e.g. instead of ProcessPoolExecutor).
+    """
+    class DummyResult:
+        def __init__(self, func, *args, **kwargs):
+            self.func = func
+            self.args = args
+            self.kwargs = kwargs
+
+        def result(self):
+            return self.func(*self.args, **self.kwargs)
+
+    def __init__(self, workers, mp_context=None):
+        pass
+
+    def submit(self, func, *args, **kwargs):
+        return DummyPoolExecutor.DummyResult(func, *args, **kwargs)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, exc_tb):
+        return
+
+
+def get_pool_executor(num_workers: int, mp_context=None):
+    return ProcessPoolExecutor(num_workers, mp_context) if num_workers > 1 else DummyPoolExecutor(1)
+
+
+def length_to_mask(lengths: torch.Tensor, max_len: tp.Optional[int] = None) -> torch.Tensor:
+    """Utility function to convert a tensor of sequence lengths to a mask (useful when working on padded sequences).
+    For example: [3, 5] => [[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]]
+
+    Args:
+        lengths (torch.Tensor): tensor with lengths
+        max_len (int): can set the max length manually. Defaults to None.
+    Returns:
+        torch.Tensor: mask with 0s where there are pad tokens, else 1s
+    """
+    assert len(lengths.shape) == 1, "Length shape should be 1 dimensional."
+    final_length = lengths.max().item() if not max_len else max_len
+    final_length = max(final_length, 1)  # if all seqs are of len zero we don't want a zero-size tensor
+    return torch.arange(final_length)[None, :].to(lengths.device) < lengths[:, None]
+
+
+def hash_trick(word: str, vocab_size: int) -> int:
+    """Hash trick to pair each word with an index
+
+    Args:
+        word (str): word we wish to convert to an index
+        vocab_size (int): size of the vocabulary
+    Returns:
+        int: index of the word in the embedding LUT
+    """
+    hash = int(hashlib.sha256(word.encode("utf-8")).hexdigest(), 16)
+    return hash % vocab_size
+
+
+def with_rank_rng(base_seed: int = 1234):
+    """Decorator for a function so that the function will use a Random Number Generator
+    whose state depends on the GPU rank. The original RNG state is restored upon returning.
+
+    Args:
+        base_seed (int): Random seed.
+    """
+    def _decorator(fun: tp.Callable):
+        @wraps(fun)
+        def _decorated(*args, **kwargs):
+            state = torch.get_rng_state()
+            seed = base_seed ^ flashy.distrib.rank()
+            torch.manual_seed(seed)
+            logger.debug('Rank dependent seed set to %d', seed)
+            try:
+                return fun(*args, **kwargs)
+            finally:
+                torch.set_rng_state(state)
+                logger.debug('RNG state restored.')
+        return _decorated
+    return _decorator
+
+
+def collate(tensors: tp.List[torch.Tensor], dim: int = 0) -> tp.Tuple[torch.Tensor, torch.Tensor]:
+    """Get a list of tensors and collate them to a single tensor. according to the following logic:
+    - `dim` specifies the time dimension which will be stacked and padded.
+    - The output will contain 1 new dimension (dimension index 0) which will be the size of
+    of the original list.
+
+    Args:
+        tensors (tp.List[torch.Tensor]): List of tensors to collate.
+        dim (int): Dimension which will be stacked and padded.
+    Returns:
+        tp.Tuple[torch.Tensor, torch.Tensor]:
+            torch.Tensor: Stacked and padded tensor. The output will contain 1 new dimension
+                (dimension index 0) which will be the size of the original list.
+            torch.Tensor: Tensor containing length of original tensor sizes (without padding).
+    """
+    tensors = [x.transpose(0, dim) for x in tensors]
+    lens = torch.LongTensor([len(x) for x in tensors])
+    padded_tensors = pad_sequence(tensors)
+    padded_tensors = padded_tensors.transpose(0, 1)
+    padded_tensors = padded_tensors.transpose(1, dim + 1)
+    return padded_tensors, lens
+
+
+
+
+
+
+
+

Functions

+
+
+def collate(tensors: List[torch.Tensor], dim: int = 0) ‑> Tuple[torch.Tensor, torch.Tensor] +
+
+

Get a list of tensors and collate them into a single tensor, according to the following logic: +- dim specifies the time dimension which will be stacked and padded. +- The output will contain 1 new dimension (dimension index 0) which will be the size +of the original list.

+

Args

+
+
tensors : tp.List[torch.Tensor]
+
List of tensors to collate.
+
dim : int
+
Dimension which will be stacked and padded.
+
+

Returns

+
+
tp.Tuple[torch.Tensor, torch.Tensor]:
+
+torch.Tensor
+
Stacked and padded tensor. The output will contain 1 new dimension +(dimension index 0) which will be the size of the original list. +torch.Tensor: Tensor containing length of original tensor sizes (without padding).
+
+
+ +Expand source code + +
def collate(tensors: tp.List[torch.Tensor], dim: int = 0) -> tp.Tuple[torch.Tensor, torch.Tensor]:
+    """Get a list of tensors and collate them to a single tensor. according to the following logic:
+    - `dim` specifies the time dimension which will be stacked and padded.
+    - The output will contain 1 new dimension (dimension index 0) which will be the size of
+    of the original list.
+
+    Args:
+        tensors (tp.List[torch.Tensor]): List of tensors to collate.
+        dim (int): Dimension which will be stacked and padded.
+    Returns:
+        tp.Tuple[torch.Tensor, torch.Tensor]:
+            torch.Tensor: Stacked and padded tensor. The output will contain 1 new dimension
+                (dimension index 0) which will be the size of the original list.
+            torch.Tensor: Tensor containing length of original tensor sizes (without padding).
+    """
+    tensors = [x.transpose(0, dim) for x in tensors]
+    lens = torch.LongTensor([len(x) for x in tensors])
+    padded_tensors = pad_sequence(tensors)
+    padded_tensors = padded_tensors.transpose(0, 1)
+    padded_tensors = padded_tensors.transpose(1, dim + 1)
+    return padded_tensors, lens
+
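A quick sketch of the padding behaviour on two variable-length 1D sequences:

import torch
from audiocraft.utils.utils import collate

a, b = torch.ones(3), torch.ones(5)
padded, lens = collate([a, b], dim=0)
print(padded.shape)  # torch.Size([2, 5]): new batch dimension in front, shorter entry zero-padded
print(lens)          # tensor([3, 5])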
+
+
+def dict_from_config(cfg: omegaconf.dictconfig.DictConfig) ‑> dict +
+
+

Convenience function to map an omegaconf configuration to a dictionary.

+

Args

+
+
cfg : omegaconf.DictConfig
+
Original configuration to map to dict.
+
+

Returns

+
+
dict
+
Config as dictionary object.
+
+
+ +Expand source code + +
def dict_from_config(cfg: omegaconf.DictConfig) -> dict:
+    """Convenience function to map an omegaconf configuration to a dictionary.
+
+    Args:
+        cfg (omegaconf.DictConfig): Original configuration to map to dict.
+    Returns:
+        dict: Config as dictionary object.
+    """
+    dct = omegaconf.OmegaConf.to_container(cfg, resolve=True)
+    assert isinstance(dct, dict)
+    return dct
+
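A tiny round-trip sketch (the config values are made up):

import omegaconf
from audiocraft.utils.utils import dict_from_config

cfg = omegaconf.OmegaConf.create({'sample_rate': 32000, 'channels': 2})
print(dict_from_config(cfg))  # {'sample_rate': 32000, 'channels': 2}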
+
+
+def get_dataset_from_loader(dataloader) +
+
+
+
+ +Expand source code + +
def get_dataset_from_loader(dataloader):
+    dataset = dataloader.dataset
+    if isinstance(dataset, torch.utils.data.Subset):
+        return dataset.dataset
+    else:
+        return dataset
+
+
+
+def get_loader(dataset, num_samples: Optional[int], batch_size: int, num_workers: int, seed: int, **kwargs) ‑> torch.utils.data.dataloader.DataLoader +
+
+

Convenience function to load a dataset into a dataloader with optional subset sampling.

+

Args

+
+
dataset
+
Dataset to load.
+
num_samples : Optional[int]
+
Number of samples to limit subset size.
+
batch_size : int
+
Batch size.
+
num_workers : int
+
Number of workers for data loading.
+
seed : int
+
Random seed.
+
+
+ +Expand source code + +
def get_loader(dataset, num_samples: tp.Optional[int], batch_size: int,
+               num_workers: int, seed: int, **kwargs) -> torch.utils.data.DataLoader:
+    """Convenience function to load dataset into a dataloader with optional subset sampling.
+
+    Args:
+        dataset: Dataset to load.
+        num_samples (Optional[int]): Number of samples to limit subset size.
+        batch_size (int): Batch size.
+        num_workers (int): Number of workers for data loading.
+        seed (int): Random seed.
+    """
+    if num_samples is not None:
+        dataset = random_subset(dataset, num_samples, seed)
+
+    dataloader = flashy.distrib.loader(
+        dataset,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        **kwargs
+    )
+    return dataloader
+
+
+
+def get_pool_executor(num_workers: int, mp_context=None) +
+
+
+
+ +Expand source code + +
def get_pool_executor(num_workers: int, mp_context=None):
+    return ProcessPoolExecutor(num_workers, mp_context) if num_workers > 1 else DummyPoolExecutor(1)
+
+
+
+def hash_trick(word: str, vocab_size: int) ‑> int +
+
+

Hash trick to pair each word with an index

+

Args

+
+
word : str
+
word we wish to convert to an index
+
vocab_size : int
+
size of the vocabulary
+
+

Returns

+
+
int
+
index of the word in the embedding LUT
+
+
+ +Expand source code + +
def hash_trick(word: str, vocab_size: int) -> int:
+    """Hash trick to pair each word with an index
+
+    Args:
+        word (str): word we wish to convert to an index
+        vocab_size (int): size of the vocabulary
+    Returns:
+        int: index of the word in the embedding LUT
+    """
+    hash = int(hashlib.sha256(word.encode("utf-8")).hexdigest(), 16)
+    return hash % vocab_size
+
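A short sketch; since the index is derived from SHA-256 rather than Python's salted built-in hash(), the mapping is stable across processes and runs.

from audiocraft.utils.utils import hash_trick

vocab_size = 1000
idx = hash_trick('guitar', vocab_size)
assert 0 <= idx < vocab_size  # always the same index for 'guitar' with this vocab_size
print(idx)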
+
+
+def length_to_mask(lengths: torch.Tensor, max_len: Optional[int] = None) ‑> torch.Tensor +
+
+

Utility function to convert a tensor of sequence lengths to a mask (useful when working on padded sequences). +For example: [3, 5] => [[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]]

+

Args

+
+
lengths : torch.Tensor
+
tensor with lengths
+
max_len : int
+
can set the max length manually. Defaults to None.
+
+

Returns

+
+
torch.Tensor
+
mask with 0s where there are pad tokens, else 1s
+
+
+ +Expand source code + +
def length_to_mask(lengths: torch.Tensor, max_len: tp.Optional[int] = None) -> torch.Tensor:
+    """Utility function to convert a tensor of sequence lengths to a mask (useful when working on padded sequences).
+    For example: [3, 5] => [[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]]
+
+    Args:
+        lengths (torch.Tensor): tensor with lengths
+        max_len (int): can set the max length manually. Defaults to None.
+    Returns:
+        torch.Tensor: mask with 0s where there are pad tokens, else 1s
+    """
+    assert len(lengths.shape) == 1, "Length shape should be 1 dimensional."
+    final_length = lengths.max().item() if not max_len else max_len
+    final_length = max(final_length, 1)  # if all seqs are of len zero we don't want a zero-size tensor
+    return torch.arange(final_length)[None, :].to(lengths.device) < lengths[:, None]
+
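A sketch reproducing the docstring example:

import torch
from audiocraft.utils.utils import length_to_mask

mask = length_to_mask(torch.tensor([3, 5]))
print(mask.long())
# tensor([[1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 1]])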
+
+
+def multinomial(input: torch.Tensor, num_samples: int, replacement=False, *, generator=None) +
+
+

torch.multinomial with arbitrary number of dimensions, and number of candidates on the last dimension.

+

Args

+
+
input : torch.Tensor
+
The input tensor containing probabilities.
+
num_samples : int
+
Number of samples to draw.
+
replacement : bool
+
Whether to draw with replacement or not.
+
+

Keyword args: +generator (torch.Generator): A pseudorandom number generator for sampling.

+

Returns

+
+
torch.Tensor
+
Last dimension contains num_samples indices +sampled from the multinomial probability distribution +located in the last dimension of tensor input.
+
+
+ +Expand source code + +
def multinomial(input: torch.Tensor, num_samples: int, replacement=False, *, generator=None):
+    """torch.multinomial with arbitrary number of dimensions, and number of candidates on the last dimension.
+
+    Args:
+        input (torch.Tensor): The input tensor containing probabilities.
+        num_samples (int): Number of samples to draw.
+        replacement (bool): Whether to draw with replacement or not.
+    Keyword args:
+        generator (torch.Generator): A pseudorandom number generator for sampling.
+    Returns:
+        torch.Tensor: Last dimension contains num_samples indices
+            sampled from the multinomial probability distribution
+            located in the last dimension of tensor input.
+    """
+    input_ = input.reshape(-1, input.shape[-1])
+    output_ = torch.multinomial(input_, num_samples=num_samples, replacement=replacement, generator=generator)
+    output = output_.reshape(*list(input.shape[:-1]), -1)
+    return output
+
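A sketch of why this wrapper exists: plain torch.multinomial only accepts 1D or 2D input, while this version samples along the last dimension of a tensor of any rank.

import torch
from audiocraft.utils.utils import multinomial

probs = torch.softmax(torch.randn(2, 3, 10), dim=-1)  # [B, H, V]
samples = multinomial(probs, num_samples=1)
print(samples.shape)  # torch.Size([2, 3, 1])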
+
+
+def random_subset(dataset, max_samples: int, seed: int = 42) ‑> torch.utils.data.dataset.Subset +
+
+
+
+ +Expand source code + +
def random_subset(dataset, max_samples: int, seed: int = 42) -> torch.utils.data.Subset:
+    if max_samples >= len(dataset):
+        return dataset
+
+    generator = torch.Generator().manual_seed(seed)
+    perm = torch.randperm(len(dataset), generator=generator)
+    return torch.utils.data.Subset(dataset, perm[:max_samples].tolist())
+
+
+
+def sample_top_k(probs: torch.Tensor, k: int) ‑> torch.Tensor +
+
+

Sample next token from top K values along the last dimension of the input probs tensor.

+

Args

+
+
probs : torch.Tensor
+
Input probabilities with token candidates on the last dimension.
+
k : int
+
The k in “top-k”.
+
+

Returns

+
+
torch.Tensor
+
Sampled tokens.
+
+
+ +Expand source code + +
def sample_top_k(probs: torch.Tensor, k: int) -> torch.Tensor:
+    """Sample next token from top K values along the last dimension of the input probs tensor.
+
+    Args:
+        probs (torch.Tensor): Input probabilities with token candidates on the last dimension.
+        k (int): The k in “top-k”.
+    Returns:
+        torch.Tensor: Sampled tokens.
+    """
+    top_k_value, _ = torch.topk(probs, k, dim=-1)
+    min_value_top_k = top_k_value[..., [-1]]
+    probs *= (probs >= min_value_top_k).float()
+    probs.div_(probs.sum(dim=-1, keepdim=True))
+    next_token = multinomial(probs, num_samples=1)
+    return next_token
+
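A usage sketch; note that the helper renormalizes probs in place, so pass a clone if the original distribution is still needed afterwards.

import torch
from audiocraft.utils.utils import sample_top_k

probs = torch.softmax(torch.randn(2, 100), dim=-1)
token = sample_top_k(probs.clone(), k=10)
print(token.shape)  # torch.Size([2, 1]): one sampled index per row, drawn from the top-10 mass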
+
+
+def sample_top_p(probs: torch.Tensor, p: float) ‑> torch.Tensor +
+
+

Sample next token from top P probabilities along the last dimension of the input probs tensor.

+

Args

+
+
probs : torch.Tensor
+
Input probabilities with token candidates on the last dimension.
+
p : float
+
The p in “top-p”.
+
+

Returns

+
+
torch.Tensor
+
Sampled tokens.
+
+
+ +Expand source code + +
def sample_top_p(probs: torch.Tensor, p: float) -> torch.Tensor:
+    """Sample next token from top P probabilities along the last dimension of the input probs tensor.
+
+    Args:
+        probs (torch.Tensor): Input probabilities with token candidates on the last dimension.
+        p (float): The p in “top-p”.
+    Returns:
+        torch.Tensor: Sampled tokens.
+    """
+    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
+    probs_sum = torch.cumsum(probs_sort, dim=-1)
+    mask = probs_sum - probs_sort > p
+    probs_sort *= (~mask).float()
+    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
+    next_token = multinomial(probs_sort, num_samples=1)
+    next_token = torch.gather(probs_idx, -1, next_token)
+    return next_token
+
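A matching sketch for nucleus sampling; the final gather maps the samples back to indices in the original (unsorted) distribution.

import torch
from audiocraft.utils.utils import sample_top_p

probs = torch.softmax(torch.randn(2, 100), dim=-1)
token = sample_top_p(probs, p=0.9)
print(token.shape)  # torch.Size([2, 1])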
+
+
+def with_rank_rng(base_seed: int = 1234) +
+
+

Decorator for a function so that the function will use a Random Number Generator +whose state depends on the GPU rank. The original RNG state is restored upon returning.

+

Args

+
+
base_seed : int
+
Random seed.
+
+
+ +Expand source code + +
def with_rank_rng(base_seed: int = 1234):
+    """Decorator for a function so that the function will use a Random Number Generator
+    whose state depends on the GPU rank. The original RNG state is restored upon returning.
+
+    Args:
+        base_seed (int): Random seed.
+    """
+    def _decorator(fun: tp.Callable):
+        @wraps(fun)
+        def _decorated(*args, **kwargs):
+            state = torch.get_rng_state()
+            seed = base_seed ^ flashy.distrib.rank()
+            torch.manual_seed(seed)
+            logger.debug('Rank dependent seed set to %d', seed)
+            try:
+                return fun(*args, **kwargs)
+            finally:
+                torch.set_rng_state(state)
+                logger.debug('RNG state restored.')
+        return _decorated
+    return _decorator
+
+
+
+
+
+

Classes

+
+
+class DummyPoolExecutor +(workers, mp_context=None) +
+
+

Dummy pool executor to use when we actually have only 1 worker. +(e.g. instead of ProcessPoolExecutor).

+
+ +Expand source code + +
class DummyPoolExecutor:
+    """Dummy pool executor to use when we actually have only 1 worker.
+    (e.g. instead of ProcessPoolExecutor).
+    """
+    class DummyResult:
+        def __init__(self, func, *args, **kwargs):
+            self.func = func
+            self.args = args
+            self.kwargs = kwargs
+
+        def result(self):
+            return self.func(*self.args, **self.kwargs)
+
+    def __init__(self, workers, mp_context=None):
+        pass
+
+    def submit(self, func, *args, **kwargs):
+        return DummyPoolExecutor.DummyResult(func, *args, **kwargs)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, exc_tb):
+        return
+
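A short sketch of the drop-in behaviour: the same submit()/result() call sites work whether get_pool_executor returns a real ProcessPoolExecutor (num_workers > 1, which also requires picklable functions and an if __name__ == '__main__' guard) or this in-process dummy.

from audiocraft.utils.utils import get_pool_executor

def square(x):
    return x * x

with get_pool_executor(num_workers=1) as pool:
    futures = [pool.submit(square, i) for i in range(4)]
    print([f.result() for f in futures])  # [0, 1, 4, 9]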
+

Class variables

+
+
var DummyResult
+
+
+
+
+

Methods

+
+
+def submit(self, func, *args, **kwargs) +
+
+
+
+ +Expand source code + +
def submit(self, func, *args, **kwargs):
+    return DummyPoolExecutor.DummyResult(func, *args, **kwargs)
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file