
Fix Whisper Conversion Script: Correct decoder_attention_heads and _download function #26834

Merged · 14 commits · Nov 7, 2023
79 changes: 79 additions & 0 deletions docs/source/en/model_doc/whisper.md
@@ -35,6 +35,85 @@ Tips:
This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ). The Tensorflow version of this model was contributed by [amyeroberts](https://huggingface.co/amyeroberts).
The original code can be found [here](https://github.com/openai/whisper).

## Inference

Here is a step-by-step guide to transcribing an audio sample using a pre-trained Whisper model:

```python
>>> import torchaudio
Contributor:
We should avoid using extra dependencies like torchaudio in our code snippets - could you maybe refactor this to only use dependencies in the Transformers library?

Contributor Author:
Changed to use datasets.load_dataset.
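>>> # (Review follow-up - a sketch, not necessarily the exact final snippet.) Loading an
>>> # audio sample with datasets instead of torchaudio, using the public LibriSpeech
>>> # dummy dataset on the Hub:
>>> from datasets import load_dataset

>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> sample = ds[0]["audio"]
>>> waveform, sampling_rate = sample["array"], sample["sampling_rate"]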

>>> from transformers import WhisperProcessor, WhisperForConditionalGeneration

>>> # Select an audio file:
>>> audio_path = "https://huggingface.co/datasets/sanchit-gandhi/librispeech_long/resolve/main/audio.wav"
Contributor:
Let's use a shorter audio file - this will take a long time to transcribe


>>> # Load the Whisper model in Hugging Face format:
>>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
>>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
>>> model.config.forced_decoder_ids = None
Contributor:
No need - this API is deprecated now (cc @ArthurZucker)

Suggested change (remove this line):
>>> model.config.forced_decoder_ids = None

Contributor Author:
I didn't know. Removed.
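>>> # (Sketch of the currently recommended alternative.) For multilingual checkpoints,
>>> # the language and task can be passed to generate() directly instead of setting
>>> # forced_decoder_ids, e.g.:
>>> #     predicted_ids = model.generate(input_features, language="english", task="transcribe")
>>> # The English-only tiny.en checkpoint needs neither.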


>>> # Load the audio file:
>>> waveform, sampling_rate = torchaudio.load(audio_path)

>>> # Use the model and processor to transcribe the audio:
>>> input_features = processor(
... waveform.squeeze().numpy(), sampling_rate=sampling_rate, return_tensors="pt"
... ).input_features

>>> # Generate token ids
>>> predicted_ids = model.generate(input_features)

>>> # Decode token ids to text
>>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

>>> transcription[0]
' Chapter 16.'
```
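
As a shorter alternative to the step-by-step snippet above, the same checkpoint can be driven through the `pipeline` API. This is an illustrative sketch (not part of the original doc addition) that reuses the `audio_path` URL defined earlier:

```python
>>> from transformers import pipeline

>>> # Wraps processor, model, and decoding into a single automatic-speech-recognition pipeline.
>>> transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-tiny.en")
>>> # Accepts a local path or URL to an audio file and returns a dict with the transcribed text.
>>> transcription = transcriber(audio_path)["text"]
```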

## Format Conversion

For users with models in the original OpenAI format who wish to utilize them with the Hugging Face library, a conversion script is provided. The example below demonstrates how to transform Whisper models from OpenAI to Hugging Face format:

```bash
# Change to the whisper directory where the script resides:
cd src/transformers/models/whisper/
# Converts the model from OpenAI to Hugging Face format:
./convert_openai_to_hf.py \
--checkpoint_path tiny \
--pytorch_dump_folder_path whisper-tiny-hf
```

For those more comfortable working directly in Python, the conversion can also be achieved with the code snippet below:

```python
>>> from transformers.models.whisper.convert_openai_to_hf import convert_openai_whisper_to_tfms
>>> convert_openai_whisper_to_tfms("tiny.en", "whisper-tiny.en-hf") # doctest: +IGNORE_RESULT
```

Now we can test the converted model by running inference on an audio file:

```python
>>> # Load the newly converted model:
>>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
>>> model = WhisperForConditionalGeneration.from_pretrained("whisper-tiny.en-hf")
>>> model.config.forced_decoder_ids = None

>>> # Load the audio file:
>>> waveform, sampling_rate = torchaudio.load(audio_path)

>>> # Use the model and processor to transcribe the audio:
>>> input_features = processor(
... waveform.squeeze().numpy(), sampling_rate=sampling_rate, return_tensors="pt"
... ).input_features

>>> # Transcribe the example:
>>> predicted_ids = model.generate(input_features)
>>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

>>> transcription[0]
' Chapter 16. I might have told you of the beginning of this liaison in a few lines'
Contributor:
We get a different result here to the one we got before (Chapter 16 only)? Is one of them incorrect?

Contributor Author:
I do not know why, but the transcriptions in HF and OpenAI do not match using this audio. To further explore this, I’ve created a Kaggle notebook and transcribed the audio here using Tiny.en from both libraries. This also includes what happens when we convert between formats using the script here (convert_openai_to_hf.py) and the one in #26854 (convert_hf_to_openai.py).

Here are the transcription results:

| Model | Transcription[:30] | SHA256 Prefix |
| --- | --- | --- |
| OpenAI | `" Chapter 16 I might have told "` | 80e1a202 |
| Hugging Face | `" Chapter 16."` | c9f30e2e |
| OpenAI -> HF | `" Chapter 16. I might have told"` | 090c43de |
| HF -> OpenAI | `" Chapter 16 I might have told "` | 80e1a202 |

From the results, the transcriptions produced with the OpenAI library are consistent, even after converting from the HF model, which suggests the conversion is accurate. However, the transcriptions produced with the HF library differ, both with your model at https://huggingface.co/openai/whisper-tiny.en and after converting the original tiny.en from OpenAI.

I do not know the reason for this; maybe it is some post-processing step. I have also not tested other model sizes or audio files.

Should I open a new ticket for this?

```
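
For reference, a minimal sketch of how such a side-by-side comparison could be reproduced (this is not the author's actual Kaggle notebook; it assumes the `openai-whisper` package is installed and uses a placeholder `audio.wav` path):

```python
import hashlib

import whisper  # pip install openai-whisper
from transformers import WhisperForConditionalGeneration, WhisperProcessor


def sha_prefix(text: str) -> str:
    """First 8 hex characters of the SHA-256 digest of a transcription."""
    return hashlib.sha256(text.encode("utf-8")).hexdigest()[:8]


# Transcription with the OpenAI implementation:
oai_text = whisper.load_model("tiny.en").transcribe("audio.wav")["text"]

# Transcription with the Hugging Face implementation (official Hub checkpoint):
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
audio = whisper.load_audio("audio.wav")  # 16 kHz float32 waveform
inputs = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
hf_text = processor.batch_decode(model.generate(inputs), skip_special_tokens=True)[0]

print(sha_prefix(oai_text), sha_prefix(hf_text))
```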

This step is not usually required if we are using the models already [provided by OpenAI in the Hugging Face Hub](https://huggingface.co/openai).
Contributor:
Let's just promote this directly instead and remove the notes on the conversion script: there is no need for users to have to use the conversion script since we've already converted all official Whisper checkpoints on the Hub https://huggingface.co/collections/openai/whisper-release-6501bba2cf999715fd953013

Contributor Author:
OK! Conversion example completely removed.


## WhisperConfig

20 changes: 10 additions & 10 deletions src/transformers/models/whisper/configuration_whisper.py
@@ -77,13 +77,13 @@ class WhisperConfig(PretrainedConfig):
num_mel_bins (`int`, *optional*, defaults to 80):
Number of mel features used per input features. Should correspond to the value used in the
`WhisperProcessor` class.
encoder_layers (`int`, *optional*, defaults to 6):
encoder_layers (`int`, *optional*, defaults to 4):
Number of encoder layers.
decoder_layers (`int`, *optional*, defaults to 6):
decoder_layers (`int`, *optional*, defaults to 4):
Number of decoder layers.
encoder_attention_heads (`int`, *optional*, defaults to 4):
encoder_attention_heads (`int`, *optional*, defaults to 6):
Number of attention heads for each attention layer in the Transformer encoder.
decoder_attention_heads (`int`, *optional*, defaults to 4):
decoder_attention_heads (`int`, *optional*, defaults to 6):
Number of attention heads for each attention layer in the Transformer decoder.
encoder_ffn_dim (`int`, *optional*, defaults to 1536):
Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
@@ -106,7 +106,7 @@ class WhisperConfig(PretrainedConfig):
activation_function (`str`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"silu"` and `"gelu_new"` are supported.
d_model (`int`, *optional*, defaults to 256):
d_model (`int`, *optional*, defaults to 384):
Dimensionality of the layers.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
@@ -197,10 +197,10 @@ def __init__(
self,
vocab_size=51865,
num_mel_bins=80,
encoder_layers=6,
encoder_attention_heads=4,
decoder_layers=6,
decoder_attention_heads=4,
encoder_layers=4,
encoder_attention_heads=6,
decoder_layers=4,
decoder_attention_heads=6,
decoder_ffn_dim=1536,
encoder_ffn_dim=1536,
encoder_layerdrop=0.0,
@@ -209,7 +209,7 @@
use_cache=True,
is_encoder_decoder=True,
activation_function="gelu",
d_model=256,
d_model=384,
dropout=0.0,
attention_dropout=0.0,
activation_dropout=0.0,
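
As a quick sanity check of the corrected defaults, instantiating the config without arguments should now describe the Whisper tiny architecture (a sketch; the expected values follow directly from the diff above):

```python
>>> from transformers import WhisperConfig

>>> # Defaults after this change: 4 encoder/decoder layers, 6 attention heads, d_model=384,
>>> # i.e. the dimensions of the Whisper "tiny" checkpoint.
>>> config = WhisperConfig()
>>> config.encoder_layers, config.decoder_layers
(4, 4)
>>> config.encoder_attention_heads, config.decoder_attention_heads
(6, 6)
>>> config.d_model
384
```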
14 changes: 9 additions & 5 deletions src/transformers/models/whisper/convert_openai_to_hf.py
File mode changed from 100644 to 100755.
@@ -1,3 +1,5 @@
#!/usr/bin/env python
"""Converts a Whisper model in OpenAI format to Hugging Face format."""
# Copyright 2022 The HuggingFace Inc. team and the OpenAI team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,6 +16,7 @@

import argparse
import hashlib
import io
import os
import urllib
import warnings
@@ -90,7 +93,7 @@ def make_linear_from_emb(emb):
return lin_layer


def _download(url: str, root: str) -> bytes:
def _download(url: str, root: str) -> io.BytesIO:
os.makedirs(root, exist_ok=True)
filename = os.path.basename(url)

@@ -103,7 +106,7 @@ def _download(url: str, root: str) -> bytes:
if os.path.isfile(download_target):
model_bytes = open(download_target, "rb").read()
if hashlib.sha256(model_bytes).hexdigest() == expected_sha256:
return model_bytes
return torch.load(io.BytesIO(model_bytes))
else:
warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")

@@ -125,12 +128,13 @@ def _download(url: str, root: str) -> bytes:
"Model has been downloaded but the SHA256 checksum does not not match. Please retry loading the model."
)

return model_bytes
return torch.load(io.BytesIO(model_bytes))


def convert_openai_whisper_to_tfms(checkpoint_path, pytorch_dump_folder_path):
if ".pt" not in checkpoint_path:
original_checkpoint = _download(_MODELS[checkpoint_path])
root = os.path.dirname(pytorch_dump_folder_path) or "."
original_checkpoint = _download(_MODELS[checkpoint_path], root)
else:
original_checkpoint = torch.load(checkpoint_path, map_location="cpu")
dimensions = original_checkpoint["dims"]
@@ -151,7 +155,7 @@ def convert_openai_whisper_to_tfms(checkpoint_path, pytorch_dump_folder_path):
encoder_layers=dimensions["n_audio_layer"],
encoder_attention_heads=dimensions["n_audio_head"],
decoder_layers=dimensions["n_text_layer"],
decoder_attention_heads=dimensions["n_text_state"],
Contributor:
Surprised we managed to convert the original checkpoints with this bug @ArthurZucker 🤔 The state dicts surely won't have matched? Maybe we hardcoded this before?

Collaborator:
Yeah, but I think I hardcoded the values when converting and then later on made it automatic. I checked by actually re-running the script and confirming that this was indeed a typo 🤣 but it's a good sign that no one else tried to convert the checkpoints!

Contributor Author (@zuazo, Oct 25, 2023):
Not sure about the history behind this, but looking at the blame, the script was correct at one point:

decoder_attention_heads=dimensions["n_text_head"],

Then it was deleted and recovered in: #20600

That's where the problem seems to come from. So the original script you used may have worked properly.

Collaborator:
Nice digging! Yep, I think I uploaded an old version that was a few commits behind.

decoder_attention_heads=dimensions["n_text_head"],
max_source_positions=dimensions["n_audio_ctx"],
)
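
To make the impact of this one-word fix concrete, here is a sketch using the well-known dimensions of the tiny checkpoint (these values are illustrative; they are not printed anywhere in this diff):

```python
# Dimensions stored in an OpenAI "tiny" checkpoint (for illustration only):
dims = {
    "n_audio_layer": 4, "n_audio_head": 6,  # encoder layers / attention heads
    "n_text_layer": 4, "n_text_head": 6,    # decoder layers / attention heads
    "n_text_state": 384,                    # hidden size, mapped to d_model
    "n_audio_ctx": 1500,                    # mapped to max_source_positions
}

# Buggy line:  decoder_attention_heads=dims["n_text_state"]  -> 384 heads of size 1
# Fixed line:  decoder_attention_heads=dims["n_text_head"]   -> 6 heads of size 64
buggy_head_dim = dims["n_text_state"] // dims["n_text_state"]  # 1
fixed_head_dim = dims["n_text_state"] // dims["n_text_head"]   # 64
print(buggy_head_dim, fixed_head_dim)

# The q/k/v projections are d_model x d_model either way, so the state dict still
# loaded with the typo - the decoder just computed attention with the wrong head
# grouping, which is why the bug was not caught by a shape mismatch.
```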
