From c2583ab1e086f6c9d5c139c852a8459644162d4a Mon Sep 17 00:00:00 2001
From: Fangjun Kuang <csukuangfj@gmail.com>
Date: Fri, 13 May 2022 08:47:08 +0800
Subject: [PATCH 1/2] Validate that there are no OOV tokens in BPE-based
 lexicons.

---
 egs/librispeech/ASR/local/prepare_lang_bpe.py |  9 ++-
 .../ASR/local/validate_bpe_lexicon.py         | 77 +++++++++++++++++++
 egs/librispeech/ASR/prepare.sh                | 15 +++-
 3 files changed, 96 insertions(+), 5 deletions(-)
 create mode 100755 egs/librispeech/ASR/local/validate_bpe_lexicon.py

diff --git a/egs/librispeech/ASR/local/prepare_lang_bpe.py b/egs/librispeech/ASR/local/prepare_lang_bpe.py
index cf32f308df..dec8a74426 100755
--- a/egs/librispeech/ASR/local/prepare_lang_bpe.py
+++ b/egs/librispeech/ASR/local/prepare_lang_bpe.py
@@ -145,7 +145,14 @@ def generate_lexicon(
     sp = spm.SentencePieceProcessor()
     sp.load(str(model_file))
 
-    words_pieces: List[List[str]] = sp.encode(words, out_type=str)
+    # Encode words to word piece IDs rather than word piece strings, so that
+    # any piece missing from the BPE vocabulary is mapped to the unknown ID.
+    words_pieces_ids: List[List[int]] = sp.encode(words, out_type=int)
+
+    # Now convert word piece IDs back to word piece strings.
+    words_pieces: List[List[str]] = [
+        sp.id_to_piece(ids) for ids in words_pieces_ids
+    ]
 
     lexicon = []
     for word, pieces in zip(words, words_pieces):
diff --git a/egs/librispeech/ASR/local/validate_bpe_lexicon.py b/egs/librispeech/ASR/local/validate_bpe_lexicon.py
new file mode 100755
index 0000000000..6fe5d400f5
--- /dev/null
+++ b/egs/librispeech/ASR/local/validate_bpe_lexicon.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+# Copyright    2022  Xiaomi Corp.        (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This script checks that there are no OOV tokens in the BPE-based lexicon.
+
+Usage example:
+
+    python3 ./local/validate_lexicon.py \
+            --lexicon /path/to/lexicon.txt \
+            --bpe-model /path/to/bpe.model
+"""
+
+import argparse
+from pathlib import Path
+from typing import List, Tuple
+
+import sentencepiece as spm
+
+from icefall.lexicon import read_lexicon
+
+# A lexicon is a list of (word, word pieces) pairs
+Lexicon = List[Tuple[str, List[str]]]
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--lexicon",
+        required=True,
+        type=Path,
+        help="Path to lexicon.txt",
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        required=True,
+        type=Path,
+        help="Path to bpe.model",
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    assert args.lexicon.is_file(), args.lexicon
+    assert args.bpe_model.is_file(), args.bpe_model
+
+    lexicon = read_lexicon(args.lexicon)
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(str(args.bpe_model))
+
+    word_pieces = set(sp.id_to_piece(list(range(sp.vocab_size()))))
+    for word, pieces in lexicon:
+        for p in pieces:
+            if p not in word_pieces:
+                raise ValueError(f"The word {word} contains an OOV token {p}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/librispeech/ASR/prepare.sh b/egs/librispeech/ASR/prepare.sh
index 33d298a7a5..8cfb046c84 100755
--- a/egs/librispeech/ASR/prepare.sh
+++ b/egs/librispeech/ASR/prepare.sh
@@ -184,13 +184,20 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
       done > $lang_dir/transcript_words.txt
     fi
 
-    ./local/train_bpe_model.py \
-      --lang-dir $lang_dir \
-      --vocab-size $vocab_size \
-      --transcript $lang_dir/transcript_words.txt
+    if [ ! -f $lang_dir/bpe.model ]; then
+      ./local/train_bpe_model.py \
+        --lang-dir $lang_dir \
+        --vocab-size $vocab_size \
+        --transcript $lang_dir/transcript_words.txt
+    fi
 
     if [ ! -f $lang_dir/L_disambig.pt ]; then
       ./local/prepare_lang_bpe.py --lang-dir $lang_dir
+
+      log "Validating $lang_dir/lexicon.txt"
+      ./local/validate_bpe_lexicon.py \
+        --lexicon $lang_dir/lexicon.txt \
+        --bpe-model $lang_dir/bpe.model
     fi
   done
 fi

From e780dccb00d5a422e2d555f264dcc39596c6e341 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang <csukuangfj@gmail.com>
Date: Fri, 13 May 2022 13:58:05 +0800
Subject: [PATCH 2/2] Typo fixes.

---
 egs/librispeech/ASR/local/validate_bpe_lexicon.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/egs/librispeech/ASR/local/validate_bpe_lexicon.py b/egs/librispeech/ASR/local/validate_bpe_lexicon.py
index 6fe5d400f5..c542f2fabd 100755
--- a/egs/librispeech/ASR/local/validate_bpe_lexicon.py
+++ b/egs/librispeech/ASR/local/validate_bpe_lexicon.py
@@ -19,7 +19,7 @@
 
 Usage example:
 
-    python3 ./local/validate_lexicon.py \
+    python3 ./local/validate_bpe_lexicon.py \
             --lexicon /path/to/lexicon.txt \
             --bpe-model /path/to/bpe.model
 """