Upgrade pyo3 to 0.16 #1

Merged: 1 commit, Mar 21, 2022
613 changes: 288 additions & 325 deletions bindings/python/Cargo.lock

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions bindings/python/Cargo.toml
@@ -13,12 +13,12 @@
 rayon = "1.3"
 serde = { version = "1.0", features = [ "rc", "derive" ]}
 serde_json = "1.0"
 libc = "0.2"
-env_logger = "0.7.1"
-pyo3 = "0.15.0"
-numpy = "0.15.0"
-ndarray = "0.13"
+env_logger = "0.9.0"
+pyo3 = "0.16.2"
+numpy = "0.16.2"
+ndarray = "0.15"
 onig = { version = "6.0", default-features = false }
-itertools = "0.9"
+itertools = "0.10"

 [dependencies.tokenizers]
 version = "*"
2 changes: 1 addition & 1 deletion bindings/python/src/decoders.rs
@@ -21,7 +21,7 @@ use super::error::ToPyResult;
 ///
 /// This class is not supposed to be instantiated directly. Instead, any implementation of
 /// a Decoder will return an instance of this class when instantiated.
-#[pyclass(dict, module = "tokenizers.decoders", name = "Decoder")]
+#[pyclass(dict, module = "tokenizers.decoders", name = "Decoder", subclass)]
 #[derive(Clone, Deserialize, Serialize)]
 pub struct PyDecoder {
     #[serde(flatten)]
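
The `subclass` flag added here (and to PyModel, PyNormalizer, PyPreTokenizer, PyPostProcessor, and PyTrainer below) is what keeps `#[pyclass(extends = ...)]` and Python-side subclassing working after the upgrade: pyo3 0.16 requires a base `#[pyclass]` to opt in explicitly. A minimal sketch of the pattern, using hypothetical `Base`/`Derived` names rather than anything from this PR:

    use pyo3::prelude::*;

    // `subclass` opts the base class into being extended; without it,
    // pyo3 0.16 rejects both `extends = Base` and Python-level subclassing.
    #[pyclass(subclass)]
    struct Base {}

    #[pyclass(extends = Base)]
    struct Derived {}

    #[pymethods]
    impl Derived {
        #[new]
        fn new() -> (Derived, Base) {
            // A subclass constructor returns (child, parent) so pyo3 can
            // lay out both halves of the Python object.
            (Derived {}, Base {})
        }
    }
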
4 changes: 3 additions & 1 deletion bindings/python/src/encoding.rs
@@ -393,7 +393,9 @@ impl PyEncoding {
     /// pad_token (:obj:`str`, defaults to `[PAD]`):
     ///     The pad token to use
     #[args(kwargs = "**")]
-    #[pyo3(text_signature = "(self, length, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]')")]
+    #[pyo3(
+        text_signature = "(self, length, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]')"
+    )]
     fn pad(&mut self, length: usize, kwargs: Option<&PyDict>) -> PyResult<()> {
         let mut pad_id = 0;
         let mut pad_type_id = 0;
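
For context, `#[args(kwargs = "**")]` is how pyo3 collects arbitrary keyword arguments into an `Option<&PyDict>`, which `pad` then reads key by key. A standalone sketch of that pattern under pyo3 0.16 (the function and key names are illustrative, not the PR's code):

    use pyo3::prelude::*;
    use pyo3::types::PyDict;

    // Hypothetical free function mirroring how `pad` consumes **kwargs:
    // each option arrives through the PyDict and falls back to a default.
    #[pyfunction(kwargs = "**")]
    fn pad_demo(length: usize, kwargs: Option<&PyDict>) -> PyResult<String> {
        let mut pad_token = String::from("[PAD]");
        if let Some(kwargs) = kwargs {
            if let Some(value) = kwargs.get_item("pad_token") {
                pad_token = value.extract()?;
            }
        }
        Ok(format!("padding to {} with {}", length, pad_token))
    }
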
2 changes: 1 addition & 1 deletion bindings/python/src/lib.rs
@@ -126,7 +126,7 @@ fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
 /// Tokenizers Module
 #[pymodule]
 fn tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
-    env_logger::init_from_env("TOKENIZERS_LOG");
+    let _ = env_logger::try_init_from_env("TOKENIZERS_LOG");

     // Register the fork callback
     #[cfg(target_family = "unix")]
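
The logging change is a robustness fix: `env_logger::init_from_env` panics if a global logger is already installed (as can happen when the host Python process sets up logging first), while `try_init_from_env` returns a `Result`, and the `let _ =` discards the already-initialized error. A sketch of the same idea with the error surfaced instead of discarded:

    // Initialize logging from the TOKENIZERS_LOG env var without
    // aborting when a logger already exists in the process.
    fn init_logging() {
        if let Err(err) = env_logger::try_init_from_env("TOKENIZERS_LOG") {
            // Another logger won the race; keep it rather than panicking.
            eprintln!("tokenizers: logger already initialized: {}", err);
        }
    }
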
6 changes: 4 additions & 2 deletions bindings/python/src/models.rs
@@ -24,7 +24,7 @@ use super::error::{deprecation_warning, ToPyResult};
 /// will contain and manage the learned vocabulary.
 ///
 /// This class cannot be constructed directly. Please use one of the concrete models.
-#[pyclass(module = "tokenizers.models", name = "Model")]
+#[pyclass(module = "tokenizers.models", name = "Model", subclass)]
 #[derive(Clone, Serialize, Deserialize)]
 pub struct PyModel {
     #[serde(flatten)]
@@ -249,7 +249,9 @@ impl PyModel {
 /// fuse_unk (:obj:`bool`, `optional`):
 ///     Whether to fuse any subsequent unknown tokens into a single one
 #[pyclass(extends=PyModel, module = "tokenizers.models", name = "BPE")]
-#[pyo3(text_signature = "(self, vocab=None, merges=None, cache_capacity=None, dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=None)")]
+#[pyo3(
+    text_signature = "(self, vocab=None, merges=None, cache_capacity=None, dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=None)"
+)]
 pub struct PyBPE {}

 impl PyBPE {
6 changes: 4 additions & 2 deletions bindings/python/src/normalizers.rs
@@ -43,7 +43,7 @@ impl PyNormalizedStringMut<'_> {
 ///
 /// This class is not supposed to be instantiated directly. Instead, any implementation of a
 /// Normalizer will return an instance of this class when instantiated.
-#[pyclass(dict, module = "tokenizers.normalizers", name = "Normalizer")]
+#[pyclass(dict, module = "tokenizers.normalizers", name = "Normalizer", subclass)]
 #[derive(Clone, Serialize, Deserialize)]
 pub struct PyNormalizer {
     #[serde(flatten)]
@@ -218,7 +218,9 @@ macro_rules! setter {
 /// lowercase (:obj:`bool`, `optional`, defaults to :obj:`True`):
 ///     Whether to lowercase.
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "BertNormalizer")]
-#[pyo3(text_signature = "(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)")]
+#[pyo3(
+    text_signature = "(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)"
+)]
 pub struct PyBertNormalizer {}
 #[pymethods]
 impl PyBertNormalizer {
9 changes: 7 additions & 2 deletions bindings/python/src/pre_tokenizers.rs
@@ -28,7 +28,12 @@ use super::utils::*;
 ///
 /// This class is not supposed to be instantiated directly. Instead, any implementation of a
 /// PreTokenizer will return an instance of this class when instantiated.
-#[pyclass(dict, module = "tokenizers.pre_tokenizers", name = "PreTokenizer")]
+#[pyclass(
+    dict,
+    module = "tokenizers.pre_tokenizers",
+    name = "PreTokenizer",
+    subclass
+)]
 #[derive(Clone, Serialize, Deserialize)]
 pub struct PyPreTokenizer {
     #[serde(flatten)]
@@ -228,7 +233,7 @@ macro_rules! setter {
 /// add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
 ///     Whether to add a space to the first word if there isn't already one. This
 ///     lets us treat `hello` exactly like `say hello`.
-#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=ByteLevel)]
+#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "ByteLevel")]
 #[pyo3(text_signature = "(self, add_prefix_space=True, use_regex=True)")]
 pub struct PyByteLevel {}
 #[pymethods]
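
The `name=ByteLevel` → `name = "ByteLevel"` change is the one spot where the old bare-identifier form survived; the 0.16 upgrade forces the string-literal form for Python-visible names. A sketch with a hypothetical class:

    use pyo3::prelude::*;

    // pyo3 0.16 wants a string literal here, which also permits Python
    // names that are not valid Rust identifiers.
    #[pyclass(name = "ByteLevelDemo")]
    struct PyByteLevelDemo {}
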
7 changes: 6 additions & 1 deletion bindings/python/src/processors.rs
@@ -20,7 +20,12 @@ use tokenizers as tk;
 ///
 /// This class is not supposed to be instantiated directly. Instead, any implementation of
 /// a PostProcessor will return an instance of this class when instantiated.
-#[pyclass(dict, module = "tokenizers.processors", name = "PostProcessor")]
+#[pyclass(
+    dict,
+    module = "tokenizers.processors",
+    name = "PostProcessor",
+    subclass
+)]
 #[derive(Clone, Deserialize, Serialize)]
 pub struct PyPostProcessor {
     #[serde(flatten)]
16 changes: 12 additions & 4 deletions bindings/python/src/tokenizer.rs
@@ -56,7 +56,9 @@ use crate::utils::{MaybeSizedIterator, PyBufferedIterator};
 ///     Yesterday"``.
 ///
 #[pyclass(dict, module = "tokenizers", name = "AddedToken")]
-#[pyo3(text_signature = "(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True)")]
+#[pyo3(
+    text_signature = "(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True)"
+)]
 pub struct PyAddedToken {
     pub content: String,
     pub is_special_token: bool,
@@ -665,7 +667,9 @@ impl PyTokenizer {
     /// direction (:obj:`str`, defaults to :obj:`right`):
     ///     Truncate direction
     #[args(kwargs = "**")]
-    #[pyo3(text_signature = "(self, max_length, stride=0, strategy='longest_first', direction='right')")]
+    #[pyo3(
+        text_signature = "(self, max_length, stride=0, strategy='longest_first', direction='right')"
+    )]
     fn enable_truncation(&mut self, max_length: usize, kwargs: Option<&PyDict>) -> PyResult<()> {
         let mut params = TruncationParams {
             max_length,
@@ -765,7 +769,9 @@ impl PyTokenizer {
     ///     If specified, the length at which to pad. If not specified we pad using the size of
     ///     the longest sequence in a batch.
     #[args(kwargs = "**")]
-    #[pyo3(text_signature = "(self, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]', length=None, pad_to_multiple_of=None)")]
+    #[pyo3(
+        text_signature = "(self, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]', length=None, pad_to_multiple_of=None)"
+    )]
     fn enable_padding(&mut self, kwargs: Option<&PyDict>) -> PyResult<()> {
         let mut params = PaddingParams::default();

@@ -892,7 +898,9 @@ impl PyTokenizer {
     ///     :class:`~tokenizers.Encoding`: The encoded result
     ///
     #[args(pair = "None", is_pretokenized = "false", add_special_tokens = "true")]
-    #[pyo3(text_signature = "(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True)")]
+    #[pyo3(
+        text_signature = "(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True)"
+    )]
     fn encode(
         &self,
         sequence: &PyAny,
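
The repeated `#[pyo3(text_signature = ...)]` reflows in this file are rustfmt splitting long attributes, not behavior changes. The attribute itself populates `__text_signature__`, which Python's `inspect.signature` and `help()` use to describe native methods. A minimal sketch on a hypothetical standalone function:

    use pyo3::prelude::*;

    // The string becomes encode_demo.__text_signature__ in Python, so
    // help(encode_demo) shows "(sequence, pair=None)" instead of "(...)".
    #[pyfunction]
    #[pyo3(text_signature = "(sequence, pair=None)")]
    fn encode_demo(sequence: String, pair: Option<String>) -> String {
        match pair {
            Some(p) => format!("{} {}", sequence, p),
            None => sequence,
        }
    }
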
10 changes: 7 additions & 3 deletions bindings/python/src/trainers.rs
@@ -15,7 +15,7 @@ use tokenizers as tk;
 ///
 /// This class is not supposed to be instantiated directly. Instead, any implementation of a
 /// Trainer will return an instance of this class when instantiated.
-#[pyclass(module = "tokenizers.trainers", name = "Trainer")]
+#[pyclass(module = "tokenizers.trainers", name = "Trainer", subclass)]
 #[derive(Clone, Deserialize, Serialize)]
 pub struct PyTrainer {
     #[serde(flatten)]
@@ -368,7 +368,9 @@ impl PyBpeTrainer {
 /// end_of_word_suffix (:obj:`str`, `optional`):
 ///     A suffix to be used for every subword that is a end-of-word.
 #[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "WordPieceTrainer")]
-#[pyo3(text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)")]
+#[pyo3(
+    text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)"
+)]
 pub struct PyWordPieceTrainer {}
 #[pymethods]
 impl PyWordPieceTrainer {
@@ -714,7 +716,9 @@ impl PyWordLevelTrainer {
 ///     The number of iterations of the EM algorithm to perform before
 ///     pruning the vocabulary.
 #[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "UnigramTrainer")]
-#[pyo3(text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)")]
+#[pyo3(
+    text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)"
+)]
 pub struct PyUnigramTrainer {}
 #[pymethods]
 impl PyUnigramTrainer {