Upgrade pyo3 to 0.16 #1

Merged: 1 commit, Mar 21, 2022
613 changes: 288 additions & 325 deletions bindings/python/Cargo.lock

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions bindings/python/Cargo.toml
@@ -13,12 +13,12 @@
 rayon = "1.3"
 serde = { version = "1.0", features = [ "rc", "derive" ]}
 serde_json = "1.0"
 libc = "0.2"
-env_logger = "0.7.1"
-pyo3 = "0.15.0"
-numpy = "0.15.0"
-ndarray = "0.13"
+env_logger = "0.9.0"
+pyo3 = "0.16.2"
+numpy = "0.16.2"
+ndarray = "0.15"
 onig = { version = "6.0", default-features = false }
-itertools = "0.9"
+itertools = "0.10"

 [dependencies.tokenizers]
 version = "*"
2 changes: 1 addition & 1 deletion bindings/python/src/decoders.rs
@@ -21,7 +21,7 @@ use super::error::ToPyResult;
 ///
 /// This class is not supposed to be instantiated directly. Instead, any implementation of
 /// a Decoder will return an instance of this class when instantiated.
-#[pyclass(dict, module = "tokenizers.decoders", name = "Decoder")]
+#[pyclass(dict, module = "tokenizers.decoders", name = "Decoder", subclass)]
 #[derive(Clone, Deserialize, Serialize)]
 pub struct PyDecoder {
     #[serde(flatten)]
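
The `subclass` flag added here (and to PyModel, PyNormalizer, PyPreTokenizer, PyPostProcessor, and PyTrainer below) is what keeps `#[pyclass(extends = ...)]` and Python-side subclassing working after the upgrade: pyo3 0.16 requires a base `#[pyclass]` to opt in explicitly. A minimal sketch of the pattern, using hypothetical `Base`/`Derived` names rather than anything from this PR:

    use pyo3::prelude::*;

    // `subclass` opts the base class into being extended; without it,
    // pyo3 0.16 rejects both `extends = Base` and Python-level subclassing.
    #[pyclass(subclass)]
    struct Base {}

    #[pyclass(extends = Base)]
    struct Derived {}

    #[pymethods]
    impl Derived {
        #[new]
        fn new() -> (Derived, Base) {
            // A subclass constructor returns (child, parent) so pyo3 can
            // lay out both halves of the Python object.
            (Derived {}, Base {})
        }
    }
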
4 changes: 3 additions & 1 deletion bindings/python/src/encoding.rs
@@ -393,7 +393,9 @@ impl PyEncoding {
     /// pad_token (:obj:`str`, defaults to `[PAD]`):
     ///     The pad token to use
     #[args(kwargs = "**")]
-    #[pyo3(text_signature = "(self, length, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]')")]
+    #[pyo3(
+        text_signature = "(self, length, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]')"
+    )]
     fn pad(&mut self, length: usize, kwargs: Option<&PyDict>) -> PyResult<()> {
         let mut pad_id = 0;
         let mut pad_type_id = 0;
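
For context, `#[args(kwargs = "**")]` is how pyo3 collects arbitrary keyword arguments into an `Option<&PyDict>`, which `pad` then reads key by key. A standalone sketch of that pattern under pyo3 0.16 (the function and key names are illustrative, not the PR's code):

    use pyo3::prelude::*;
    use pyo3::types::PyDict;

    // Hypothetical free function mirroring how `pad` consumes **kwargs:
    // each option arrives through the PyDict and falls back to a default.
    #[pyfunction(kwargs = "**")]
    fn pad_demo(length: usize, kwargs: Option<&PyDict>) -> PyResult<String> {
        let mut pad_token = String::from("[PAD]");
        if let Some(kwargs) = kwargs {
            if let Some(value) = kwargs.get_item("pad_token") {
                pad_token = value.extract()?;
            }
        }
        Ok(format!("padding to {} with {}", length, pad_token))
    }
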
2 changes: 1 addition & 1 deletion bindings/python/src/lib.rs
@@ -126,7 +126,7 @@ fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
 /// Tokenizers Module
 #[pymodule]
 fn tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
-    env_logger::init_from_env("TOKENIZERS_LOG");
+    let _ = env_logger::try_init_from_env("TOKENIZERS_LOG");

     // Register the fork callback
     #[cfg(target_family = "unix")]
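
The logging change is a robustness fix: `env_logger::init_from_env` panics if a global logger is already installed (as can happen when the host Python process sets up logging first), while `try_init_from_env` returns a `Result`, and the `let _ =` discards the already-initialized error. A sketch of the same idea with the error surfaced instead of discarded:

    // Initialize logging from the TOKENIZERS_LOG env var without
    // aborting when a logger already exists in the process.
    fn init_logging() {
        if let Err(err) = env_logger::try_init_from_env("TOKENIZERS_LOG") {
            // Another logger won the race; keep it rather than panicking.
            eprintln!("tokenizers: logger already initialized: {}", err);
        }
    }
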
6 changes: 4 additions & 2 deletions bindings/python/src/models.rs
@@ -24,7 +24,7 @@ use super::error::{deprecation_warning, ToPyResult};
 /// will contain and manage the learned vocabulary.
 ///
 /// This class cannot be constructed directly. Please use one of the concrete models.
-#[pyclass(module = "tokenizers.models", name = "Model")]
+#[pyclass(module = "tokenizers.models", name = "Model", subclass)]
 #[derive(Clone, Serialize, Deserialize)]
 pub struct PyModel {
     #[serde(flatten)]
@@ -249,7 +249,9 @@ impl PyModel {
 /// fuse_unk (:obj:`bool`, `optional`):
 ///     Whether to fuse any subsequent unknown tokens into a single one
 #[pyclass(extends=PyModel, module = "tokenizers.models", name = "BPE")]
-#[pyo3(text_signature = "(self, vocab=None, merges=None, cache_capacity=None, dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=None)")]
+#[pyo3(
+    text_signature = "(self, vocab=None, merges=None, cache_capacity=None, dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=None)"
+)]
 pub struct PyBPE {}

 impl PyBPE {
6 changes: 4 additions & 2 deletions bindings/python/src/normalizers.rs
@@ -43,7 +43,7 @@ impl PyNormalizedStringMut<'_> {
 ///
 /// This class is not supposed to be instantiated directly. Instead, any implementation of a
 /// Normalizer will return an instance of this class when instantiated.
-#[pyclass(dict, module = "tokenizers.normalizers", name = "Normalizer")]
+#[pyclass(dict, module = "tokenizers.normalizers", name = "Normalizer", subclass)]
 #[derive(Clone, Serialize, Deserialize)]
 pub struct PyNormalizer {
     #[serde(flatten)]
@@ -218,7 +218,9 @@ macro_rules! setter {
 /// lowercase (:obj:`bool`, `optional`, defaults to :obj:`True`):
 ///     Whether to lowercase.
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "BertNormalizer")]
-#[pyo3(text_signature = "(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)")]
+#[pyo3(
+    text_signature = "(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)"
+)]
 pub struct PyBertNormalizer {}
 #[pymethods]
 impl PyBertNormalizer {
9 changes: 7 additions & 2 deletions bindings/python/src/pre_tokenizers.rs
@@ -28,7 +28,12 @@ use super::utils::*;
 ///
 /// This class is not supposed to be instantiated directly. Instead, any implementation of a
 /// PreTokenizer will return an instance of this class when instantiated.
-#[pyclass(dict, module = "tokenizers.pre_tokenizers", name = "PreTokenizer")]
+#[pyclass(
+    dict,
+    module = "tokenizers.pre_tokenizers",
+    name = "PreTokenizer",
+    subclass
+)]
 #[derive(Clone, Serialize, Deserialize)]
 pub struct PyPreTokenizer {
     #[serde(flatten)]
@@ -228,7 +233,7 @@ macro_rules! setter {
 /// add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
 ///     Whether to add a space to the first word if there isn't already one. This
 ///     lets us treat `hello` exactly like `say hello`.
-#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=ByteLevel)]
+#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "ByteLevel")]
 #[pyo3(text_signature = "(self, add_prefix_space=True, use_regex=True)")]
 pub struct PyByteLevel {}
 #[pymethods]
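
The `name=ByteLevel` → `name = "ByteLevel"` change is the one spot where the old bare-identifier form survived; the 0.16 upgrade forces the string-literal form for Python-visible names. A sketch with a hypothetical class:

    use pyo3::prelude::*;

    // pyo3 0.16 wants a string literal here, which also permits Python
    // names that are not valid Rust identifiers.
    #[pyclass(name = "ByteLevelDemo")]
    struct PyByteLevelDemo {}
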
7 changes: 6 additions & 1 deletion bindings/python/src/processors.rs
@@ -20,7 +20,12 @@ use tokenizers as tk;
 ///
 /// This class is not supposed to be instantiated directly. Instead, any implementation of
 /// a PostProcessor will return an instance of this class when instantiated.
-#[pyclass(dict, module = "tokenizers.processors", name = "PostProcessor")]
+#[pyclass(
+    dict,
+    module = "tokenizers.processors",
+    name = "PostProcessor",
+    subclass
+)]
 #[derive(Clone, Deserialize, Serialize)]
 pub struct PyPostProcessor {
     #[serde(flatten)]
16 changes: 12 additions & 4 deletions bindings/python/src/tokenizer.rs
@@ -56,7 +56,9 @@ use crate::utils::{MaybeSizedIterator, PyBufferedIterator};
 ///     Yesterday"``.
 ///
 #[pyclass(dict, module = "tokenizers", name = "AddedToken")]
-#[pyo3(text_signature = "(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True)")]
+#[pyo3(
+    text_signature = "(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True)"
+)]
 pub struct PyAddedToken {
     pub content: String,
     pub is_special_token: bool,
@@ -665,7 +667,9 @@ impl PyTokenizer {
     /// direction (:obj:`str`, defaults to :obj:`right`):
     ///     Truncate direction
     #[args(kwargs = "**")]
-    #[pyo3(text_signature = "(self, max_length, stride=0, strategy='longest_first', direction='right')")]
+    #[pyo3(
+        text_signature = "(self, max_length, stride=0, strategy='longest_first', direction='right')"
+    )]
     fn enable_truncation(&mut self, max_length: usize, kwargs: Option<&PyDict>) -> PyResult<()> {
         let mut params = TruncationParams {
             max_length,
@@ -765,7 +769,9 @@ impl PyTokenizer {
     ///     If specified, the length at which to pad. If not specified we pad using the size of
     ///     the longest sequence in a batch.
     #[args(kwargs = "**")]
-    #[pyo3(text_signature = "(self, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]', length=None, pad_to_multiple_of=None)")]
+    #[pyo3(
+        text_signature = "(self, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]', length=None, pad_to_multiple_of=None)"
+    )]
     fn enable_padding(&mut self, kwargs: Option<&PyDict>) -> PyResult<()> {
         let mut params = PaddingParams::default();

@@ -892,7 +898,9 @@ impl PyTokenizer {
     ///     :class:`~tokenizers.Encoding`: The encoded result
     ///
     #[args(pair = "None", is_pretokenized = "false", add_special_tokens = "true")]
-    #[pyo3(text_signature = "(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True)")]
+    #[pyo3(
+        text_signature = "(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True)"
+    )]
     fn encode(
         &self,
         sequence: &PyAny,
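
The repeated `#[pyo3(text_signature = ...)]` reflows in this file are rustfmt splitting long attributes, not behavior changes. The attribute itself populates `__text_signature__`, which Python's `inspect.signature` and `help()` use to describe native methods. A minimal sketch on a hypothetical standalone function:

    use pyo3::prelude::*;

    // The string becomes encode_demo.__text_signature__ in Python, so
    // help(encode_demo) shows "(sequence, pair=None)" instead of "(...)".
    #[pyfunction]
    #[pyo3(text_signature = "(sequence, pair=None)")]
    fn encode_demo(sequence: String, pair: Option<String>) -> String {
        match pair {
            Some(p) => format!("{} {}", sequence, p),
            None => sequence,
        }
    }
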
10 changes: 7 additions & 3 deletions bindings/python/src/trainers.rs
@@ -15,7 +15,7 @@ use tokenizers as tk;
 ///
 /// This class is not supposed to be instantiated directly. Instead, any implementation of a
 /// Trainer will return an instance of this class when instantiated.
-#[pyclass(module = "tokenizers.trainers", name = "Trainer")]
+#[pyclass(module = "tokenizers.trainers", name = "Trainer", subclass)]
 #[derive(Clone, Deserialize, Serialize)]
 pub struct PyTrainer {
     #[serde(flatten)]
@@ -368,7 +368,9 @@ impl PyBpeTrainer {
 /// end_of_word_suffix (:obj:`str`, `optional`):
 ///     A suffix to be used for every subword that is a end-of-word.
 #[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "WordPieceTrainer")]
-#[pyo3(text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)")]
+#[pyo3(
+    text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)"
+)]
 pub struct PyWordPieceTrainer {}
 #[pymethods]
 impl PyWordPieceTrainer {
@@ -714,7 +716,9 @@ impl PyWordLevelTrainer {
 ///     The number of iterations of the EM algorithm to perform before
 ///     pruning the vocabulary.
 #[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "UnigramTrainer")]
-#[pyo3(text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)")]
+#[pyo3(
+    text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)"
+)]
 pub struct PyUnigramTrainer {}
 #[pymethods]
 impl PyUnigramTrainer {