Skip to content

Commit

Permalink
Merge pull request #9 from CrazyBoyM/support_subfolder
Browse files Browse the repository at this point in the history
Fix subfolder && add subfolder tests
  • Loading branch information
JunnYu committed Dec 26, 2023
2 parents 1f808a9 + 121fcda commit a376387
Show file tree
Hide file tree
Showing 11 changed files with 1,311 additions and 132 deletions.
2 changes: 1 addition & 1 deletion paddlenlp/transformers/auto/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *model_args, **kwar
config = AutoConfig.from_pretrained("bert-base-uncased")
config.save_pretrained('./bert-base-uncased')
"""
subfolder = kwargs.pop("subfolder", "")
subfolder = kwargs.get("subfolder", "")
if subfolder is None:
subfolder = ""
from_aistudio = kwargs.pop("from_aistudio", False)
Expand Down
2 changes: 1 addition & 1 deletion paddlenlp/transformers/auto/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
all_tokenizer_names.append(name)
# From local dir path
if os.path.isdir(pretrained_model_name_or_path):
config_file = os.path.join(pretrained_model_name_or_path, cls.tokenizer_config_file)
config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.tokenizer_config_file)
if os.path.exists(config_file):
tokenizer_class = cls._get_tokenizer_class_from_config(
pretrained_model_name_or_path, config_file, use_fast
Expand Down
1 change: 0 additions & 1 deletion paddlenlp/transformers/conversion_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1012,7 +1012,6 @@ def convert(cls, weight_file: str, config: PretrainedConfig, cache_dir: str) ->
"""
# FIXME(wj-Mcat): add compatibility with downstream models
name_mappings = cls._get_name_mappings(config)

if weight_file.endswith(".index.json"):
if ".safetensors." in weight_file:
files = [file for file in os.listdir(os.path.dirname(weight_file)) if file.startswith("model-")]
Expand Down
264 changes: 136 additions & 128 deletions paddlenlp/transformers/model_utils.py

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion paddlenlp/transformers/tokenizer_utils_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1559,7 +1559,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
for k, v in resolved_vocab_files.items():
if v is not None and os.path.isfile(v):
tokenizer_config_file_dir_list.add(os.path.dirname(v))
assert len(tokenizer_config_file_dir_list) == 1, "All tokenizer files should be in the same directory."
tokenizer_config_file_dir_list = list(tokenizer_config_file_dir_list)
# TODO: check this
assert len(tokenizer_config_file_dir_list) > 0, "All tokenizer files should be in the same directory."
# Prepare tokenizer initialization kwargs
# Did we saved some inputs and kwargs to reload ?
has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None
Expand Down
8 changes: 8 additions & 0 deletions paddlenlp/transformers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -612,6 +612,7 @@ def get_checkpoint_shard_files(
cache_dir=None,
subfolder="",
from_aistudio=False,
from_hf_hub=False,
):
"""
For a given model:
Expand Down Expand Up @@ -666,6 +667,13 @@ def get_checkpoint_shard_files(
subfolder=subfolder,
cache_dir=cache_dir,
)
elif from_hf_hub:
cached_filename = hf_hub_download(
repo_id=pretrained_model_name_or_path,
filename=shard_filename,
subfolder=subfolder,
cache_dir=cache_dir,
)
else:
cached_filename = paddlenlp_hub_download(
pretrained_model_name_or_path,
Expand Down
87 changes: 87 additions & 0 deletions tests/transformers/load_subfolder/test_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

from paddlenlp.transformers import AutoConfig, BertConfig, CLIPConfig, T5Config
from paddlenlp.utils.log import logger


class ConfigLoadTester(unittest.TestCase):
    """Smoke tests for ``from_pretrained`` config loading.

    Covers four sources: PaddleNLP BOS, a local directory (with and without
    the ``subfolder`` kwarg), a remote repo with ``subfolder``, and aistudio.
    The original tests only verified that no exception was raised; explicit
    assertions are added so a load that silently returns the wrong type (or
    ``None``) also fails.

    NOTE(review): these tests hit the network (BOS / aistudio) — they are
    integration tests, not unit tests.
    """

    def test_bert_config_load(self):
        """BertConfig/AutoConfig load from BOS, local dir, subfolder, aistudio."""
        logger.info("Download Bert Config from PaddleNLP BOS")
        bert_config = BertConfig.from_pretrained("bert-base-uncased", from_hf_hub=False)
        self.assertIsInstance(bert_config, BertConfig)
        bert_config = AutoConfig.from_pretrained("bert-base-uncased", from_hf_hub=False)
        self.assertIsNotNone(bert_config)

        logger.info("Download config from local")
        # Save the just-downloaded config so the local-path loads below have
        # a real directory to read from.
        bert_config.save_pretrained("./paddlenlp-test-config/bert-base-uncased")
        bert_config = BertConfig.from_pretrained("./paddlenlp-test-config/bert-base-uncased")
        self.assertIsInstance(bert_config, BertConfig)
        bert_config = AutoConfig.from_pretrained("./paddlenlp-test-config/bert-base-uncased")
        self.assertIsNotNone(bert_config)
        logger.info("Download config from local with subfolder")
        bert_config = BertConfig.from_pretrained("./paddlenlp-test-config", subfolder="bert-base-uncased")
        self.assertIsInstance(bert_config, BertConfig)
        bert_config = AutoConfig.from_pretrained("./paddlenlp-test-config", subfolder="bert-base-uncased")
        self.assertIsNotNone(bert_config)

        logger.info("Download Bert Config from PaddleNLP BOS with subfolder")
        bert_config = BertConfig.from_pretrained(
            "baicai/paddlenlp-test-model", subfolder="bert-base-uncased", from_hf_hub=False
        )
        self.assertIsInstance(bert_config, BertConfig)
        bert_config = AutoConfig.from_pretrained(
            "baicai/paddlenlp-test-model", subfolder="bert-base-uncased", from_hf_hub=False
        )
        self.assertIsNotNone(bert_config)

        logger.info("Download Bert Config from aistudio")
        bert_config = BertConfig.from_pretrained("aistudio/bert-base-uncased", from_aistudio=True)
        self.assertIsInstance(bert_config, BertConfig)
        bert_config = AutoConfig.from_pretrained("aistudio/bert-base-uncased", from_aistudio=True)
        self.assertIsNotNone(bert_config)

    def test_clip_config_load(self):
        """CLIPConfig/AutoConfig load from BOS, local dir, subfolder, aistudio."""
        logger.info("Download CLIP Config from PaddleNLP BOS")
        clip_config = CLIPConfig.from_pretrained("openai/clip-vit-base-patch32", from_hf_hub=False)
        self.assertIsInstance(clip_config, CLIPConfig)
        clip_config = AutoConfig.from_pretrained("openai/clip-vit-base-patch32", from_hf_hub=False)
        self.assertIsNotNone(clip_config)

        logger.info("Download CLIP Config from local")
        clip_config.save_pretrained("./paddlenlp-test-config/clip-vit-base-patch32")
        clip_config = CLIPConfig.from_pretrained("./paddlenlp-test-config/clip-vit-base-patch32")
        self.assertIsInstance(clip_config, CLIPConfig)
        clip_config = AutoConfig.from_pretrained("./paddlenlp-test-config/clip-vit-base-patch32")
        self.assertIsNotNone(clip_config)
        logger.info("Download CLIP Config from local with subfolder")
        clip_config = CLIPConfig.from_pretrained("./paddlenlp-test-config", subfolder="clip-vit-base-patch32")
        self.assertIsInstance(clip_config, CLIPConfig)
        clip_config = AutoConfig.from_pretrained("./paddlenlp-test-config", subfolder="clip-vit-base-patch32")
        self.assertIsNotNone(clip_config)

        logger.info("Download CLIP Config from PaddleNLP BOS with subfolder")
        clip_config = CLIPConfig.from_pretrained(
            "baicai/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_hf_hub=False
        )
        self.assertIsInstance(clip_config, CLIPConfig)
        clip_config = AutoConfig.from_pretrained(
            "baicai/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_hf_hub=False
        )
        self.assertIsNotNone(clip_config)

        logger.info("Download CLIP Config from aistudio")
        clip_config = CLIPConfig.from_pretrained("aistudio/clip-vit-base-patch32", from_aistudio=True)
        self.assertIsInstance(clip_config, CLIPConfig)
        clip_config = AutoConfig.from_pretrained("aistudio/clip-vit-base-patch32", from_aistudio=True)
        self.assertIsNotNone(clip_config)

    def test_t5_config_load(self):
        """T5Config/AutoConfig load from BOS, subfolder, local dir, aistudio."""
        logger.info("Download T5 Config from PaddleNLP BOS")
        t5_config = T5Config.from_pretrained("t5-small", from_hf_hub=False)
        self.assertIsInstance(t5_config, T5Config)
        t5_config = AutoConfig.from_pretrained("t5-small", from_hf_hub=False)
        self.assertIsNotNone(t5_config)

        logger.info("Download T5 Config from PaddleNLP BOS with subfolder")
        t5_config = T5Config.from_pretrained("baicai/paddlenlp-test-model", subfolder="t5-small", from_hf_hub=False)
        self.assertIsInstance(t5_config, T5Config)
        t5_config = AutoConfig.from_pretrained("baicai/paddlenlp-test-model", subfolder="t5-small", from_hf_hub=False)
        self.assertIsNotNone(t5_config)
        logger.info("Download T5 Config from local")
        t5_config.save_pretrained("./paddlenlp-test-config/t5-small")
        t5_config = T5Config.from_pretrained("./paddlenlp-test-config/t5-small")
        self.assertIsInstance(t5_config, T5Config)
        t5_config = AutoConfig.from_pretrained("./paddlenlp-test-config/t5-small")
        self.assertIsNotNone(t5_config)

        logger.info("Download T5 Config from aistudio")
        t5_config = T5Config.from_pretrained("aistudio/t5-small", from_aistudio=True)
        self.assertIsInstance(t5_config, T5Config)
        t5_config = AutoConfig.from_pretrained("aistudio/t5-small", from_aistudio=True)
        self.assertIsNotNone(t5_config)
57 changes: 57 additions & 0 deletions tests/transformers/load_subfolder/test_image_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

from paddlenlp.transformers import AutoImageProcessor, CLIPImageProcessor
from paddlenlp.utils.log import logger


class ImageProcessorLoadTester(unittest.TestCase):
    """Smoke tests for ``from_pretrained`` image-processor loading.

    Covers PaddleNLP BOS, a local directory (with and without ``subfolder``),
    a remote repo with ``subfolder``, and aistudio. The original test only
    verified that no exception was raised; explicit assertions are added so a
    load that silently returns the wrong type (or ``None``) also fails.

    NOTE(review): these tests hit the network (BOS / aistudio) — they are
    integration tests, not unit tests.
    """

    def test_clip_load(self):
        """CLIPImageProcessor/AutoImageProcessor load from every source."""
        logger.info("Download model from PaddleNLP BOS")
        clip_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32", from_hf_hub=False)
        self.assertIsInstance(clip_processor, CLIPImageProcessor)
        clip_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32", from_hf_hub=False)
        self.assertIsNotNone(clip_processor)

        logger.info("Download model from local")
        # Save the just-downloaded processor so the local-path loads below
        # have a real directory to read from.
        clip_processor.save_pretrained("./paddlenlp-test-model/clip-vit-base-patch32")
        clip_processor = CLIPImageProcessor.from_pretrained("./paddlenlp-test-model/clip-vit-base-patch32")
        self.assertIsInstance(clip_processor, CLIPImageProcessor)
        clip_processor = AutoImageProcessor.from_pretrained("./paddlenlp-test-model/clip-vit-base-patch32")
        self.assertIsNotNone(clip_processor)
        # Fixed log message: this section loads from the local directory, not
        # from BOS (copy-paste error duplicated the BOS message below).
        logger.info("Download model from local with subfolder")
        clip_processor = CLIPImageProcessor.from_pretrained(
            "./paddlenlp-test-model/", subfolder="clip-vit-base-patch32"
        )
        self.assertIsInstance(clip_processor, CLIPImageProcessor)
        clip_processor = AutoImageProcessor.from_pretrained(
            "./paddlenlp-test-model/", subfolder="clip-vit-base-patch32"
        )
        self.assertIsNotNone(clip_processor)

        logger.info("Download model from PaddleNLP BOS with subfolder")
        clip_processor = CLIPImageProcessor.from_pretrained(
            "baicai/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_hf_hub=False
        )
        self.assertIsInstance(clip_processor, CLIPImageProcessor)
        clip_processor = AutoImageProcessor.from_pretrained(
            "baicai/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_hf_hub=False
        )
        self.assertIsNotNone(clip_processor)

        logger.info("Download model from aistudio")
        clip_processor = CLIPImageProcessor.from_pretrained("aistudio/clip-vit-base-patch32", from_aistudio=True)
        self.assertIsInstance(clip_processor, CLIPImageProcessor)
        clip_processor = AutoImageProcessor.from_pretrained("aistudio/clip-vit-base-patch32", from_aistudio=True)
        self.assertIsNotNone(clip_processor)

        logger.info("Download model from aistudio with subfolder")
        clip_processor = CLIPImageProcessor.from_pretrained(
            "aistudio/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_aistudio=True
        )
        self.assertIsInstance(clip_processor, CLIPImageProcessor)
        clip_processor = AutoImageProcessor.from_pretrained(
            "aistudio/paddlenlp-test-model", subfolder="clip-vit-base-patch32", from_aistudio=True
        )
        self.assertIsNotNone(clip_processor)
Loading

0 comments on commit a376387

Please sign in to comment.