Skip to content

Commit

Permalink
use secret hfhub token
Browse files — browse the repository at this point in the history
  • Loading branch information
chengchingwen committed Dec 20, 2023
1 parent c2076d7 commit cb1e8b1
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 23 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ jobs:
env:
JULIA_NUM_THREADS: ${{ matrix.julia-threads }}
JL_TRF_TEST_TKR: ${{ matrix.test-hgf-tkr }}
HUGGINGFACEHUB_TOKEN: ${{ secrets.HUGGINGFACEHUB_TOKEN }}
strategy:
fail-fast: false
matrix:
Expand Down
26 changes: 13 additions & 13 deletions test/huggingface/load.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,19 @@
using Logging
using Transformers.HuggingFace
model_list = Dict([
:bert => :[
Model, ForPreTraining, LMHeadModel, ForMaskedLM, ForNextSentencePrediction,
ForSequenceClassification, ForTokenClassification, ForQuestionAnswering,
].args,
:roberta => :[
Model, ForMaskedLM, ForCausalLM, ForSequenceClassification, ForTokenClassification, ForQuestionAnswering,
].args,
:gpt2 => [:Model, :LMHeadModel],
:t5 => [:Model, :ForConditionalGeneration],
:gpt_neo => [:Model, :ForCausalLM],
:gptj => [:Model, :ForCausalLM],
:gpt_neox => [:Model, :ForCausalLM],
:bloom => [:Model, :ForCausalLM],
# :bert => :[
# Model, ForPreTraining, LMHeadModel, ForMaskedLM, ForNextSentencePrediction,
# ForSequenceClassification, ForTokenClassification, ForQuestionAnswering,
# ].args,
# :roberta => :[
# Model, ForMaskedLM, ForCausalLM, ForSequenceClassification, ForTokenClassification, ForQuestionAnswering,
# ].args,
# :gpt2 => [:Model, :LMHeadModel],
# :t5 => [:Model, :ForConditionalGeneration],
# :gpt_neo => [:Model, :ForCausalLM],
# :gptj => [:Model, :ForCausalLM],
# :gpt_neox => [:Model, :ForCausalLM],
# :bloom => [:Model, :ForCausalLM],
# :llama => [:Model, :ForCausalLM], No hf-internal-testing/tiny-random-$hgf_type_name
])

Expand Down
16 changes: 9 additions & 7 deletions test/huggingface/tokenizer.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
@assert !isnothing(HFHUB_Token)
using Artifacts, LazyArtifacts
const artifact_dir = @artifact_str("xnli_dev")
const xnli = joinpath(artifact_dir, "xnli-dev.txt")
Expand All @@ -22,18 +23,18 @@ macro tryrun(ex, msg = nothing)
end

function test_tokenizer(name, corpus; to = TimerOutput())
global hgf_trf
global hgf_trf, HFHUB_Token
@info "Validate $name tokenizer"
@info "Load $name configure file in Julia"
config = @tryrun begin
@timeit to "jlload cfg" begin
cfg = HuggingFace.load_config(name)
cfg = HuggingFace.load_config(name; auth_token = HFHUB_Token)
HuggingFace.HGFConfig(cfg; layer_norm_eps = 1e-9, layer_norm_epsilon = 1e-9)
end
end "Failed to load $name configure file in Julia, probably unsupported"
@info "Load $name configure file in Python"
pyconfig = @tryrun begin
@timeit to "pyload cfg" hgf_trf.AutoConfig.from_pretrained(name,
@timeit to "pyload cfg" hgf_trf.AutoConfig.from_pretrained(name, token = HFHUB_Token,
layer_norm_eps = 1e-9, layer_norm_epsilon = 1e-9)
end "Failed to load $name configure file in Python, probably unsupported"
vocab_size = if haskey(config, :vocab_size)
Expand All @@ -44,12 +45,12 @@ function test_tokenizer(name, corpus; to = TimerOutput())
end
@info "Loading $name tokenizer in Python"
hgf_tkr = @tryrun begin
@timeit to "pyload tkr" hgf_trf.AutoTokenizer.from_pretrained(name, config = pyconfig)
@timeit to "pyload tkr" hgf_trf.AutoTokenizer.from_pretrained(name, config = pyconfig, token = HFHUB_Token)
end "Failed to load $name tokenizer in Python"
@info "Python $name tokenizer loaded successfully"
@info "Loading $name tokenizer in Julia"
tkr = @tryrun begin
@timeit to "jlload tkr" HuggingFace.load_tokenizer(name; config)
@timeit to "jlload tkr" HuggingFace.load_tokenizer(name; config, auth_token = HFHUB_Token)
end "Failed to load $name tokenizer in Julia"
@info "Julia $name tokenizer loaded successfully"
@info "Testing: $name Tokenizer"
Expand Down Expand Up @@ -113,8 +114,9 @@ end
@testset "HuggingFace Tokenizer" begin
corpus = readlines(xnli)
for name in [
"bert-base-cased", "bert-base-uncased", "roberta-base", "gpt2", "t5-small", "google/flan-t5-xl",
"EleutherAI/pythia-70m", "databricks/dolly-v2-3b", "bigscience/bloom-560m", "TheBloke/Llama-2-7B-Chat-GPTQ",
# "bert-base-cased", "bert-base-uncased", "roberta-base", "gpt2", "t5-small", "google/flan-t5-xl",
# "EleutherAI/pythia-70m", "databricks/dolly-v2-3b", "bigscience/bloom-560m",
"meta-llama/Llama-2-7b-chat-hf"
]
@testset "$name Tokenizer" begin
to = TimerOutput()
Expand Down
8 changes: 5 additions & 3 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ using Flux: gradient

using CUDA

const HFHUB_Token = get(ENV, "HUGGINGFACEHUB_TOKEN", nothing)

function envget(var)
e = get(ENV, var, false)
e isa Bool && return e
Expand All @@ -37,7 +39,7 @@ dones(arg...) = ones(arg...) |> device
dzeros(arg...) = zeros(arg...) |> device

const tests = [
"tokenizer",
# "tokenizer",
"huggingface",
]

Expand All @@ -55,6 +57,6 @@ Random.seed!(0)
end
end
end
include("loss.jl")
include("grad.jl")
# include("loss.jl")
# include("grad.jl")
end

0 comments on commit cb1e8b1

Please sign in to comment.