Expose stream-ordering in subword tokenizer API (#17206)
Add stream parameter to public APIs:
```
nvtext::subword_tokenize
nvtext::load_vocabulary_file
```
Added a stream gtest.

Reference: #13744
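A usage sketch (the vocabulary path and tokenize settings below are placeholders, not values from this change): callers can now pass their own stream to both APIs.

```
#include <nvtext/subword_tokenize.hpp>

#include <cudf/strings/strings_column_view.hpp>

#include <rmm/cuda_stream.hpp>

// Tokenize on a user-owned stream instead of the default stream.
nvtext::tokenizer_result tokenize_on_stream(cudf::strings_column_view const& input)
{
  rmm::cuda_stream stream;  // user-owned, non-default stream
  auto vocab = nvtext::load_vocabulary_file("hashed_vocab.txt", stream.view());
  return nvtext::subword_tokenize(input,
                                  *vocab,
                                  64,     // max_sequence_length
                                  64,     // stride
                                  true,   // do_lower_case
                                  false,  // do_truncate
                                  stream.view());
}
```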

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - David Wendt (https://github.com/davidwendt)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: #17206
shrshi authored Nov 4, 2024
1 parent 3d07509 commit 0d37506
Showing 5 changed files with 93 additions and 10 deletions.
4 changes: 4 additions & 0 deletions cpp/include/nvtext/subword_tokenize.hpp
@@ -62,11 +62,13 @@ struct hashed_vocabulary {
* @param filename_hashed_vocabulary A path to the preprocessed vocab.txt file.
* Note that this is the file AFTER python/perfect_hash.py has been used
* for preprocessing.
+ * @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Memory resource to allocate any returned objects.
* @return vocabulary hash-table elements
*/
std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
std::string const& filename_hashed_vocabulary,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
@@ -147,6 +149,7 @@ struct tokenizer_result {
* @param do_truncate If true, the tokenizer will discard all the token-ids after
* `max_sequence_length` for each input string. If false, it will use a new row
* in the output token-ids to continue generating the output.
+ * @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Memory resource to allocate any returned objects.
* @return token-ids, attention-mask, and metadata
*/
@@ -157,6 +160,7 @@ tokenizer_result subword_tokenize(
uint32_t stride,
bool do_lower_case,
bool do_truncate,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/** @} */ // end of group
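Both new parameters are defaulted, so call sites written against the previous signatures keep compiling and run on `cudf::get_default_stream()`. A minimal sketch (the file path is a placeholder):

```
#include <nvtext/subword_tokenize.hpp>

#include <memory>

// Pre-existing call site, unchanged: the defaulted stream argument resolves to
// cudf::get_default_stream(). "hashed_vocab.txt" is a placeholder path.
std::unique_ptr<nvtext::hashed_vocabulary> load_with_default_stream()
{
  return nvtext::load_vocabulary_file("hashed_vocab.txt");
}
```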
6 changes: 4 additions & 2 deletions cpp/src/text/subword/load_hash_file.cu
@@ -289,10 +289,12 @@ std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
} // namespace detail

std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
-  std::string const& filename_hashed_vocabulary, rmm::device_async_resource_ref mr)
+  std::string const& filename_hashed_vocabulary,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
{
  CUDF_FUNC_RANGE();
-  return detail::load_vocabulary_file(filename_hashed_vocabulary, cudf::get_default_stream(), mr);
+  return detail::load_vocabulary_file(filename_hashed_vocabulary, stream, mr);
}

} // namespace nvtext
11 changes: 3 additions & 8 deletions cpp/src/text/subword/subword_tokenize.cu
@@ -293,17 +293,12 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
                                  uint32_t stride,
                                  bool do_lower_case,
                                  bool do_truncate,
+                                 rmm::cuda_stream_view stream,
                                  rmm::device_async_resource_ref mr)
{
  CUDF_FUNC_RANGE();
-  return detail::subword_tokenize(strings,
-                                  vocabulary_table,
-                                  max_sequence_length,
-                                  stride,
-                                  do_lower_case,
-                                  do_truncate,
-                                  cudf::get_default_stream(),
-                                  mr);
+  return detail::subword_tokenize(
+    strings, vocabulary_table, max_sequence_length, stride, do_lower_case, do_truncate, stream, mr);
}

} // namespace nvtext
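With the stream exposed on the public API, independent tokenize calls can be issued on separate streams and may overlap on the device. A hedged sketch, assuming `view_a`/`view_b` are existing strings columns, the vocabulary is only read, and the length/stride values are placeholders:

```
#include <nvtext/subword_tokenize.hpp>

#include <cudf/strings/strings_column_view.hpp>

#include <rmm/cuda_stream.hpp>

// Issue two tokenize calls on independent, user-owned streams so the device
// work for each column may overlap.
void tokenize_concurrently(cudf::strings_column_view const& view_a,
                           cudf::strings_column_view const& view_b,
                           nvtext::hashed_vocabulary const& vocab)
{
  rmm::cuda_stream stream_a;
  rmm::cuda_stream stream_b;
  auto out_a = nvtext::subword_tokenize(view_a, vocab, 64, 64, true, false, stream_a.view());
  auto out_b = nvtext::subword_tokenize(view_b, vocab, 64, 64, true, false, stream_b.view());
  stream_a.synchronize();
  stream_b.synchronize();
}
```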
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
@@ -743,6 +743,7 @@ ConfigureTest(
streams/text/ngrams_test.cpp
streams/text/replace_test.cpp
streams/text/stemmer_test.cpp
+  streams/text/subword_tokenize_test.cpp
streams/text/tokenize_test.cpp
STREAM_MODE
testing
81 changes: 81 additions & 0 deletions cpp/tests/streams/text/subword_tokenize_test.cpp
@@ -0,0 +1,81 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/default_stream.hpp>

#include <cudf/column/column.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <nvtext/subword_tokenize.hpp>

#include <fstream>
#include <vector>

// Global environment for temporary files
auto const temp_env = static_cast<cudf::test::TempDirTestEnvironment*>(
::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment));

struct TextSubwordTest : public cudf::test::BaseFixture {};

// Create a fake hashed vocab text file for the tests in this source file.
// The vocab only includes the following words:
// 'this', 'is', 'a', 'test', 'tést'
// The period '.' character also has a token id.
void create_hashed_vocab(std::string const& hash_file)
{
constexpr size_t coefsize = 23;
std::vector<std::pair<int, int>> coefficients(coefsize, {65559, 0});
std::ofstream outfile(hash_file, std::ofstream::out);
outfile << "1\n0\n" << coefficients.size() << "\n";
for (auto c : coefficients) {
outfile << c.first << " " << c.second << "\n";
}
std::vector<uint64_t> hash_table(coefsize, 0);
outfile << hash_table.size() << "\n";
hash_table[0] = 3015668L; // based on values
hash_table[1] = 6205475701751155871L; // from the
hash_table[5] = 6358029; // bert_hash_table.txt
hash_table[16] = 451412625363L; // file for the test
hash_table[20] = 6206321707968235495L; // words above
for (auto h : hash_table) {
outfile << h << "\n";
}
outfile << "100\n101\n102\n\n";
}

TEST(TextSubwordTest, Tokenize)
{
uint32_t const nrows = 100;
std::vector<char const*> h_strings(nrows, "This is a test. A test this is.");
cudf::test::strings_column_wrapper strings(h_strings.cbegin(), h_strings.cend());
std::string const hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
create_hashed_vocab(hash_file);
auto vocab = nvtext::load_vocabulary_file(hash_file, cudf::test::get_default_stream());

uint32_t const max_sequence_length = 16;
uint32_t const stride = 16;

auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
*vocab,
max_sequence_length,
stride,
true, // do_lower_case
false, // do_truncate
cudf::test::get_default_stream());
}
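Note on what the gtest exercises: cudf's stream tests are built in STREAM_MODE `testing` (see the CMakeLists.txt change above), where `cudf::test::get_default_stream()` returns a stream distinct from the library default and stream usage is checked by the test harness, so the calls above would be flagged if work were still submitted on the library-default stream rather than the stream passed in.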
