Skip to content

Commit

Permalink
examples : Add tokenize
Browse files Browse the repository at this point in the history
  • Loading branch information
zakkor committed Nov 11, 2023
1 parent d96ca7d commit d8d898e
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 0 deletions.
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ else()
add_subdirectory(llama-bench)
add_subdirectory(llava)
add_subdirectory(main)
add_subdirectory(tokenize)
add_subdirectory(parallel)
add_subdirectory(perplexity)
add_subdirectory(quantize)
Expand Down
5 changes: 5 additions & 0 deletions examples/tokenize/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Build rules for the `tokenize` example executable.
set(TARGET tokenize)
# The example consists of a single translation unit.
add_executable(${TARGET} tokenize.cpp)
# Install the resulting binary as a runtime artifact.
install(TARGETS ${TARGET} RUNTIME)
# Link against the `common` and `llama` targets plus the platform thread library.
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
# Require at least C++11 when compiling this target.
target_compile_features(${TARGET} PRIVATE cxx_std_11)
61 changes: 61 additions & 0 deletions examples/tokenize/tokenize.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#include "common.h"
#include "llama.h"

#include <algorithm>
#include <cctype>
#include <cmath>
#include <cstdio>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

/**
 * Formats a token sequence as text, one token per line.
 *
 * Each line has the form "<token id>=<piece>", where <piece> is the result of
 * llama_token_to_piece(ctx, token) with every byte that std::isprint() rejects
 * removed. Note that this filter works byte-by-byte, so bytes belonging to
 * multi-byte encoded characters are presumably dropped as well under the
 * default "C" locale.
 *
 * @param ctx    Handle forwarded unchanged to llama_token_to_piece().
 * @param tokens Iterable collection of token ids.
 * @return The concatenated lines; empty when `tokens` is empty.
 */
template <typename C, typename T>
inline std::string LOG_TOKENS_TOSTR_LINES(const C & ctx, const T & tokens)
{
    std::stringstream buf;

    for (const auto & token : tokens)
    {
        auto piece = llama_token_to_piece(ctx, token);

        // The lambda takes unsigned char so std::isprint() never receives a
        // negative value, which would be undefined behaviour.
        const auto first_dropped = std::remove_if(
            piece.begin(),
            piece.end(),
            [](const unsigned char ch) { return std::isprint(ch) == 0; });
        piece.erase(first_dropped, piece.end());

        // '\n' instead of std::endl: same output, but no flush per token.
        buf << std::to_string(token) << '=' << piece << '\n';
    }

    return buf.str();
}

int main(int argc, char ** argv) {
gpt_params params;

if (argc != 3 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH PROMPT\n" , argv[0]);
return 1;
}

params.model = argv[1];

params.prompt = argv[2];

llama_backend_init(params.numa);

llama_model_params model_params = llama_model_default_params();
model_params.vocab_only = true;
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

llama_context_params ctx_params = llama_context_default_params();
llama_context * ctx = llama_new_context_with_model(model, ctx_params);

const bool add_bos = true;

std::vector<llama_token> tokens;

tokens = ::llama_tokenize(model, params.prompt, add_bos, true);

std::cout << LOG_TOKENS_TOSTR_LINES(ctx, tokens).c_str();

return 0;
}

0 comments on commit d8d898e

Please sign in to comment.