Update TensorRT-LLM (#1891)
* Update TensorRT-LLM

---------

Co-authored-by: Marks101 <[email protected]>
Co-authored-by: lkm2835 <[email protected]>
3 people authored Jul 4, 2024
1 parent 9691e12 commit 9dbc5b3
Showing 216 changed files with 6,178 additions and 3,563 deletions.
README.md (7 changes: 5 additions & 2 deletions)
@@ -7,7 +7,7 @@ TensorRT-LLM
[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
[![cuda](https://img.shields.io/badge/cuda-12.4.1-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-10.0.1-green)](https://developer.nvidia.com/tensorrt)
[![trt](https://img.shields.io/badge/TRT-10.1.0-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.11.0.dev-green)](./tensorrt_llm/version.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

@@ -17,7 +17,10 @@ TensorRT-LLM
<div align="left">

## Latest News
* [*Weekly*] Check out **[@NVIDIAAIDev](https://twitter.com/nvidiaaidev?lang=en)** & **[NVIDIA AI](https://www.linkedin.com/showcase/nvidia-ai/)** LinkedIn for the latest updates!
* [2024/07/02] Let the @MistralAI MoE tokens fly 📈 🚀 #Mixtral 8x7B with NVIDIA #TensorRT #LLM on #H100.
[➡️ Tech blog](https://developer.nvidia.com/blog/achieving-high-mixtral-8x7b-performance-with-nvidia-h100-tensor-core-gpus-and-tensorrt-llm?ncid=so-twit-928467)
![Example Image](docs/source/media/picture-07-02-2024.png)

* [2024/02/06] [🚀 Speed up inference with SOTA quantization techniques in TRT-LLM](./docs/source/blogs/quantization-in-TRT-LLM.md)
* [2024/01/30] [ New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget](./docs/source/blogs/XQA-kernel.md)
* [2023/12/04] [Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100](./docs/source/blogs/Falcon180B-H200.md)
benchmarks/cpp/gptManagerBenchmark.cpp (122 changes: 76 additions & 46 deletions)
@@ -150,7 +150,9 @@ struct BenchmarkParams
bool streaming{false};
bool enableExpDelays{false};
std::optional<float> requestRate{std::nullopt};
std::optional<int> concurrency{std::nullopt};
std::optional<SizeType32> maxBatchSize{std::nullopt};
std::optional<SizeType32> maxNumTokens{std::nullopt};
int randomSeed = 430;
std::optional<int> maxAttentionWindow{std::nullopt};

@@ -773,7 +775,9 @@ class ExecutorServer
: mRecorder(std::move(recorder))
, mWaitSleep(waitSleep)
, mStaticEmulatedBatchSize(staticEmulatedBatchSize)
, mConcurrency(benchmarkParams.concurrency)
, mActiveCount(0)
, mNumFinished(0)
, mShutdown(false)
{

@@ -793,6 +797,10 @@
{
executorConfig.setMaxBatchSize(benchmarkParams.maxBatchSize.value());
}
if (benchmarkParams.maxNumTokens)
{
executorConfig.setMaxNumTokens(benchmarkParams.maxNumTokens.value());
}

executorConfig.setDecodingConfig(texec::DecodingConfig(
benchmarkParams.medusaChoices.has_value() ? texec::DecodingMode::Medusa() : texec::DecodingMode::Auto(),
@@ -843,10 +851,19 @@
}
}

void resetNumFinished()
{
mNumFinished = 0;
}

bool canEnqueue(int numSentRequests) const
{
return !mConcurrency || (numSentRequests - mNumFinished < mConcurrency);
}

void waitForResponses(SizeType32 numRequests, bool warmup = false)
{
SizeType32 numFinished = 0;
while (mActiveCount || (numFinished < numRequests))
while (mActiveCount || (mNumFinished < numRequests))
{
auto responses = mExecutor->awaitResponses(mWaitSleep);
for (auto const& response : responses)
@@ -856,7 +873,7 @@
if (response.getResult().isFinal)
{
mActiveCount--;
numFinished++;
mNumFinished++;
if (!warmup)
{
mRecorder->recordEnd(reqId, response);
@@ -873,7 +890,7 @@
}
}

void collectStats()
void collectStats() const
{
while (!mShutdown)
{
@@ -893,7 +910,9 @@
std::shared_ptr<Recorder> mRecorder;
std::chrono::milliseconds mWaitSleep;
std::optional<int> mStaticEmulatedBatchSize;
std::optional<int> mConcurrency;
std::atomic<uint64_t> mActiveCount;
std::atomic<uint64_t> mNumFinished;
std::atomic<bool> mShutdown;
}; // class ExecutorServer

@@ -914,9 +933,7 @@ class GptServer
, mInferReqSyncSndHdl(nullptr)
{
auto const jsonConfig = GptJsonConfig::parse(trtEnginePath / "config.json");
SizeType32 deviceCount{0};
TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
mWorldConfig = WorldConfig::mpi(deviceCount, jsonConfig.getTensorParallelism(),
mWorldConfig = WorldConfig::mpi(jsonConfig.getGpusPerNode(), jsonConfig.getTensorParallelism(),
jsonConfig.getPipelineParallelism(), optionalParams.deviceIds);
auto& comm = COMM_SESSION;
mCommTensorParallel = std::make_shared<tensorrt_llm::mpi::MpiComm>(
@@ -1352,16 +1369,15 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
optionalParams.gpuWeightsPercent = benchmarkParams.gpuWeightsPercent;
optionalParams.maxBeamWidth = beamWidth;
optionalParams.maxBatchSize = benchmarkParams.maxBatchSize;
optionalParams.maxNumTokens = benchmarkParams.maxNumTokens;
optionalParams.schedulerConfig = texec::SchedulerConfig{capacitySchedulerPolicy};
optionalParams.decodingConfig = texec::DecodingConfig(
benchmarkParams.medusaChoices.has_value() ? texec::DecodingMode::Medusa() : texec::DecodingMode::Auto(),
std::nullopt, benchmarkParams.medusaChoices);

auto const jsonConfig = GptJsonConfig::parse(engineDir / "config.json");
SizeType32 deviceCount{0};
TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
auto const worldConfig = WorldConfig::mpi(
deviceCount, jsonConfig.getTensorParallelism(), jsonConfig.getPipelineParallelism(), optionalParams.deviceIds);
auto const worldConfig = WorldConfig::mpi(jsonConfig.getGpusPerNode(), jsonConfig.getTensorParallelism(),
jsonConfig.getPipelineParallelism(), optionalParams.deviceIds);

BufferManager bufferManager{std::make_shared<CudaStream>()}; // the stream is not used

@@ -1551,53 +1567,49 @@ void benchmarkExecutor(std::filesystem::path const& engineDir, TrtGptModelType m
benchmarkParams.streaming, returnContextLogits, returnGenerationLogits, loraConfig));
}

bool hasDelay
bool const hasDelay
= std::any_of(timeDelays.begin(), timeDelays.end(), [](auto const& delay) { return delay > 0.0; });
if (hasDelay && staticEmulatedBatchSize)
executorServer->resetNumFinished();
if (!staticEmulatedBatchSize)
{
TLLM_THROW("Executor benchmark doesn't support delays with emulated static batch sizes");
}
// Launch a thread that will wait for responses
std::thread waitThread(
[numSamples, executorServer]() { executorServer->waitForResponses(numSamples); });

if (!hasDelay)
{
if (!staticEmulatedBatchSize)
{
executorServer->enqueue(std::move(requests));
executorServer->waitForResponses(numSamples);
}
else
// Enqueue requests one by one
int numSentRequests = 0;
while (numSentRequests < numSamples)
{
SizeType32 numRequests = requests.size();
SizeType32 maxBatchSize = staticEmulatedBatchSize.value();
for (SizeType32 req = 0; req < numRequests; req += maxBatchSize)
if (executorServer->canEnqueue(numSentRequests))
{
auto batchSize = std::min(maxBatchSize, numRequests - req);

std::vector<texec::Request> requestsBatch(std::make_move_iterator(requests.begin() + req),
std::make_move_iterator(requests.begin() + req + batchSize));
// Enqueue in batches
executorServer->enqueue(std::move(requestsBatch));
// Wait for current batch to be done
executorServer->waitForResponses(batchSize);
executorServer->enqueue({requests.at(numSentRequests)});
if (hasDelay && numSentRequests < numSamples - 1)
{
std::this_thread::sleep_for(
std::chrono::milliseconds(static_cast<int>(timeDelays.at(numSentRequests) * 1000)));
}
numSentRequests += 1;
}
}
waitThread.join();
}
else
{
// Launch a thread that will wait for responses
std::thread waitThread(
[numSamples, executorServer]() { executorServer->waitForResponses(numSamples); });
// Enqueue requests one by one
for (std::size_t i = 0; i < numSamples; ++i)
TLLM_CHECK_WITH_INFO(
!hasDelay, "Executor benchmark doesn't support delays with emulated static batch sizes");
SizeType32 numRequests = requests.size();
SizeType32 maxBatchSize = staticEmulatedBatchSize.value();
for (SizeType32 req = 0; req < numRequests; req += maxBatchSize)
{
executorServer->enqueue({std::move(requests.at(i))});
if (i < numSamples - 1)
{
std::this_thread::sleep_for(
std::chrono::milliseconds(static_cast<int>(timeDelays.at(i) * 1000)));
}
auto batchSize = std::min(maxBatchSize, numRequests - req);

std::vector<texec::Request> requestsBatch(std::make_move_iterator(requests.begin() + req),
std::make_move_iterator(requests.begin() + req + batchSize));
// Enqueue in batches
executorServer->enqueue(std::move(requestsBatch));
// Wait for current batch to be done
executorServer->waitForResponses(batchSize);
}
waitThread.join();
}
}
recorder->finalize();
@@ -1670,7 +1682,10 @@ int main(int argc, char* argv[])
options.add_options()("request_rate",
"request rate in reqs/sec. Skipping this arg or negative value will trigger offline/0-delay.",
cxxopts::value<float>());
options.add_options()("concurrency", "Concurrent number of connections with the server.", cxxopts::value<int>());
options.add_options()("max_batch_size", "The max runtime batch size when benchmarking", cxxopts::value<int>());
options.add_options()(
"max_num_tokens", "The max runtime number of tokens per batch when benchmarking", cxxopts::value<int>());
options.add_options()("enable_trt_overlap", "Overlap TRT context preparation and execution",
cxxopts::value<bool>()->default_value("false"));
options.add_options()("enable_exp_delays", "Enables exponential delay distr to mimic real world request arrival",
@@ -1816,18 +1831,33 @@ int main(int argc, char* argv[])
// Argument: streaming
benchmarkParams.streaming = result["streaming"].as<bool>();

TLLM_CHECK_WITH_INFO(!(result.count("request_rate") && result.count("concurrency")),
"request_rate and concurrency cannot be specified at the same time.");

// Argument: request rate
if (result.count("request_rate"))
{
benchmarkParams.requestRate = result["request_rate"].as<float>();
}

// Argument: concurrency
if (result.count("concurrency"))
{
benchmarkParams.concurrency = result["concurrency"].as<int>();
}

// Argument: max batch size
if (result.count("max_batch_size"))
{
benchmarkParams.maxBatchSize = result["max_batch_size"].as<int>();
}

// Argument: max num tokens
if (result.count("max_num_tokens"))
{
benchmarkParams.maxNumTokens = result["max_num_tokens"].as<int>();
}

benchmarkParams.enableExpDelays = result["enable_exp_delays"].as<bool>();

// Argument: Enable batch stats output
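The options added in this file come together in the concurrency gate: canEnqueue() admits a new request only while (numSentRequests - mNumFinished) is below --concurrency, and --max_batch_size / --max_num_tokens are forwarded to the ExecutorConfig. The following minimal sketch, which is not part of the diff, shows that gating pattern against the Executor API; the header path, the already-constructed executor and requests, and the 10 ms polling timeout are assumptions.

#include <tensorrt_llm/executor/executor.h>

#include <chrono>
#include <cstddef>
#include <vector>

namespace texec = tensorrt_llm::executor;

// Sketch only: keep at most `concurrency` requests in flight, mirroring ExecutorServer::canEnqueue() above.
void runWithConcurrencyLimit(texec::Executor& executor, std::vector<texec::Request> const& requests, int concurrency)
{
    std::size_t numSent = 0;
    std::size_t numFinished = 0;
    while (numFinished < requests.size())
    {
        // Enqueue more work only while the number of outstanding requests is under the limit.
        while (numSent < requests.size() && static_cast<int>(numSent - numFinished) < concurrency)
        {
            executor.enqueueRequest(requests.at(numSent));
            ++numSent;
        }
        // Poll briefly for responses and count requests that have fully completed (or failed).
        for (auto const& response : executor.awaitResponses(std::chrono::milliseconds(10)))
        {
            if (response.hasError() || response.getResult().isFinal)
            {
                ++numFinished;
            }
        }
    }
}

Note also the new TLLM_CHECK_WITH_INFO above: --concurrency and --request_rate are mutually exclusive, so a benchmark run should pass at most one of them.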
benchmarks/cpp/gptSessionBenchmark.cpp (5 changes: 2 additions & 3 deletions)
@@ -75,9 +75,8 @@ void benchmarkGptSession(std::filesystem::path const& dataPath, std::vector<int>
auto const json = GptJsonConfig::parse(jsonFileName);
auto const modelConfig = json.getModelConfig();
auto const inputPacked = modelConfig.usePackedInput();
SizeType32 deviceCount{0};
TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
auto const worldConfig = WorldConfig::mpi(deviceCount, json.getTensorParallelism(), json.getPipelineParallelism());
auto const worldConfig
= WorldConfig::mpi(json.getGpusPerNode(), json.getTensorParallelism(), json.getPipelineParallelism());
auto& comm = COMM_SESSION;
auto const enginePath = dataPath / json.engineFilename(worldConfig);
auto const dtype = modelConfig.getDataType();
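Both benchmarks now derive the world layout from the gpusPerNode value recorded in the engine's config.json instead of calling cudaGetDeviceCount(), presumably so the layout always matches what the engine was built for rather than whatever GPUs happen to be visible locally. A minimal sketch of that pattern, not part of the diff, with header paths assumed:

#include <tensorrt_llm/runtime/gptJsonConfig.h>
#include <tensorrt_llm/runtime/worldConfig.h>

#include <filesystem>

using tensorrt_llm::runtime::GptJsonConfig;
using tensorrt_llm::runtime::WorldConfig;

WorldConfig worldConfigFromEngine(std::filesystem::path const& engineDir)
{
    auto const json = GptJsonConfig::parse(engineDir / "config.json");
    // Use the GPU count recorded at engine-build time instead of querying the local device count.
    return WorldConfig::mpi(json.getGpusPerNode(), json.getTensorParallelism(), json.getPipelineParallelism());
}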
benchmarks/python/allowed_configs.py (4 changes: 0 additions & 4 deletions)
@@ -41,7 +41,6 @@ class BuildConfig:
type_vocab_size: Optional[int] = None
pre_norm: Optional[bool] = None
do_layer_norm_before: Optional[bool] = None
enable_qk_half_accum: bool = False
enable_context_fmha: bool = True
enable_multi_block_mode: bool = False
# The enum name of PositionEmbeddingType
@@ -651,7 +650,6 @@ class ModelConfig:
max_batch_size=256,
max_input_len=512,
builder_opt=None,
enable_qk_half_accum=False,
enable_context_fmha=False,
)),
"bert_large":
@@ -669,7 +667,6 @@ class ModelConfig:
max_batch_size=64,
max_input_len=512,
builder_opt=None,
enable_qk_half_accum=False,
enable_context_fmha=False,
)),
"roberta_base":
@@ -687,7 +684,6 @@ class ModelConfig:
max_batch_size=64,
max_input_len=512,
builder_opt=None,
enable_qk_half_accum=False,
enable_context_fmha=False,
)),
"falcon_rw_1b":
benchmarks/python/build.py (31 changes: 0 additions & 31 deletions)
@@ -264,21 +264,6 @@ def build_gpt(args):
max_input_len = build_config['max_input_len'] \
if args.max_input_len is None else args.max_input_len

if args.max_output_len:
logger.warning(
'--max_output_len has been deprecated in favor of --max_seq_len')
if args.max_input_len:
if args.max_seq_len:
logger.warning(
'--max_seq_len has been overwritten due to --max_output_len being specified'
)
args.max_seq_len = args.max_input_len + args.max_output_len
else:
raise Exception(
f"max_output_len is specified but not max_input_len")

del args.max_output_len

max_seq_len = build_config['max_seq_len'] \
if args.max_seq_len is None else args.max_seq_len
max_beam_width = build_config['max_beam_width'] \
@@ -1113,7 +1098,6 @@ def build_bert(args):
if args.mode == 'plugin':
network.plugin_config.bert_attention_plugin = args.dtype
network.plugin_config.gemm_plugin = args.dtype
network.plugin_config.attention_qk_half_accumulation = True
network.plugin_config.set_context_fmha(ContextFMHAType.enabled)
elif args.mode == 'ootb-except-mha':
network.plugin_config.bert_attention_plugin = args.dtype
@@ -1573,21 +1557,6 @@ def build_enc_dec(args):
if args.max_input_len is None else args.max_input_len
build_config['max_decoder_input_len'] = 1

if args.max_output_len:
logger.warning(
'--max_output_len has been deprecated in favor of --max_seq_len')
if args.max_input_len:
if args.max_seq_len:
logger.warning(
'--max_seq_len has been overwritten due to --max_output_len being specified'
)
args.max_seq_len = args.max_input_len + args.max_output_len
else:
raise Exception(
f"max_output_len is specified but not max_input_len")

del args.max_output_len

build_config['max_seq_len'] = build_config['max_seq_len'] \
if args.max_seq_len is None else args.max_seq_len
build_config[
benchmarks/suite/tensorrt_llm_bench/benchmarkers/static.py (9 changes: 4 additions & 5 deletions)
@@ -1,3 +1,4 @@
import platform
from pathlib import Path
from subprocess import CompletedProcess
from typing import Dict, List
@@ -143,11 +144,9 @@ def benchmark(self):
"""Benchmarks a TRT-LLM for a configured instance."""

# Compile the command for running
cmd = [
"mpirun",
"-allow-run-as-root",
"-n",
self.config.world_size,
cmd = ["mpiexec", "-n", self.config.world_size]
cmd += ["-allow-run-as-root"] if platform.system() != "Windows" else ""
cmd += [
self.gpt_session_path,
"--engine_dir",
self.config.engine_path,
benchmarks/suite/tensorrt_llm_bench/ifb.py (2 changes: 1 addition & 1 deletion)
@@ -285,7 +285,7 @@ def executor_benchmark(
# the
logger.info("Launching benchmark...")
bench_cmd = \
["mpirun", "-n", f"{benchmark_cfg.world_size}", "python"] + \
["mpiexec", "-n", f"{benchmark_cfg.world_size}", "python"] + \
sys.argv + ["--run"]
process = subprocess.Popen(
bench_cmd,
(Diffs for the remaining changed files are not shown.)
