
Commit 7997ab4

Merge branch 'main' into trtllm-bench/quick_fix_config

2 parents: 3ae3810 + b286b51

30 files changed: +706, -152 lines

.pre-commit-config.yaml (+2, -1)

@@ -64,7 +64,8 @@ repos:
       - id: codespell
         additional_dependencies:
           - tomli
-        args: ["-L", "Mor"]
+        # add ignore words list
+        args: ["-L", "Mor,ans"]
   - repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.9.4
     hooks:
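
To try the updated ignore list outside the hook, here is a minimal Python sketch (assuming `codespell` is installed; scanning the repository root is an illustrative choice, not what the hook itself does):

```python
import subprocess

# Invoke codespell with the same ignore-words list the hook now passes ("Mor,ans").
# check=False because codespell exits non-zero whenever it reports findings.
subprocess.run(["codespell", "-L", "Mor,ans", "."], check=False)
```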

README.md (+1, -1)

@@ -9,7 +9,7 @@ TensorRT-LLM
 [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-12.8.0-green)](https://developer.nvidia.com/cuda-downloads)
 [![trt](https://img.shields.io/badge/TRT-10.8.0-green)](https://developer.nvidia.com/tensorrt)
-[![version](https://img.shields.io/badge/release-0.19.0.dev-green)](./tensorrt_llm/version.py)
+[![version](https://img.shields.io/badge/release-0.19.0rc-green)](./tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

 [Architecture](./docs/source/torch/arch_overview.md)   |   [Performance](./docs/source/performance/perf-overview.md)   |   [Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)   |   [Documentation](./docs/source/)   |   [Roadmap](https://docs.google.com/presentation/d/1gycPmtdh7uUcH6laOvW65Dbp9F1McUkGDIcAyjicBZs/edit?usp=sharing)

cpp/include/tensorrt_llm/batch_manager/decoderBuffers.h (-3)

@@ -107,12 +107,9 @@ class DecoderBuffers
     using TensorPtr = runtime::ITensor::SharedPtr;

    std::vector<TensorPtr> logits;
-    TensorPtr slotOutputIds; // [mMaxNumRequests, beamWidth, maxSeqLen], outputIds of all batch slots
-    TensorPtr slotOutputIdsHost; // [beamWidth, maxSeqLen], outputIds of single batch slot
     TensorPtr cacheIndirectionInput;
     TensorPtr cacheIndirectionOutput;
     TensorPtr sequenceLengthsHost; // [mMaxNumRequests, beamWidth], pinned host tensor
-    TensorPtr newOutputTokens; // [maxTokensPerStep, mMaxNumRequests, beamWidth]
     TensorPtr newOutputTokensHost; // [maxTokensPerStep, mMaxNumRequests, beamWidth]
     TensorPtr cumLogProbsHost; // [mMaxNumRequests, beamWidth]
     TensorPtr logProbsHost; // [mMaxNumRequests, beamWidth, maxSeqLen]

cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h (-1)

@@ -144,7 +144,6 @@ class RuntimeBuffers
     TensorPtr lastTokenIdsHost;
     TensorPtr lastTokenIdsDevice;
     TensorPtr logitsIdsHost;
-    TensorPtr logitsIdsDevice;

     //! Pipeline-Parallelism
     TensorPtr hiddenStates;

cpp/tensorrt_llm/batch_manager/decoderBuffers.cpp (-5)

@@ -72,9 +72,6 @@ DecoderBuffers::DecoderBuffers(SizeType32 maxNumSequences, SizeType32 maxBeamWid

     finishedSumHost = BufferManager::pinned(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);

-    newOutputTokens
-        = manager.gpu(ITensor::makeShape({maxTokensPerStep, maxNumSequences, maxBeamWidth}), TRTTokenIdType);
-
     newOutputTokensHost
         = BufferManager::pinned(ITensor::makeShape({maxTokensPerStep, maxNumSequences, maxBeamWidth}), TRTTokenIdType);

@@ -151,15 +148,13 @@ void DecoderBuffers::DraftBuffers::create(SizeType32 maxNumSequences, SizeType32
 void DecoderBuffers::enableLookaheadDecoding(SizeType32 maxNumSequences, SizeType32 maxTokensPerStep)
 {
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
-    newOutputTokens->reshape(ITensor::makeShape({maxTokensPerStep, maxNumSequences, 1}));
     newOutputTokensHost->reshape(ITensor::makeShape({maxTokensPerStep, maxNumSequences, 1}));
     TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
 }

 void DecoderBuffers::disableLookaheadDecoding(SizeType32 maxNumSequences)
 {
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
-    newOutputTokens->reshape(ITensor::makeShape({1, maxNumSequences, 1}));
     newOutputTokensHost->reshape(ITensor::makeShape({1, maxNumSequences, 1}));
     TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
 }

cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp (-3)

@@ -114,7 +114,6 @@ void RuntimeBuffers::reshape(TllmRuntime const& runtime, ModelConfig const& mode
     lastTokenIdsHost->reshape(numLogitsShape);
     lastTokenIdsDevice->reshape(numLogitsShape);
     logitsIdsHost->reshape(numLogitsShape);
-    logitsIdsDevice->reshape(numLogitsShape);

     if (transformerBuffers)
     {
@@ -252,7 +251,6 @@ void RuntimeBuffers::create(SizeType32 maxBatchSize, SizeType32 maxBeamWidth,
     lastTokenIdsHost = manager.emptyTensor(MemoryType::kCPU, nvinfer1::DataType::kINT32);
     lastTokenIdsDevice = manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32);
     logitsIdsHost = manager.emptyTensor(MemoryType::kCPU, nvinfer1::DataType::kINT32);
-    logitsIdsDevice = manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32);

     inputsIds = manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32);

@@ -811,7 +809,6 @@ void RuntimeBuffers::setFromInputs(RequestVector const& contextRequests, Request
     // In generation phase, device ptr of context lengths need to be tiled.
     manager.copy(*contextLengthsHost, *contextLengthsDevice);
     manager.copy(*sequenceLengthsHost, *sequenceLengthsDevice);
-    manager.copy(*logitsIdsHost, *logitsIdsDevice);
     auto const logitsIdsHostRange = BufferRange<SizeType32>(*logitsIdsHost);
     auto lastTokenIdsHostRange = BufferRange<SizeType32>(*lastTokenIdsHost);
     common::stl_utils::inclusiveScan(

cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp (+1, -6)

@@ -1805,9 +1805,6 @@ void TrtGptModelInflightBatching::getDecoderSlotHostOutputs(
     auto cumLogProbs = mDecoder->getDecoderState().getCumLogProbs(seqSlot);
     auto logProbs = mDecoder->getDecoderState().getLogProbs(seqSlot);

-    runtime::CudaEvent beforeEvent{};
-    mRuntime->getStreamPtr()->record(beforeEvent);
-    mCopyBufferManager.getStream().wait(beforeEvent);
     mCopyBufferManager.copy(*sequenceLengthView, *mSlotDecoderBuffers[seqSlot]->sequenceLengths);
     mCopyBufferManager.copy(*outputIds, *mSlotDecoderBuffers[seqSlot]->outputIds);
     if (returnLogProbs)
@@ -1987,9 +1984,7 @@ runtime::CudaEvent TrtGptModelInflightBatching::updateDecoderBuffers(
     // Chain copy after decoder event, using a different stream
     mCopyBufferManager.getStream().wait(decoderFinishEvent);

-    mDecoderBuffers->newOutputTokens = mDecoder->getDecoderState().getAllNewTokens();
-
-    mCopyBufferManager.copy(*mDecoderBuffers->newOutputTokens, *mDecoderBuffers->newOutputTokensHost);
+    mCopyBufferManager.copy(*mDecoder->getDecoderState().getAllNewTokens(), *mDecoderBuffers->newOutputTokensHost);
     mCopyBufferManager.copy(
         *mDecoder->getDecoderState().getJointDecodingOutput().lengths, *mDecoderBuffers->sequenceLengthsHost);

cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp (-4)

@@ -464,14 +464,10 @@ void initBindings(pybind11::module_& m)
             py::arg("max_seq_len"), py::arg("max_tokens_per_step"), py::arg("buffer_manager"), py::arg("model_config"),
             py::arg("world_config"))
         .def_readwrite("logits", &tb::DecoderBuffers::logits)
-        .def_readwrite("slot_output_ids", &tb::DecoderBuffers::slotOutputIds)
-        .def_readwrite("slot_output_ids_host", &tb::DecoderBuffers::slotOutputIdsHost)
         .def_readwrite("cache_indirection_input", &tb::DecoderBuffers::cacheIndirectionInput)
         .def_readwrite("cache_indirection_output", &tb::DecoderBuffers::cacheIndirectionOutput)
         .def_readwrite("sequence_lengths_host", &tb::DecoderBuffers::sequenceLengthsHost)
         .def_readwrite("finished_sum_host", &tb::DecoderBuffers::finishedSumHost)
-        .def_property_readonly(
-            "new_output_tokens", [](tb::DecoderBuffers& self) { return tr::Torch::tensor(self.newOutputTokens); })
         .def_property_readonly("new_output_tokens_host",
             [](tb::DecoderBuffers& self) { return tr::Torch::tensor(self.newOutputTokensHost); })
         .def_readwrite("cum_log_probs_host", &tb::DecoderBuffers::cumLogProbsHost)

cpp/tests/README.md (+1, -1)

@@ -65,7 +65,7 @@ PYTHONPATH=examples/llama:$PYTHONPATH python3 cpp/tests/resources/scripts/build_
 PYTHONPATH=examples/chatglm:$PYTHONPATH python3 cpp/tests/resources/scripts/build_chatglm_engines.py
 PYTHONPATH=examples/medusa:$PYTHONPATH python3 cpp/tests/resources/scripts/build_medusa_engines.py
 PYTHONPATH=examples/eagle:$PYTHONPATH python3 cpp/tests/resources/scripts/build_eagle_engines.py
-PYTHONPATH=examples/redrafter:$PYTHONPATH python3 cpp/tests/resources/scripts/build_redrafter_engines.py --has_tllm_checkpoint
+PYTHONPATH=examples/redrafter:$PYTHONPATH python3 cpp/tests/resources/scripts/build_redrafter_engines.py
 ```

 It is possible to build engines with tensor and pipeline parallelism for LLaMA using 4 GPUs.

docs/source/advanced/disaggregated-service.md (+33, -1)

@@ -144,7 +144,7 @@ When the environment variable `TRTLLM_USE_MPI_KVCACHE=1` is set, TRT-LLM will tr
 *Q. Why do some profiling tools show that TRT-LLM's KV cache transfer does not utilize NVLink even on devices equipped with NVLink?*

 A. Ensure TRT-LLM is running with `UCX`-backend `CUDA-aware MPI`, and check the version of `UCX` with `ucx_info -v`.
-If the version of UCX <=1.17, set the environment variables `UCX_RNDV_FRAG_MEM_TYPE=cuda` and `UCX_MEMTYPE_CACHE=n` to enable NVLink.
+If the version of UCX <=1.17, set the environment variables `UCX_RNDV_FRAG_MEM_TYPE=cuda` and `UCX_MEMTYPE_CACHE=n` to enable NVLink. For Blackwell architecture GPUs, UCX version >=1.19 is required to enable NVLink.
 If the version of UCX >=1.18, there are several ways to enable NVLink:
 1. Set the environment variables `UCX_CUDA_COPY_ASYNC_MEM_TYPE=cuda`, `UCX_CUDA_COPY_DMABUF=no`, `UCX_MEMTYPE_CACHE=n` and `UCX_RNDV_PIPELINE_ERROR_HANDLING=y`.
 2. Set the environment variables `TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=$Size`, `UCX_MEMTYPE_CACHE=n` and `UCX_RNDV_PIPELINE_ERROR_HANDLING=y`. $Size represents the size of the buffer for KV cache transfer, which is recommended to be larger than the size of the KV cache for the longest request.
@@ -155,3 +155,35 @@ A. Yes, TRT-LLM supports using GPU direct RDMA for inter-node KV cache transfer,
 1. Set the environment variables `UCX_RNDV_FRAG_MEM_TYPE=cuda`, `UCX_MEMTYPE_CACHE=n` and `UCX_RNDV_PIPELINE_ERROR_HANDLING=y`.
 2. Set the environment variables `TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=$Size`, `UCX_MEMTYPE_CACHE=n` and `UCX_RNDV_PIPELINE_ERROR_HANDLING=y`. $Size represents the size of the buffer for KV cache transfer, which is recommended to be larger than the size of the KV cache for the longest request.
 To achieve the optimal performance when using GPU direct RDMA, it is advisable to create the CUDA context before MPI initialization when TRTLLM_USE_MPI_KVCACHE=1 is set. One possible approach is to rely on MPI environment variables to set the correct device before MPI initialization.
+
+*Q. Are there any guidelines for performance tuning of KV cache transfer?*
+
+A. Depending on the use case, certain sets of environment variables can help avoid poor KV cache transfer performance.
+
+Environment Variable Set A
+
+```
+export UCX_RNDV_FRAG_MEM_TYPES=cuda
+export UCX_MEMTYPE_CACHE=n
+export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
+```
+This set allows KV cache transfers to utilize NVLink within nodes and GDRDMA between nodes.
+
+Environment Variable Set B
+
+```
+export UCX_CUDA_COPY_ASYNC_MEM_TYPE=cuda
+export UCX_CUDA_COPY_DMABUF=no
+export UCX_MEMTYPE_CACHE=n
+export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
+```
+Set B may provide slightly better performance on a single node than Set A. However, when transferring KV cache across multiple nodes, it may cause program instability.
+
+Environment Variable Set C
+
+```
+export TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=$Size
+export UCX_MEMTYPE_CACHE=n
+export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
+```
+Set C can achieve better performance than Sets A and B, both within and between nodes. However, if the KV cache size exceeds the specified $Size, performance may degrade.
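
For illustration, a minimal Python sketch of applying one of these sets (Set C) before launching a worker; the launch command is a placeholder and the buffer size stands in for `$Size`, both assumptions rather than values prescribed by this document:

```python
import os
import subprocess
import sys

# Sketch: apply Set C, then launch a worker process with the modified environment.
# The buffer size plays the role of "$Size" above; choose a value larger than the
# KV cache of the longest expected request.
if len(sys.argv) != 2:
    sys.exit("usage: launch_with_kv_transfer_env.py <buffer_size>")

env = dict(
    os.environ,
    TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=sys.argv[1],
    UCX_MEMTYPE_CACHE="n",
    UCX_RNDV_PIPELINE_ERROR_HANDLING="y",
)

# Placeholder command; substitute the actual disaggregated worker launch line.
subprocess.run(["echo", "launching worker with tuned KV cache transfer settings"],
               env=env, check=True)
```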

examples/constraints.txt (+1, -1)

@@ -1,3 +1,3 @@
-tensorrt_llm==0.19.0.dev2025041500
+tensorrt_llm==0.19.0rc0
 evaluate~=0.4.1
 rouge_score~=0.1.2

examples/deepseek_v3/README.md (+97, -14)

@@ -13,20 +13,23 @@ Please refer to [this guide](https://nvidia.github.io/TensorRT-LLM/installation/

 ## Table of Contents

-- [Table of Contents](#table-of-contents)
-- [Hardware Requirements](#hardware-requirements)
-- [Downloading the Model Weights](#downloading-the-model-weights)
-- [Quick Start](#quick-start)
-- [Multi-Token Prediction (MTP)](#multi-token-prediction-mtp)
-- [Run evaluation on GPQA dataset](#run-evaluation-on-gpqa-dataset)
-- [Serving](#serving)
-- [Advanced Usages](#advanced-usages)
-- [Multi-node](#multi-node)
-- [mpirun](#mpirun)
-- [Slurm](#slurm)
-- [FlashMLA](#flashmla)
-- [DeepGEMM](#deepgemm)
-- [Notes and Troubleshooting](#notes-and-troubleshooting)
+- [DeepSeek‑V3 and DeepSeek-R1](#deepseekv3-and-deepseek-r1)
+  - [Table of Contents](#table-of-contents)
+  - [Hardware Requirements](#hardware-requirements)
+  - [Downloading the Model Weights](#downloading-the-model-weights)
+  - [Quick Start](#quick-start)
+    - [Run a single inference](#run-a-single-inference)
+    - [Multi-Token Prediction (MTP)](#multi-token-prediction-mtp)
+    - [Run evaluation on GPQA dataset](#run-evaluation-on-gpqa-dataset)
+    - [Serving](#serving)
+  - [Advanced Usages](#advanced-usages)
+    - [Multi-node](#multi-node)
+      - [mpirun](#mpirun)
+      - [Slurm](#slurm)
+      - [Example: Multi-node benchmark on GB200 Slurm cluster](#example-multi-node-benchmark-on-gb200-slurm-cluster)
+    - [FlashMLA](#flashmla)
+    - [DeepGEMM](#deepgemm)
+  - [Notes and Troubleshooting](#notes-and-troubleshooting)


 ## Hardware Requirements
@@ -267,6 +270,86 @@ trtllm-llmapi-launch trtllm-bench --model deepseek-ai/DeepSeek-V3 --model_path /
 bash -c "trtllm-llmapi-launch trtllm-bench --model deepseek-ai/DeepSeek-V3 --model_path <YOUR_MODEL_DIR> throughput --backend pytorch --max_batch_size 161 --max_num_tokens 1160 --dataset /workspace/dataset.txt --tp 16 --ep 4 --kv_cache_free_gpu_mem_fraction 0.95 --extra_llm_api_options ./extra-llm-api-config.yml"
 ```

+#### Example: Multi-node benchmark on GB200 Slurm cluster
+
+Step 1: Prepare the dataset and `extra-llm-api-config.yml`.
+```bash
+python3 /path/to/TensorRT-LLM/benchmarks/cpp/prepare_dataset.py \
+  --tokenizer=/path/to/DeepSeek-R1 \
+  --stdout token-norm-dist --num-requests=49152 \
+  --input-mean=1024 --output-mean=2048 --input-stdev=0 --output-stdev=0 > /tmp/dataset.txt
+
+cat >/path/to/TensorRT-LLM/extra-llm-api-config.yml <<EOF
+pytorch_backend_config:
+  use_cuda_graph: true
+  cuda_graph_padding_enabled: true
+  cuda_graph_batch_sizes:
+  - 1
+  - 2
+  - 4
+  - 8
+  - 16
+  - 32
+  - 64
+  - 128
+  - 256
+  - 384
+  print_iter_log: true
+  enable_overlap_scheduler: true
+enable_attention_dp: true
+EOF
+```
+
+Step 2: Prepare `benchmark.slurm`.
+```bash
+#!/bin/bash
+#SBATCH --nodes=2
+#SBATCH --ntasks=8
+#SBATCH --ntasks-per-node=4
+#SBATCH --partition=<partition>
+#SBATCH --account=<account>
+#SBATCH --time=02:00:00
+#SBATCH --job-name=<job_name>
+
+srun --container-image=${container_image} --container-mounts=${mount_dir}:${mount_dir} --mpi=pmix \
+    --output ${logdir}/bench_%j_%t.srun.out \
+    bash benchmark.sh
+```
+
+Step 3: Prepare `benchmark.sh`.
+```bash
+#!/bin/bash
+cd /path/to/TensorRT-LLM
+# pip install build/tensorrt_llm*.whl
+if [ $SLURM_LOCALID == 0 ];then
+    pip install build/tensorrt_llm*.whl
+    echo "Install dependencies on rank 0."
+else
+    echo "Sleep 60 seconds on other ranks."
+    sleep 60
+fi
+
+export PATH=${HOME}/.local/bin:${PATH}
+export PYTHONPATH=/path/to/TensorRT-LLM
+DS_R1_NVFP4_MODEL_PATH=/path/to/DeepSeek-R1 # optional
+
+trtllm-llmapi-launch trtllm-bench \
+    --model deepseek-ai/DeepSeek-R1 \
+    --model_path $DS_R1_NVFP4_MODEL_PATH \
+    throughput --backend pytorch \
+    --num_requests 49152 \
+    --max_batch_size 384 --max_num_tokens 1536 \
+    --concurrency 3072 \
+    --dataset /path/to/dataset.txt \
+    --tp 8 --pp 1 --ep 8 --kv_cache_free_gpu_mem_fraction 0.85 \
+    --extra_llm_api_options ./extra-llm-api-config.yml --warmup 0
+```
+
+Step 4: Submit the job to the Slurm cluster to launch the benchmark:
+```
+sbatch --nodes=2 --ntasks=8 --ntasks-per-node=4 benchmark.slurm
+```
+
 ### FlashMLA
 TensorRT-LLM has already integrated FlashMLA in the PyTorch backend. It is enabled automatically when running DeepSeek-V3/R1.
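
The `extra-llm-api-config.yml` from Step 1 can also be generated programmatically. A small sketch, assuming PyYAML is installed and mirroring the key layout of the heredoc above (the nesting is taken from that snippet, not from a separate schema):

```python
import yaml  # PyYAML, assumed to be available

# Reproduces the extra-llm-api-config.yml written in Step 1 of the GB200 example.
config = {
    "pytorch_backend_config": {
        "use_cuda_graph": True,
        "cuda_graph_padding_enabled": True,
        "cuda_graph_batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256, 384],
        "print_iter_log": True,
        "enable_overlap_scheduler": True,
    },
    "enable_attention_dp": True,
}

with open("extra-llm-api-config.yml", "w") as f:
    yaml.safe_dump(config, f, sort_keys=False)
```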

examples/multimodal/eval.py (+4, -1)

@@ -274,7 +274,10 @@ def eval(output, task, data) -> bool:
     if args.test_trtllm:
         profiler.start('tensorrt_llm')
         _, output_text = trtllm_model.run(
-            prompts, image, max_new_tokens=args.max_new_tokens)
+            input_text=prompts,
+            input_image=image,
+            input_audio=None,
+            max_new_tokens=args.max_new_tokens)
         if runtime_rank == 0:
             trtllm_result = output_text[0][0]
             trtllm_correct += eval(trtllm_result, args.eval_task, data)

examples/multimodal/run.py (+8, -9)

@@ -40,7 +40,8 @@ def print_result(model, input_text, output_text, args):
         assert "characteristic | cat food, day | cat food, wet | cat treats" in output_text[
             0][0].lower()
     elif model.model_type in [
-            'blip2', 'neva', 'phi-3-vision', 'llava_next'
+            'blip2', 'neva', 'phi-3-vision', 'llava_next',
+            'phi-4-multimodal'
     ]:
         assert 'singapore' in output_text[0][0].lower()
     elif model.model_type == 'video-neva':
@@ -104,23 +105,21 @@ def print_result(model, input_text, output_text, args):
     logger.set_level(args.log_level)

     model = MultimodalModelRunner(args)
-    input_multimodal_data = model.load_test_data(args.image_path,
-                                                 args.video_path)
+    visual_data = model.load_test_data(args.image_path, args.video_path)
+    audio_data = model.load_test_audio(args.audio_path)

     if args.run_profiling:
         num_warmup_iters = 3  # Multiple iterations to load both vision and LLM engines into memory
         for _ in range(num_warmup_iters):
-            input_text, output_text = model.run(args.input_text,
-                                                input_multimodal_data,
-                                                args.max_new_tokens)
+            input_text, output_text = model.run(args.input_text, visual_data,
+                                                audio_data, args.max_new_tokens)
         profiler.reset()

     num_iters = args.profiling_iterations if args.run_profiling else 1

     for _ in range(num_iters):
-        input_text, output_text = model.run(args.input_text,
-                                            input_multimodal_data,
-                                            args.max_new_tokens)
+        input_text, output_text = model.run(args.input_text, visual_data,
+                                            audio_data, args.max_new_tokens)

     runtime_rank = tensorrt_llm.mpi_rank()
     if runtime_rank == 0:

examples/multimodal/utils.py (+8)

@@ -10,6 +10,10 @@ def add_common_args(parser):
                         type=str,
                         default='model.engine',
                         help='Name of visual TRT engine')
+    parser.add_argument('--audio_engine_name',
+                        type=str,
+                        default='model.engine',
+                        help='Name of audio TRT engine')
     parser.add_argument('--hf_model_dir',
                         type=str,
                         default=None,
@@ -55,6 +59,10 @@ def add_common_args(parser):
                         nargs='+',
                         default=None,
                         help='List of input image paths, separated by symbol')
+    parser.add_argument("--audio_path",
+                        type=str,
+                        default=None,
+                        help='input audio path')
     parser.add_argument("--path_sep",
                         type=str,
                         default=",",
