
Commit 7997ab4

Merge branch 'main' into trtllm-bench/quick_fix_config

2 parents: 3ae3810 + b286b51

30 files changed: +706, -152 lines

.pre-commit-config.yaml (+2, -1)

@@ -64,7 +64,8 @@ repos:
       - id: codespell
         additional_dependencies:
           - tomli
-        args: ["-L", "Mor"]
+        # add ignore words list
+        args: ["-L", "Mor,ans"]
   - repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.9.4
     hooks:
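
To try the updated ignore list outside the hook, here is a minimal Python sketch (assuming `codespell` is installed; scanning the repository root is an illustrative choice, not what the hook itself does):

```python
import subprocess

# Invoke codespell with the same ignore-words list the hook now passes ("Mor,ans").
# check=False because codespell exits non-zero whenever it reports findings.
subprocess.run(["codespell", "-L", "Mor,ans", "."], check=False)
```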

README.md (+1, -1)

@@ -9,7 +9,7 @@ TensorRT-LLM
 [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-12.8.0-green)](https://developer.nvidia.com/cuda-downloads)
 [![trt](https://img.shields.io/badge/TRT-10.8.0-green)](https://developer.nvidia.com/tensorrt)
-[![version](https://img.shields.io/badge/release-0.19.0.dev-green)](./tensorrt_llm/version.py)
+[![version](https://img.shields.io/badge/release-0.19.0rc-green)](./tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

 [Architecture](./docs/source/torch/arch_overview.md)   |   [Performance](./docs/source/performance/perf-overview.md)   |   [Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)   |   [Documentation](./docs/source/)   |   [Roadmap](https://docs.google.com/presentation/d/1gycPmtdh7uUcH6laOvW65Dbp9F1McUkGDIcAyjicBZs/edit?usp=sharing)

cpp/include/tensorrt_llm/batch_manager/decoderBuffers.h (-3)

@@ -107,12 +107,9 @@ class DecoderBuffers
     using TensorPtr = runtime::ITensor::SharedPtr;

    std::vector<TensorPtr> logits;
-    TensorPtr slotOutputIds; // [mMaxNumRequests, beamWidth, maxSeqLen], outputIds of all batch slots
-    TensorPtr slotOutputIdsHost; // [beamWidth, maxSeqLen], outputIds of single batch slot
     TensorPtr cacheIndirectionInput;
     TensorPtr cacheIndirectionOutput;
     TensorPtr sequenceLengthsHost; // [mMaxNumRequests, beamWidth], pinned host tensor
-    TensorPtr newOutputTokens; // [maxTokensPerStep, mMaxNumRequests, beamWidth]
     TensorPtr newOutputTokensHost; // [maxTokensPerStep, mMaxNumRequests, beamWidth]
     TensorPtr cumLogProbsHost; // [mMaxNumRequests, beamWidth]
     TensorPtr logProbsHost; // [mMaxNumRequests, beamWidth, maxSeqLen]

cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h (-1)

@@ -144,7 +144,6 @@ class RuntimeBuffers
     TensorPtr lastTokenIdsHost;
     TensorPtr lastTokenIdsDevice;
     TensorPtr logitsIdsHost;
-    TensorPtr logitsIdsDevice;

     //! Pipeline-Parallelism
     TensorPtr hiddenStates;

cpp/tensorrt_llm/batch_manager/decoderBuffers.cpp (-5)

@@ -72,9 +72,6 @@ DecoderBuffers::DecoderBuffers(SizeType32 maxNumSequences, SizeType32 maxBeamWid

     finishedSumHost = BufferManager::pinned(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);

-    newOutputTokens
-        = manager.gpu(ITensor::makeShape({maxTokensPerStep, maxNumSequences, maxBeamWidth}), TRTTokenIdType);
-
     newOutputTokensHost
         = BufferManager::pinned(ITensor::makeShape({maxTokensPerStep, maxNumSequences, maxBeamWidth}), TRTTokenIdType);

@@ -151,15 +148,13 @@ void DecoderBuffers::DraftBuffers::create(SizeType32 maxNumSequences, SizeType32
 void DecoderBuffers::enableLookaheadDecoding(SizeType32 maxNumSequences, SizeType32 maxTokensPerStep)
 {
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
-    newOutputTokens->reshape(ITensor::makeShape({maxTokensPerStep, maxNumSequences, 1}));
     newOutputTokensHost->reshape(ITensor::makeShape({maxTokensPerStep, maxNumSequences, 1}));
     TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
 }

 void DecoderBuffers::disableLookaheadDecoding(SizeType32 maxNumSequences)
 {
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
-    newOutputTokens->reshape(ITensor::makeShape({1, maxNumSequences, 1}));
     newOutputTokensHost->reshape(ITensor::makeShape({1, maxNumSequences, 1}));
     TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
 }

cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp (-3)

@@ -114,7 +114,6 @@ void RuntimeBuffers::reshape(TllmRuntime const& runtime, ModelConfig const& mode
     lastTokenIdsHost->reshape(numLogitsShape);
     lastTokenIdsDevice->reshape(numLogitsShape);
     logitsIdsHost->reshape(numLogitsShape);
-    logitsIdsDevice->reshape(numLogitsShape);

     if (transformerBuffers)
     {
@@ -252,7 +251,6 @@ void RuntimeBuffers::create(SizeType32 maxBatchSize, SizeType32 maxBeamWidth,
     lastTokenIdsHost = manager.emptyTensor(MemoryType::kCPU, nvinfer1::DataType::kINT32);
     lastTokenIdsDevice = manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32);
     logitsIdsHost = manager.emptyTensor(MemoryType::kCPU, nvinfer1::DataType::kINT32);
-    logitsIdsDevice = manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32);

     inputsIds = manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32);

@@ -811,7 +809,6 @@ void RuntimeBuffers::setFromInputs(RequestVector const& contextRequests, Request
     // In generation phase, device ptr of context lengths need to be tiled.
     manager.copy(*contextLengthsHost, *contextLengthsDevice);
     manager.copy(*sequenceLengthsHost, *sequenceLengthsDevice);
-    manager.copy(*logitsIdsHost, *logitsIdsDevice);
     auto const logitsIdsHostRange = BufferRange<SizeType32>(*logitsIdsHost);
     auto lastTokenIdsHostRange = BufferRange<SizeType32>(*lastTokenIdsHost);
     common::stl_utils::inclusiveScan(

cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp (+1, -6)

@@ -1805,9 +1805,6 @@ void TrtGptModelInflightBatching::getDecoderSlotHostOutputs(
     auto cumLogProbs = mDecoder->getDecoderState().getCumLogProbs(seqSlot);
     auto logProbs = mDecoder->getDecoderState().getLogProbs(seqSlot);

-    runtime::CudaEvent beforeEvent{};
-    mRuntime->getStreamPtr()->record(beforeEvent);
-    mCopyBufferManager.getStream().wait(beforeEvent);
     mCopyBufferManager.copy(*sequenceLengthView, *mSlotDecoderBuffers[seqSlot]->sequenceLengths);
     mCopyBufferManager.copy(*outputIds, *mSlotDecoderBuffers[seqSlot]->outputIds);
     if (returnLogProbs)
@@ -1987,9 +1984,7 @@ runtime::CudaEvent TrtGptModelInflightBatching::updateDecoderBuffers(
     // Chain copy after decoder event, using a different stream
     mCopyBufferManager.getStream().wait(decoderFinishEvent);

-    mDecoderBuffers->newOutputTokens = mDecoder->getDecoderState().getAllNewTokens();
-
-    mCopyBufferManager.copy(*mDecoderBuffers->newOutputTokens, *mDecoderBuffers->newOutputTokensHost);
+    mCopyBufferManager.copy(*mDecoder->getDecoderState().getAllNewTokens(), *mDecoderBuffers->newOutputTokensHost);
     mCopyBufferManager.copy(
         *mDecoder->getDecoderState().getJointDecodingOutput().lengths, *mDecoderBuffers->sequenceLengthsHost);

cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp (-4)

@@ -464,14 +464,10 @@ void initBindings(pybind11::module_& m)
             py::arg("max_seq_len"), py::arg("max_tokens_per_step"), py::arg("buffer_manager"), py::arg("model_config"),
             py::arg("world_config"))
         .def_readwrite("logits", &tb::DecoderBuffers::logits)
-        .def_readwrite("slot_output_ids", &tb::DecoderBuffers::slotOutputIds)
-        .def_readwrite("slot_output_ids_host", &tb::DecoderBuffers::slotOutputIdsHost)
         .def_readwrite("cache_indirection_input", &tb::DecoderBuffers::cacheIndirectionInput)
         .def_readwrite("cache_indirection_output", &tb::DecoderBuffers::cacheIndirectionOutput)
         .def_readwrite("sequence_lengths_host", &tb::DecoderBuffers::sequenceLengthsHost)
         .def_readwrite("finished_sum_host", &tb::DecoderBuffers::finishedSumHost)
-        .def_property_readonly(
-            "new_output_tokens", [](tb::DecoderBuffers& self) { return tr::Torch::tensor(self.newOutputTokens); })
         .def_property_readonly("new_output_tokens_host",
             [](tb::DecoderBuffers& self) { return tr::Torch::tensor(self.newOutputTokensHost); })
         .def_readwrite("cum_log_probs_host", &tb::DecoderBuffers::cumLogProbsHost)

cpp/tests/README.md (+1, -1)

@@ -65,7 +65,7 @@ PYTHONPATH=examples/llama:$PYTHONPATH python3 cpp/tests/resources/scripts/build_
 PYTHONPATH=examples/chatglm:$PYTHONPATH python3 cpp/tests/resources/scripts/build_chatglm_engines.py
 PYTHONPATH=examples/medusa:$PYTHONPATH python3 cpp/tests/resources/scripts/build_medusa_engines.py
 PYTHONPATH=examples/eagle:$PYTHONPATH python3 cpp/tests/resources/scripts/build_eagle_engines.py
-PYTHONPATH=examples/redrafter:$PYTHONPATH python3 cpp/tests/resources/scripts/build_redrafter_engines.py --has_tllm_checkpoint
+PYTHONPATH=examples/redrafter:$PYTHONPATH python3 cpp/tests/resources/scripts/build_redrafter_engines.py
 ```

 It is possible to build engines with tensor and pipeline parallelism for LLaMA using 4 GPUs.

docs/source/advanced/disaggregated-service.md (+33, -1)

@@ -144,7 +144,7 @@ When the environment variable `TRTLLM_USE_MPI_KVCACHE=1` is set, TRT-LLM will tr
 *Q. Why do some profiling tools show that TRT-LLM's KV cache transfer does not utilize NVLink even on devices equipped with NVLink?*

 A. Ensure TRT-LLM is running with `UCX`-backend `CUDA-aware MPI`, and check the version of `UCX` with `ucx_info -v`.
-If the version of UCX <=1.17, set the environment variables `UCX_RNDV_FRAG_MEM_TYPE=cuda` and `UCX_MEMTYPE_CACHE=n` to enable NVLink.
+If the version of UCX <=1.17, set the environment variables `UCX_RNDV_FRAG_MEM_TYPE=cuda` and `UCX_MEMTYPE_CACHE=n` to enable NVLink. For Blackwell architecture GPUs, UCX version >=1.19 is required to enable NVLink.
 If the version of UCX >=1.18, there are several ways to enable NVLink:
 1. Set the environment variables `UCX_CUDA_COPY_ASYNC_MEM_TYPE=cuda`, `UCX_CUDA_COPY_DMABUF=no`, `UCX_MEMTYPE_CACHE=n` and `UCX_RNDV_PIPELINE_ERROR_HANDLING=y`.
 2. Set the environment variables `TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=$Size`, `UCX_MEMTYPE_CACHE=n` and `UCX_RNDV_PIPELINE_ERROR_HANDLING=y`. $Size represents the size of the buffer for KV cache transfer, which is recommended to be larger than the size of the KV cache for the longest request.
@@ -155,3 +155,35 @@ A. Yes, TRT-LLM supports using GPU direct RDMA for inter-node KV cache transfer,
 1. Set the environment variables `UCX_RNDV_FRAG_MEM_TYPE=cuda`, `UCX_MEMTYPE_CACHE=n` and `UCX_RNDV_PIPELINE_ERROR_HANDLING=y`.
 2. Set the environment variables `TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=$Size`, `UCX_MEMTYPE_CACHE=n` and `UCX_RNDV_PIPELINE_ERROR_HANDLING=y`. $Size represents the size of the buffer for KV cache transfer, which is recommended to be larger than the size of the KV cache for the longest request.
 To achieve the optimal performance when using GPU direct RDMA, it is advisable to create the CUDA context before MPI initialization when TRTLLM_USE_MPI_KVCACHE=1 is set. One possible approach is to rely on MPI environment variables to set the correct device before MPI initialization.
+
+*Q. Are there any guidelines for performance tuning of KV cache transfer?*
+
+A. Depending on the use case, certain sets of environment variables can help avoid poor KV cache transfer performance.
+
+Environment Variable Set A
+
+```
+export UCX_RNDV_FRAG_MEM_TYPES=cuda
+export UCX_MEMTYPE_CACHE=n
+export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
+```
+This set allows KV cache transfers to utilize NVLink within nodes and GDRDMA between nodes.
+
+Environment Variable Set B
+
+```
+export UCX_CUDA_COPY_ASYNC_MEM_TYPE=cuda
+export UCX_CUDA_COPY_DMABUF=no
+export UCX_MEMTYPE_CACHE=n
+export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
+```
+Set B may provide slightly better performance on a single node than Set A. However, when transferring KV cache across multiple nodes, it may cause program instability.
+
+Environment Variable Set C
+
+```
+export TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=$Size
+export UCX_MEMTYPE_CACHE=n
+export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
+```
+Set C can achieve better performance than Sets A and B, both within and between nodes. However, if the KV cache size exceeds the specified $Size, performance may degrade.
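
For illustration, a minimal Python sketch of applying one of these sets (Set C) before launching a worker; the launch command is a placeholder and the buffer size stands in for `$Size`, both assumptions rather than values prescribed by this document:

```python
import os
import subprocess
import sys

# Sketch: apply Set C, then launch a worker process with the modified environment.
# The buffer size plays the role of "$Size" above; choose a value larger than the
# KV cache of the longest expected request.
if len(sys.argv) != 2:
    sys.exit("usage: launch_with_kv_transfer_env.py <buffer_size>")

env = dict(
    os.environ,
    TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=sys.argv[1],
    UCX_MEMTYPE_CACHE="n",
    UCX_RNDV_PIPELINE_ERROR_HANDLING="y",
)

# Placeholder command; substitute the actual disaggregated worker launch line.
subprocess.run(["echo", "launching worker with tuned KV cache transfer settings"],
               env=env, check=True)
```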

examples/constraints.txt (+1, -1)

@@ -1,3 +1,3 @@
-tensorrt_llm==0.19.0.dev2025041500
+tensorrt_llm==0.19.0rc0
 evaluate~=0.4.1
 rouge_score~=0.1.2

examples/deepseek_v3/README.md (+97, -14)

@@ -13,20 +13,23 @@ Please refer to [this guide](https://nvidia.github.io/TensorRT-LLM/installation/

 ## Table of Contents

-- [Table of Contents](#table-of-contents)
-- [Hardware Requirements](#hardware-requirements)
-- [Downloading the Model Weights](#downloading-the-model-weights)
-- [Quick Start](#quick-start)
-- [Multi-Token Prediction (MTP)](#multi-token-prediction-mtp)
-- [Run evaluation on GPQA dataset](#run-evaluation-on-gpqa-dataset)
-- [Serving](#serving)
-- [Advanced Usages](#advanced-usages)
-- [Multi-node](#multi-node)
-- [mpirun](#mpirun)
-- [Slurm](#slurm)
-- [FlashMLA](#flashmla)
-- [DeepGEMM](#deepgemm)
-- [Notes and Troubleshooting](#notes-and-troubleshooting)
+- [DeepSeek‑V3 and DeepSeek-R1](#deepseekv3-and-deepseek-r1)
+  - [Table of Contents](#table-of-contents)
+  - [Hardware Requirements](#hardware-requirements)
+  - [Downloading the Model Weights](#downloading-the-model-weights)
+  - [Quick Start](#quick-start)
+    - [Run a single inference](#run-a-single-inference)
+    - [Multi-Token Prediction (MTP)](#multi-token-prediction-mtp)
+    - [Run evaluation on GPQA dataset](#run-evaluation-on-gpqa-dataset)
+    - [Serving](#serving)
+  - [Advanced Usages](#advanced-usages)
+    - [Multi-node](#multi-node)
+      - [mpirun](#mpirun)
+      - [Slurm](#slurm)
+      - [Example: Multi-node benchmark on GB200 Slurm cluster](#example-multi-node-benchmark-on-gb200-slurm-cluster)
+    - [FlashMLA](#flashmla)
+    - [DeepGEMM](#deepgemm)
+  - [Notes and Troubleshooting](#notes-and-troubleshooting)


 ## Hardware Requirements
@@ -267,6 +270,86 @@ trtllm-llmapi-launch trtllm-bench --model deepseek-ai/DeepSeek-V3 --model_path /
 bash -c "trtllm-llmapi-launch trtllm-bench --model deepseek-ai/DeepSeek-V3 --model_path <YOUR_MODEL_DIR> throughput --backend pytorch --max_batch_size 161 --max_num_tokens 1160 --dataset /workspace/dataset.txt --tp 16 --ep 4 --kv_cache_free_gpu_mem_fraction 0.95 --extra_llm_api_options ./extra-llm-api-config.yml"
 ```

+#### Example: Multi-node benchmark on GB200 Slurm cluster
+
+Step 1: Prepare the dataset and `extra-llm-api-config.yml`.
+```bash
+python3 /path/to/TensorRT-LLM/benchmarks/cpp/prepare_dataset.py \
+  --tokenizer=/path/to/DeepSeek-R1 \
+  --stdout token-norm-dist --num-requests=49152 \
+  --input-mean=1024 --output-mean=2048 --input-stdev=0 --output-stdev=0 > /tmp/dataset.txt
+
+cat >/path/to/TensorRT-LLM/extra-llm-api-config.yml <<EOF
+pytorch_backend_config:
+  use_cuda_graph: true
+  cuda_graph_padding_enabled: true
+  cuda_graph_batch_sizes:
+  - 1
+  - 2
+  - 4
+  - 8
+  - 16
+  - 32
+  - 64
+  - 128
+  - 256
+  - 384
+  print_iter_log: true
+  enable_overlap_scheduler: true
+enable_attention_dp: true
+EOF
+```
+
+Step 2: Prepare `benchmark.slurm`.
+```bash
+#!/bin/bash
+#SBATCH --nodes=2
+#SBATCH --ntasks=8
+#SBATCH --ntasks-per-node=4
+#SBATCH --partition=<partition>
+#SBATCH --account=<account>
+#SBATCH --time=02:00:00
+#SBATCH --job-name=<job_name>
+
+srun --container-image=${container_image} --container-mounts=${mount_dir}:${mount_dir} --mpi=pmix \
+    --output ${logdir}/bench_%j_%t.srun.out \
+    bash benchmark.sh
+```
+
+Step 3: Prepare `benchmark.sh`.
+```bash
+#!/bin/bash
+cd /path/to/TensorRT-LLM
+# pip install build/tensorrt_llm*.whl
+if [ $SLURM_LOCALID == 0 ];then
+    pip install build/tensorrt_llm*.whl
+    echo "Install dependencies on rank 0."
+else
+    echo "Sleep 60 seconds on other ranks."
+    sleep 60
+fi
+
+export PATH=${HOME}/.local/bin:${PATH}
+export PYTHONPATH=/path/to/TensorRT-LLM
+DS_R1_NVFP4_MODEL_PATH=/path/to/DeepSeek-R1 # optional
+
+trtllm-llmapi-launch trtllm-bench \
+    --model deepseek-ai/DeepSeek-R1 \
+    --model_path $DS_R1_NVFP4_MODEL_PATH \
+    throughput --backend pytorch \
+    --num_requests 49152 \
+    --max_batch_size 384 --max_num_tokens 1536 \
+    --concurrency 3072 \
+    --dataset /path/to/dataset.txt \
+    --tp 8 --pp 1 --ep 8 --kv_cache_free_gpu_mem_fraction 0.85 \
+    --extra_llm_api_options ./extra-llm-api-config.yml --warmup 0
+```
+
+Step 4: Submit the job to the Slurm cluster to launch the benchmark:
+```
+sbatch --nodes=2 --ntasks=8 --ntasks-per-node=4 benchmark.slurm
+```
+
 ### FlashMLA
 TensorRT-LLM has already integrated FlashMLA in the PyTorch backend. It is enabled automatically when running DeepSeek-V3/R1.
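
The `extra-llm-api-config.yml` from Step 1 can also be generated programmatically. A small sketch, assuming PyYAML is installed and mirroring the key layout of the heredoc above (the nesting is taken from that snippet, not from a separate schema):

```python
import yaml  # PyYAML, assumed to be available

# Reproduces the extra-llm-api-config.yml written in Step 1 of the GB200 example.
config = {
    "pytorch_backend_config": {
        "use_cuda_graph": True,
        "cuda_graph_padding_enabled": True,
        "cuda_graph_batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256, 384],
        "print_iter_log": True,
        "enable_overlap_scheduler": True,
    },
    "enable_attention_dp": True,
}

with open("extra-llm-api-config.yml", "w") as f:
    yaml.safe_dump(config, f, sort_keys=False)
```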

examples/multimodal/eval.py (+4, -1)

@@ -274,7 +274,10 @@ def eval(output, task, data) -> bool:
     if args.test_trtllm:
         profiler.start('tensorrt_llm')
         _, output_text = trtllm_model.run(
-            prompts, image, max_new_tokens=args.max_new_tokens)
+            input_text=prompts,
+            input_image=image,
+            input_audio=None,
+            max_new_tokens=args.max_new_tokens)
         if runtime_rank == 0:
             trtllm_result = output_text[0][0]
             trtllm_correct += eval(trtllm_result, args.eval_task, data)

examples/multimodal/run.py (+8, -9)

@@ -40,7 +40,8 @@ def print_result(model, input_text, output_text, args):
         assert "characteristic | cat food, day | cat food, wet | cat treats" in output_text[
             0][0].lower()
     elif model.model_type in [
-            'blip2', 'neva', 'phi-3-vision', 'llava_next'
+            'blip2', 'neva', 'phi-3-vision', 'llava_next',
+            'phi-4-multimodal'
     ]:
         assert 'singapore' in output_text[0][0].lower()
     elif model.model_type == 'video-neva':
@@ -104,23 +105,21 @@ def print_result(model, input_text, output_text, args):
     logger.set_level(args.log_level)

     model = MultimodalModelRunner(args)
-    input_multimodal_data = model.load_test_data(args.image_path,
-                                                 args.video_path)
+    visual_data = model.load_test_data(args.image_path, args.video_path)
+    audio_data = model.load_test_audio(args.audio_path)

     if args.run_profiling:
         num_warmup_iters = 3  # Multiple iterations to load both vision and LLM engines into memory
         for _ in range(num_warmup_iters):
-            input_text, output_text = model.run(args.input_text,
-                                                input_multimodal_data,
-                                                args.max_new_tokens)
+            input_text, output_text = model.run(args.input_text, visual_data,
+                                                audio_data, args.max_new_tokens)
         profiler.reset()

     num_iters = args.profiling_iterations if args.run_profiling else 1

     for _ in range(num_iters):
-        input_text, output_text = model.run(args.input_text,
-                                            input_multimodal_data,
-                                            args.max_new_tokens)
+        input_text, output_text = model.run(args.input_text, visual_data,
+                                            audio_data, args.max_new_tokens)

     runtime_rank = tensorrt_llm.mpi_rank()
     if runtime_rank == 0:

examples/multimodal/utils.py (+8)

@@ -10,6 +10,10 @@ def add_common_args(parser):
                         type=str,
                         default='model.engine',
                         help='Name of visual TRT engine')
+    parser.add_argument('--audio_engine_name',
+                        type=str,
+                        default='model.engine',
+                        help='Name of audio TRT engine')
     parser.add_argument('--hf_model_dir',
                         type=str,
                         default=None,
@@ -55,6 +59,10 @@ def add_common_args(parser):
                         nargs='+',
                         default=None,
                         help='List of input image paths, separated by symbol')
+    parser.add_argument("--audio_path",
+                        type=str,
+                        default=None,
+                        help='input audio path')
     parser.add_argument("--path_sep",
                         type=str,
                         default=",",
