@@ -145,6 +145,7 @@ struct BenchmarkParams
 {
     std::optional<SizeType32> maxTokensInPagedKvCache{std::nullopt};
     std::optional<float> freeGpuMemoryFraction{std::nullopt};
+    std::optional<float> crossKvCacheFraction{std::nullopt};
     bool enableTrtOverlap{false};
     bool enableBlockReuse{false};
     bool enableChunkedContext{false};
@@ -159,6 +160,8 @@ struct BenchmarkParams
     std::optional<int> sinkTokenLength{std::nullopt};
     bool multiBlockMode{true};
     bool enableContextFMHAFP32Acc{false};
+    bool cudaGraphMode{false};
+    SizeType32 cudaGraphCacheSize{0};
 
     // lora / peft params
     std::optional<std::string> loraDir{std::nullopt};
@@ -470,7 +473,38 @@ class Recorder
             mRequestBenchInfos[requestId].firstTokenSeen = true;
         }
 
-        mRequestBenchInfos[requestId].outputLength += 1;
+        mRequestBenchInfos[requestId].decodingIter += 1;
+    }
+
+    void recordToken(uint64_t requestId, std::list<NamedTensor> const& responseTensors)
+    {
+        int32_t outputLength = 1;
+        for (auto& tensor : responseTensors)
+        {
+            if (tensor.name == inference_request::kSequenceLengthTensorName)
+            {
+                // Tensor of shape nBeams, and we only need the first one
+                outputLength = *(bufferCast<int32_t>(*(tensor.tensor)));
+                break;
+            }
+        }
+
+        mRequestBenchInfos[requestId].outputLength += outputLength;
+        this->recordToken(requestId);
+    }
+
+    void recordToken(uint64_t requestId, texec::Response const& response)
+    {
+        auto outputTokenIds = response.getResult().outputTokenIds;
+
+        int32_t outputLength = 1;
+        for (auto const& beam : outputTokenIds)
+        {
+            outputLength = std::max(static_cast<int32_t>(beam.size()), outputLength);
+        }
+
+        mRequestBenchInfos[requestId].outputLength += outputLength;
+        this->recordToken(requestId);
     }
 
     void recordEnd(uint64_t requestId, std::list<NamedTensor> const& responseTensors, bool hasError)
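
Note on the two new `recordToken` overloads above: the GptManager path reads the reported length from the `kSequenceLengthTensorName` response tensor (first beam only), while the Executor path takes the longest beam in `outputTokenIds`. A minimal standalone sketch of that beam-max logic, with illustrative names that are not part of the benchmark:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Per-response output length as computed by the texec::Response overload:
// one token vector per beam; the recorded length is the longest beam, and a
// streamed response always counts for at least one token.
int32_t responseOutputLength(std::vector<std::vector<int32_t>> const& outputTokenIds)
{
    int32_t outputLength = 1;
    for (auto const& beam : outputTokenIds)
    {
        outputLength = std::max(static_cast<int32_t>(beam.size()), outputLength);
    }
    return outputLength;
}
```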
@@ -500,7 +534,7 @@ class Recorder
         }
         else
         {
-            this->recordToken(requestId);
+            this->recordToken(requestId, responseTensors);
         }
     }
 
@@ -532,7 +566,7 @@ class Recorder
             }
             else
             {
-                this->recordToken(requestId);
+                this->recordToken(requestId, response);
             }
         }
     }
@@ -818,11 +852,13 @@ class ExecutorServer
         texec::SchedulerConfig schedulerConfig(capacitySchedulerPolicy);
         texec::KvCacheConfig kvCacheConfig(benchmarkParams.enableBlockReuse, benchmarkParams.maxTokensInPagedKvCache,
             benchmarkParams.maxAttentionWindowVec, benchmarkParams.sinkTokenLength,
-            benchmarkParams.freeGpuMemoryFraction, benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks);
+            benchmarkParams.freeGpuMemoryFraction, benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks,
+            benchmarkParams.crossKvCacheFraction);
         texec::PeftCacheConfig peftCacheConfig(0, benchmarkParams.loraDeviceNumModLayers, 8, 64, 4, 4, 4, 24, 8,
             std::nullopt, benchmarkParams.loraHostCacheSize);
-        texec::ExtendedRuntimePerfKnobConfig extendedRuntimePerfKnobConfig(
-            benchmarkParams.multiBlockMode, benchmarkParams.enableContextFMHAFP32Acc);
+        texec::ExtendedRuntimePerfKnobConfig extendedRuntimePerfKnobConfig(benchmarkParams.multiBlockMode,
+            benchmarkParams.enableContextFMHAFP32Acc, benchmarkParams.cudaGraphMode,
+            benchmarkParams.cudaGraphCacheSize);
         texec::ExecutorConfig executorConfig(
             maxBeamWidth, schedulerConfig, kvCacheConfig, benchmarkParams.enableChunkedContext, true);
         executorConfig.setGpuWeightsPercent(benchmarkParams.gpuWeightsPercent);
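
The hunk above threads the cross-KV fraction into `KvCacheConfig` and the two CUDA-graph knobs into `ExtendedRuntimePerfKnobConfig`. A sketch of the updated `KvCacheConfig` call for an encoder-decoder run, with the argument order taken from this diff; the inline parameter-name comments and the concrete values are illustrative assumptions, not the benchmark's own:

```cpp
#include <optional>

#include "tensorrt_llm/executor/executor.h"

namespace texec = tensorrt_llm::executor;

// Sketch: let the KV cache claim most free GPU memory and split it 50/50
// between self-attention and cross-attention blocks for an enc-dec model.
texec::KvCacheConfig makeEncDecKvCacheConfig()
{
    return texec::KvCacheConfig(
        /*enableBlockReuse=*/false,
        /*maxTokens=*/std::nullopt,
        /*maxAttentionWindowVec=*/std::nullopt,
        /*sinkTokenLength=*/std::nullopt,
        /*freeGpuMemoryFraction=*/0.9f,
        /*hostCacheSize=*/std::nullopt,
        /*onboardBlocks=*/true,
        /*crossKvCacheFraction=*/0.5f);
}
```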
@@ -940,7 +976,7 @@ class ExecutorServer
                 {
                     if (!warmup && !response.hasError())
                     {
-                        mRecorder->recordToken(reqId);
+                        mRecorder->recordToken(reqId, response);
                     }
                 }
             }
@@ -1228,7 +1264,7 @@ class GptServer
             {
                 if (errMsg.empty())
                 {
-                    mRecorder->recordToken(requestId);
+                    mRecorder->recordToken(requestId, response_tensors);
                 }
             }
         }
@@ -1430,6 +1466,10 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
     {
         optionalParams.kvCacheConfig.freeGpuMemoryFraction = benchmarkParams.freeGpuMemoryFraction;
     }
+    if (benchmarkParams.crossKvCacheFraction)
+    {
+        optionalParams.kvCacheConfig.crossKvCacheFraction = benchmarkParams.crossKvCacheFraction;
+    }
     if (benchmarkParams.maxAttentionWindowVec)
     {
         optionalParams.kvCacheConfig.maxAttentionWindowVec = benchmarkParams.maxAttentionWindowVec;
@@ -1458,8 +1498,8 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
             : benchmarkParams.executorLookaheadConfig.has_value() ? texec::DecodingMode::Lookahead()
                                                                   : texec::DecodingMode::Auto(),
         benchmarkParams.executorLookaheadConfig, benchmarkParams.medusaChoices);
-    optionalParams.extendedRuntimePerfKnobConfig = texec::ExtendedRuntimePerfKnobConfig(
-        benchmarkParams.multiBlockMode, benchmarkParams.enableContextFMHAFP32Acc);
+    optionalParams.extendedRuntimePerfKnobConfig = texec::ExtendedRuntimePerfKnobConfig(benchmarkParams.multiBlockMode,
+        benchmarkParams.enableContextFMHAFP32Acc, benchmarkParams.cudaGraphMode, benchmarkParams.cudaGraphCacheSize);
 
     auto const jsonConfig = GptJsonConfig::parse(engineDir / "config.json");
     auto const worldConfig = WorldConfig::mpi(jsonConfig.getGpusPerNode(), jsonConfig.getTensorParallelism(),
@@ -1874,6 +1914,8 @@ int main(int argc, char* argv[])
         "random_seed", "integer random seed for exponential time delays.", cxxopts::value<int>()->default_value("420"));
     options.add_options()(
         "kv_cache_free_gpu_mem_fraction", "K-V Cache Free Gpu Mem Fraction.", cxxopts::value<float>());
+    options.add_options()(
+        "cross_kv_cache_fraction", "Cross K-V Cache Fraction (from 0.0 to 1.0).", cxxopts::value<float>());
     options.add_options()("request_rate",
         "request rate in reqs/sec. Skipping this arg or negative value will trigger offline/0-delay.",
         cxxopts::value<float>());
@@ -1895,7 +1937,8 @@ int main(int argc, char* argv[])
     options.add_options()("return_generation_logits", "Whether to return generation logits.",
         cxxopts::value<bool>()->default_value("false"));
 
-    options.add_options()("scheduler_policy", "Choose scheduler policy between max_utilization/guaranteed_no_evict.",
+    options.add_options()("scheduler_policy",
+        "Choose scheduler policy between max_utilization/guaranteed_no_evict/static_batch.",
         cxxopts::value<std::string>()->default_value("guaranteed_no_evict"));
 
     options.add_options()("first_batch_delay",
@@ -1946,6 +1989,12 @@ int main(int argc, char* argv[])
         cxxopts::value<bool>()->default_value("true"));
     options.add_options()(
         "encoder_engine_dir", "Directory that store the engines of the encoder models.", cxxopts::value<std::string>());
+    options.add_options()("cuda_graph_mode", "When enabled, inference is executed with cuda graph.",
+        cxxopts::value<bool>()->default_value("false"));
+    options.add_options()("cuda_graph_cache_size",
+        "Specify how many cuda graphs are cached in the runtime. Larger cache gives better perf, but consumes more GPU "
+        "memory.",
+        cxxopts::value<SizeType32>()->default_value("0"));
 
     options.add_options()("enable_context_fmha_fp32_acc", "Enable FMHA runner FP32 accumulation",
         cxxopts::value<bool>()->default_value("false"));
@@ -2040,6 +2089,20 @@ int main(int argc, char* argv[])
     {
         benchmarkParams.freeGpuMemoryFraction = result["kv_cache_free_gpu_mem_fraction"].as<float>();
    }
+    // Argument: K-V Cache Cross Attention Fraction. Only applicable to enc-dec models.
+    if (result.count("encoder_engine_dir") && result.count("decoder_engine_dir"))
+    {
+        if (result.count("cross_kv_cache_fraction"))
+        {
+            benchmarkParams.crossKvCacheFraction = result["cross_kv_cache_fraction"].as<float>();
+        }
+        else
+        {
+            benchmarkParams.crossKvCacheFraction
+                = 0.5f; // default value if not set. but non enc-dec should not even have this param set
+        }
+    }
+
     // Argument: Enable TRT overlap
     benchmarkParams.enableTrtOverlap = result["enable_trt_overlap"].as<bool>();
 
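
The fallback above silently picks `0.5f` when `cross_kv_cache_fraction` is omitted on an enc-dec run, and the flag is ignored entirely otherwise. A hypothetical validation helper (not part of this change) that a caller could use to enforce the documented 0.0 to 1.0 range and the enc-dec-only constraint:

```cpp
#include <optional>
#include <stdexcept>

// Hypothetical guard: reject out-of-range fractions and flag misuse on
// decoder-only runs, where cross_kv_cache_fraction has no effect.
void checkCrossKvCacheFraction(std::optional<float> const& fraction, bool isEncDec)
{
    if (fraction && !isEncDec)
    {
        throw std::invalid_argument("cross_kv_cache_fraction only applies to encoder-decoder models");
    }
    if (fraction && (*fraction < 0.0f || *fraction > 1.0f))
    {
        throw std::invalid_argument("cross_kv_cache_fraction must be within [0.0, 1.0]");
    }
}
```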
@@ -2131,6 +2194,12 @@ int main(int argc, char* argv[])
     // Argument: enable_context_fmha_fp32_acc
     benchmarkParams.enableContextFMHAFP32Acc = result["enable_context_fmha_fp32_acc"].as<bool>();
 
+    // Argument: cuda_graph_mode
+    benchmarkParams.cudaGraphMode = result["cuda_graph_mode"].as<bool>();
+
+    // Argument: cuda_graph_cache_size
+    benchmarkParams.cudaGraphCacheSize = result["cuda_graph_cache_size"].as<SizeType32>();
+
     std::optional<TokenIdType> padId;
     // Argument: Padding token id
     if (result.count("pad_id"))
@@ -2168,6 +2237,10 @@ int main(int argc, char* argv[])
     {
         capacitySchedulerPolicy = texec::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT;
     }
+    else if (capacitySchedulerPolicyArg == "static_batch")
+    {
+        capacitySchedulerPolicy = texec::CapacitySchedulerPolicy::kSTATIC_BATCH;
+    }
     else
     {
         TLLM_LOG_ERROR("Unexpected scheduler policy: " + capacitySchedulerPolicyArg);
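
With `static_batch` added, the full string-to-enum mapping in this file now covers three policies. A compact sketch of the equivalent lookup, assuming the two enum values shown in this diff plus `kMAX_UTILIZATION` from the untouched branch of the same chain:

```cpp
#include <optional>
#include <string>

#include "tensorrt_llm/executor/executor.h"

namespace texec = tensorrt_llm::executor;

// Map the --scheduler_policy string to the executor enum; nullopt signals
// an unexpected value so the caller can log an error and bail out.
std::optional<texec::CapacitySchedulerPolicy> parseSchedulerPolicy(std::string const& arg)
{
    if (arg == "max_utilization")
    {
        return texec::CapacitySchedulerPolicy::kMAX_UTILIZATION;
    }
    if (arg == "guaranteed_no_evict")
    {
        return texec::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT;
    }
    if (arg == "static_batch")
    {
        return texec::CapacitySchedulerPolicy::kSTATIC_BATCH;
    }
    return std::nullopt;
}
```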
@@ -2246,14 +2319,14 @@ int main(int argc, char* argv[])
     {
         texec::ModelType executorModelType;
         std::optional<std::string> decoderEngineDir = std::nullopt, encoderEngineDir = std::nullopt;
-        if (result.count("encoder_engine_dir") && result.count("engine_dir"))
+        if (result.count("encoder_engine_dir") && result.count("decoder_engine_dir"))
         {
             TLLM_CHECK_WITH_INFO(api == "executor", "encoder-decoder only support executor api.");
             TLLM_CHECK_WITH_INFO(
                 modelType == TrtGptModelType::InflightFusedBatching, "encoder-decoder only support inflight batching.");
             executorModelType = texec::ModelType::kENCODER_DECODER;
-            decoderEngineDir = result["engine_dir"].as<std::string>();
             encoderEngineDir = result["encoder_engine_dir"].as<std::string>();
+            decoderEngineDir = result["decoder_engine_dir"].as<std::string>();
         }
         else if (result.count("engine_dir"))
         {