Description
I am testing the trtllm backend v0.6.0 with llama2-7b using the setup below; the client code snippet follows. A single request with an input length of about 1000 tokens takes about 2-3 seconds to finish, but if I send 16 requests at the same time it takes about 30 seconds, so it feels like there is no inflight batching at all.
import multiprocessing
import time

import numpy as np
import tritonclient.grpc as grpcclient

# input_id, input_lengths, batch_size and max_new_tokens are prepared elsewhere.
with multiprocessing.Manager() as manager:
    def send(client, input_id, input_length, i, shared_list):
        inputs = [
            _input("input_ids", np.array(input_id, dtype=np.int32).reshape(1, -1)),
            _input("input_lengths", np.array([input_length], dtype=np.int32).reshape(1, -1)),
            _input("request_output_len", np.array([max_new_tokens], dtype=np.uint32).reshape(1, -1)),
            _input("end_id", np.array([2], dtype=np.uint32).reshape(1, -1)),
        ]
        # The client argument is unused; each worker process opens its own gRPC connection.
        with grpcclient.InferenceServerClient("localhost:8001", verbose=False) as client:
            print('send')
            shared_list[i] = client.infer('tensorrt_llm', inputs).as_numpy('sequence_length').reshape(-1)[0]

    processes = []
    shared_list = manager.list([""] * batch_size)
    start = time.time()
    for i in range(batch_size):
        process = multiprocessing.Process(target=send, args=(None, input_id[i], input_lengths[i], i, shared_list))
        processes.append(process)
        process.start()
    for process in processes:
        process.join()
    latency = time.time() - start
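The _input helper is not shown above; it is assumed to be a thin wrapper that turns a numpy array into a Triton gRPC InferInput with the matching shape and dtype. A minimal sketch of what it is assumed to do:

import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import np_to_triton_dtype

def _input(name, data):
    # Assumed helper (not part of the original snippet): wrap a numpy array
    # into an InferInput of matching shape and dtype.
    tensor = grpcclient.InferInput(name, data.shape, np_to_triton_dtype(data.dtype))
    tensor.set_data_from_numpy(data)
    return tensor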
The tensorrt_llm model config (config.pbtxt) is set up as below:
name: "tensorrt_llm"
backend: "tensorrtllm"
max_batch_size: 1024
model_transaction_policy {
  decoupled: false
}
input [
  {
    name: "input_ids"
    data_type: TYPE_INT32
    dims: [ -1 ]
    allow_ragged_batch: true
  },
  {
    name: "input_lengths"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
  },
  {
    name: "request_output_len"
    data_type: TYPE_UINT32
    dims: [ 1 ]
  },
  {
    name: "end_id"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "pad_id"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  }
]
output [
  {
    name: "output_ids"
    data_type: TYPE_INT32
    dims: [ -1, -1 ]
  },
  {
    name: "sequence_length"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
]
instance_group [
  {
    count: 1
    kind: KIND_CPU
  }
]
parameters: {
  key: "max_beam_width"
  value: {
    string_value: "1"
  }
}
parameters: {
  key: "FORCE_CPU_ONLY_INPUT_TENSORS"
  value: {
    string_value: "no"
  }
}
parameters: {
  key: "gpt_model_type"
  value: {
    string_value: "inflight_batching"
  }
}
parameters: {
  key: "gpt_model_path"
  value: {
    string_value: "/models/trt_engines/llama-2-7b-chat-hf/1-gpu/"
  }
}
parameters: {
  key: "max_tokens_in_paged_kv_cache"
  value: {
    string_value: "8192"
  }
}
parameters: {
  key: "max_kv_cache_length"
  value: {
    string_value: "4096"
  }
}
parameters: {
  key: "batch_scheduler_policy"
  value: {
    string_value: "max_utilization"
  }
}
parameters: {
  key: "max_num_sequences"
  value: {
    string_value: "96"
  }
}
parameters: {
  key: "exclude_input_in_output"
  value: {
    string_value: "true"
  }
}
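For reference, a back-of-the-envelope check of the configured paged KV-cache budget against the request sizes described above; this is just arithmetic on the configured limits, and the max_new_tokens value is an assumption since it is not shown in the client snippet:

# Rough arithmetic only, using the values from the config and the report above.
max_tokens_in_paged_kv_cache = 8192   # from config.pbtxt
input_len = 1000                      # approximate prompt length from the report
max_new_tokens = 1024                 # assumption: bounded by --max_output_len used at build time
tokens_per_request = input_len + max_new_tokens
print(max_tokens_in_paged_kv_cache // tokens_per_request)  # -> 4 sequences fit in the KV cache at once

Under these limits only a handful of the 16 concurrent requests could be scheduled in flight together, with the rest queued.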
Below is how we build the engine:
python build.py --model_dir /models/llama-2-7b-chat-hf/ \
    --dtype float16 \
    --remove_input_padding \
    --use_gpt_attention_plugin float16 \
    --use_inflight_batching \
    --enable_context_fmha \
    --paged_kv_cache \
    --max_input_len 3072 \
    --max_output_len 1024 \
    --use_gemm_plugin float16 \
    --output_dir /models/trt_engines/llama-2-7b-chat-hf/1-gpu/ \
    --world_size 1 \
    --max_batch_size 96 \
    --tp_size 1