
Commit 7d823ea

Add comments to explain CPU <-> GPU copies using pinned memory
Signed-off-by: Kate Cheng <[email protected]>
1 parent 8398ea3 commit 7d823ea

File tree

5 files changed: +60 -40 lines changed

cpp/tensorrt_llm/batch_manager/llmRequest.cpp

+1
@@ -302,6 +302,7 @@ void LlmRequest::movePromptEmbeddingTableToGpu(runtime::BufferManager const& man
     {
         return;
     }
+
     TensorPtr gpuPromptEmbeddingTable = manager.copyFrom(*mPromptEmbeddingTable.value(), runtime::MemoryType::kGPU);
     mPromptEmbeddingTable = gpuPromptEmbeddingTable;
 }

cpp/tensorrt_llm/batch_manager/promptTuningBuffers.cpp

+12-19
@@ -34,7 +34,6 @@ PromptTuningBuffers::PromptTuningBuffers(SizeType32 maxBatchSize, runtime::Buffe
     // vocabSize and mMaxPromptVocabSize
     mPromptTuningParams.vocabSize = manager.gpu(runtime::ITensor::makeShape({1}), nvinfer1::DataType::kINT32);
     mMaxPromptVocabSize = maxPromptEmbeddingTableSize / maxBatchSize;
-    // optionalParams.enableChunkedContext || modelConfig.getContextFMHA()

     auto promptVocabSizeHost
         = runtime::BufferManager::pinned(runtime::ITensor::makeShape({1}), nvinfer1::DataType::kINT32);
@@ -143,23 +142,18 @@ void PromptTuningBuffers::fill(RequestVector const& contextRequests, RequestVect

         std::optional<TensorPtr> optReqPromptEmbeddingTable = std::nullopt;
         std::optional<SizeType32> optReqPromptVocabSize = std::nullopt;
-        // If context chunk mode, the context chunk size would be less than the total number of tokens in the
-        // request This if statement is to check if the context chunk mode is enabled
-        if (batchIdx < numContextRequests)
+
+        if (mPromptTableOffloading)
         {
-            if (mPromptTableOffloading)
-            {
-                optReqPromptEmbeddingTable = getChunkPtableBuffer(getChunkPtableCurrentIndex());
-                optReqPromptVocabSize = getChunkPtableBufferSliceSize(getChunkPtableCurrentIndex(), batchIdx);
-            }
-            else
-            {
-                optReqPromptEmbeddingTable = llmReq->getPromptEmbeddingTable();
-                optReqPromptVocabSize = llmReq->getPromptVocabSize();
-            }
+            optReqPromptEmbeddingTable = getChunkPtableBuffer(getChunkPtableCurrentIndex());
+            optReqPromptVocabSize = getChunkPtableBufferSliceSize(getChunkPtableCurrentIndex(), batchIdx);
+        }
+        else
+        {
+            optReqPromptEmbeddingTable = llmReq->getPromptEmbeddingTable();
+            optReqPromptVocabSize = llmReq->getPromptVocabSize();
         }
-        // auto optReqPromptEmbeddingTable = llmReq->getPromptEmbeddingTable();
-        // auto const optReqPromptVocabSize = llmReq->getPromptVocabSize();
+
         mPromptTuningParams.promptTuningEnabled.push_back(optReqPromptEmbeddingTable.has_value());

         // If context request & has embedding table, validate it
@@ -174,9 +168,8 @@ void PromptTuningBuffers::fill(RequestVector const& contextRequests, RequestVect
                // The size depends on optReqPromptVocabSize which stores how many fake prompts are in the chunk
                auto slicedPtable = runtime::ITensor::slice(
                    optReqPromptEmbeddingTable.value(), 0, optReqPromptVocabSize.value());
-                // Add leading dimension 1 for batch
-                slicedPtable->unsqueeze(0); // Call unsqueeze() as member function
-                optReqPromptEmbeddingTable = std::move(slicedPtable); // Move ownership of the unique_ptr
+                slicedPtable->unsqueeze(0);
+                optReqPromptEmbeddingTable = std::move(slicedPtable);
            }
            else
            {
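
For reference, here is a minimal PyTorch sketch of the slice-and-unsqueeze step in the hunk above (tensor names and sizes are illustrative placeholders, not the TensorRT-LLM ITensor API): keep only the rows holding this chunk's fake-prompt embeddings, then add a leading batch dimension.

    import torch

    # Illustrative shapes: a staged chunk of the prompt table and the number of
    # fake-prompt rows this request actually uses (optReqPromptVocabSize above).
    hidden_size = 4096
    chunk_ptable = torch.empty(64, hidden_size)
    prompt_vocab_size = 40

    # Mirrors ITensor::slice(table, 0, promptVocabSize) followed by unsqueeze(0):
    # take the first prompt_vocab_size rows and prepend a batch dimension of 1.
    sliced_ptable = chunk_ptable[:prompt_vocab_size].unsqueeze(0)
    assert sliced_ptable.shape == (1, prompt_vocab_size, hidden_size)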

cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp

+38-19
@@ -2615,6 +2615,36 @@ SizeType32 TrtGptModelInflightBatching::getMaxCapacityBatchSize(SizeType32 input
     return mKvCacheManager->getMaxCapacityBatchSize(inputLength, outputLength);
 }

+/*
+ * Manages prefetching of prompt table chunks using a double-buffer strategy
+ *
+ * Function Flow:
+ * 1. First Chunk Processing (isBeforePrepareBuffers == true):
+ *    - Uses blocking prefetch on main runtime stream
+ *    - Ensures initial data is ready before computation starts
+ *
+ * 2. Subsequent Chunks (isBeforePrepareBuffers == false):
+ *    - Uses non-blocking prefetch on separate copy stream
+ *    - Overlaps data transfer with computation
+ *
+ * Synchronization:
+ * - First prefetch: No wait needed (fresh start)
+ * - Later prefetches: Wait for previous copy to complete
+ * - Uses mPtableCopyDoneEvent to track completion
+ *
+ * Key Functions:
+ * 1. prefetchNextPromptTableChunk:
+ *    - Calls the correct function based on position in code (before or after prepareBuffers())
+ *    - Waits for previous copy to complete if not the first chunk
+ *
+ * 2. remapInputTokensForPromptTable:
+ *    - Identifies tokens that need prompt table embeddings (tokens that are greater than vocabSize)
+ *    - Remaps IDs to match chunked prompt table layout
+ *
+ * 3. copyPromptTableToGpuInChunk:
+ *    - Handles actual transfer from CPU pinned memory to GPU
+ *    - Uses appropriate buffer manager based on isBeforePrepareBuffers
+ */
 void TrtGptModelInflightBatching::prefetchNextPromptTableChunk(
     RequestVector const& contextRequests, bool isBeforePrepareBuffers, SizeType32 bufferId)
 {
@@ -2663,7 +2693,6 @@ void TrtGptModelInflightBatching::remapInputTokensForPromptTable(
     auto& inputTokensMutable = llmReq->getTokensMutable(0);
     auto vocabSize = mModelConfig.getVocabSize();

-    // For first chunk's initialization
     if (isBeforePrepareBuffers)
     {
         promptTuningBuffers->initializeChunkPtableBuffers(
@@ -2698,15 +2727,10 @@ void TrtGptModelInflightBatching::remapInputTokensForPromptTable(
         beginPos = llmReq->getContextCurrentPosition();
     }

-    // Bounds check
-    if (beginPos + processChunkSize > inputTokensMutable.size())
-    {
-        TLLM_THROW("Invalid chunk access: beginPos(%zu) + processChunkSize(%zu) > totalSize(%zu)", beginPos,
-            processChunkSize, inputTokensMutable.size());
-        return;
-    }
+    TLLM_CHECK_WITH_INFO(beginPos + processChunkSize <= inputTokensMutable.size(),
+        "Invalid chunk access: beginPos(%zu) + processChunkSize(%zu) > totalSize(%zu)", beginPos, processChunkSize,
+        inputTokensMutable.size());

-    // Process tokens
     auto inputTokensChunk = inputTokensMutable.begin() + beginPos;
     std::vector<SizeType32> outOfVocabTokens;
     SizeType32 ptableTokenId = vocabSize;
@@ -2724,7 +2748,7 @@ void TrtGptModelInflightBatching::remapInputTokensForPromptTable(

 void TrtGptModelInflightBatching::copyPromptTableToGpuInChunk(std::shared_ptr<LlmRequest> const& llmReq,
     std::vector<int32_t> const& outOfVocabTokens, bool isBeforePrepareBuffers, SizeType32 bufferId,
-    SizeType32 contextId) // Add parameter to choose which buffer to use
+    SizeType32 contextId)
 {
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
     NVTX3_SCOPED_RANGE_WITH_NAME(range, "copyPromptTableToGpuInChunk");
@@ -2756,25 +2780,20 @@ void TrtGptModelInflightBatching::copyPromptTableToGpuInChunk(std::shared_ptr<Ll
     auto table1D = runtime::ITensor::view(
         promptTable.value(), runtime::ITensor::makeShape({static_cast<int64_t>(totalElements)}));

-    // Add bounds checking
-    if (srcOffset + sliceSize > totalElements)
-    {
-        printf("ERROR: Would access beyond buffer bounds!\n");
-        printf("Total elements: %zu, Trying to access up to: %zu\n", totalElements, srcOffset + (sliceSize));
-    }
+    TLLM_CHECK_WITH_INFO(srcOffset + sliceSize <= totalElements,
+        "Buffer bounds violation: Trying to access up to %zu elements but buffer only has %zu elements (offset: %zu, "
+        "slice size: %zu)",
+        srcOffset + sliceSize, totalElements, srcOffset, sliceSize);

-    // Convert UniquePtr to SharedPtr
     auto table1DShared = runtime::ITensor::SharedPtr(table1D.release());
     auto pTableView = runtime::ITensor::slice(table1DShared, srcOffset, sliceSize);

     auto gpuBufferSlice = runtime::ITensor::slice(gpuBuffer, dstOffset, numRows);

     currentBufferManager.copy(*pTableView, *gpuBufferSlice);

-    // Update buffer sizes
     promptTuningBuffers->updateBufferStartPosition(currentIndex, outOfVocabTokens.size());

-    // Update position for next chunk
     llmReq->mPtableCurrentPosition += outOfVocabTokens.size();

     TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
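
To make the comment block above concrete, here is a hedged PyTorch sketch of the same double-buffer prefetch flow (not the TensorRT-LLM C++ implementation; chunk sizes, buffer contents, and the compute step are placeholders). The first chunk is copied on the main stream before compute starts; each later chunk is copied on a dedicated copy stream and guarded by an event that plays the role of mPtableCopyDoneEvent.

    import torch

    assert torch.cuda.is_available()  # the sketch needs a CUDA device

    device = torch.device('cuda')
    main_stream = torch.cuda.current_stream(device)
    copy_stream = torch.cuda.Stream(device)   # non-default stream used only for H2D copies
    copy_done = torch.cuda.Event()            # stands in for mPtableCopyDoneEvent

    hidden_size = 4096
    # Prompt-table chunks staged in page-locked (pinned) host memory.
    host_chunks = [torch.randn(256, hidden_size).pin_memory() for _ in range(4)]
    # Two GPU buffers: one is consumed while the other is being filled.
    gpu_buffers = [torch.empty(256, hidden_size, device=device) for _ in range(2)]

    for i, chunk in enumerate(host_chunks):
        buf = gpu_buffers[i % 2]
        if i == 0:
            # First chunk (isBeforePrepareBuffers == true): blocking copy on the
            # main stream so the data is ready before computation starts.
            buf.copy_(chunk)
        else:
            # Later chunks: compute must wait until the async prefetch has landed.
            main_stream.wait_event(copy_done)

        if i + 1 < len(host_chunks):
            next_buf = gpu_buffers[(i + 1) % 2]
            with torch.cuda.stream(copy_stream):
                # Do not overwrite the other buffer while compute may still read it.
                copy_stream.wait_stream(main_stream)
                next_buf.copy_(host_chunks[i + 1], non_blocking=True)
                copy_done.record(copy_stream)

        _ = buf.sum()  # placeholder for the model step that consumes this chunk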

tensorrt_llm/runtime/model_runner_cpp.py

+3
@@ -834,6 +834,9 @@ def _prepare_ptuning_executor(self, batch_input_ids_list, prompt_table,
         prompt_tuning_configs = len(batch_input_ids_list) * [None]
         if prompt_table is not None:
             if mm_embedding_offloading:
+                # CUDA Stream Overlapping Requirements:
+                # 1. Both memory copy stream and kernel execution stream must be non-default streams
+                # 2. For host<->device transfers (H2D/D2H), host memory MUST be page-locked (pinned)
                 prompt_table_data = self._prepare_embedding_table(
                     prompt_table).pin_memory()
             else:
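
The comment added here lists the two preconditions for overlapping this copy with kernel execution. A minimal, hedged PyTorch sketch of the same idea (tensor shapes and variable names are made up, not ModelRunnerCpp internals):

    import torch

    assert torch.cuda.is_available()

    copy_stream = torch.cuda.Stream()                        # 1. non-default stream
    prompt_table_cpu = torch.randn(1024, 4096).pin_memory()  # 2. page-locked host memory

    with torch.cuda.stream(copy_stream):
        # With both requirements met, this H2D transfer can overlap kernels enqueued
        # on other streams; with pageable host memory it cannot overlap.
        prompt_table_gpu = prompt_table_cpu.to('cuda', non_blocking=True)

    # Make other work on the current stream wait until the transfer has finished.
    torch.cuda.current_stream().wait_stream(copy_stream)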

tensorrt_llm/runtime/multimodal_model_runner.py

+6-2
@@ -1379,11 +1379,12 @@ def get_visual_features(self, image, other_vision_inputs):
         image_embeds = visual_outputs[self.vision_output_names[0]]

         if self.args.mm_embedding_offloading:
-            # Allocate pinned memory with same shape and dtype
+            # CUDA Stream Overlapping Requirements:
+            # 1. Both memory copy stream and kernel execution stream must be non-default streams
+            # 2. For host<->device transfers (H2D/D2H), host memory MUST be page-locked (pinned)
             pinned_embeds = torch.empty_like(image_embeds,
                                              device='cpu',
                                              pin_memory=True)
-            # Copy directly from GPU to pinned memory
             pinned_embeds.copy_(image_embeds, non_blocking=True)
             image_embeds = pinned_embeds

@@ -1823,6 +1824,9 @@ def ptuning_setup(self, prompt_table, input_ids, input_lengths):
                 dtype=str_dtype_to_torch(self.model_config.dtype))
         else:
             if self.args.mm_embedding_offloading:
+                # CUDA Stream Overlapping Requirements:
+                # 1. Both memory copy stream and kernel execution stream must be non-default streams
+                # 2. For host<->device transfers (H2D/D2H), host memory MUST be page-locked (pinned)
                 prompt_table = prompt_table.pin_memory().to(
                     dtype=self.model.dtype)
             else:
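
The first hunk above offloads GPU visual embeddings into pinned host memory; below is a small hedged sketch of that device-to-host pattern in isolation (a placeholder tensor stands in for the real visual features). Note that a non-blocking D2H copy returns before the data has landed, so the CPU must synchronize before reading it.

    import torch

    assert torch.cuda.is_available()

    image_embeds = torch.randn(1, 576, 4096, device='cuda')  # placeholder visual features

    # A pinned destination with the same shape/dtype lets the D2H copy run asynchronously.
    pinned_embeds = torch.empty_like(image_embeds, device='cpu', pin_memory=True)
    pinned_embeds.copy_(image_embeds, non_blocking=True)

    # The copy may still be in flight; synchronize before the CPU reads the data.
    torch.cuda.current_stream().synchronize()
    image_embeds = pinned_embeds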
