
Commit eeb605a

feat: Offloading Multimodal embedding table to CPU in Chunked Prefill Mode (#3380)
* Feat: Offload ptable to cpu if enable_chunk_context
* Feat: offload ptable to cpu for chunk context mode
* Fix and add comment
* Update Readme for multimodal and add a new param mm_embedding_offloading
* fix: Correct prompt table offloading condition in PromptTuningBuffers
* Clean up the code
* Add comments to explain copy from cpu <-> gpu using pinned memory
* Fix namings based on comments
* Fix format based on precommit
* Modify --mm_embedding_offloading flag

Signed-off-by: Kate Cheng <[email protected]>
Co-authored-by: Haohang Huang <[email protected]>
1 parent faef377 commit eeb605a

19 files changed: +622 -86 lines changed


cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 11 additions & 0 deletions
@@ -553,6 +553,14 @@ class GenericLlmRequest
         return mTokens.at(beam);
     }
 
+    /// @brief Get mutable reference to tokens for a specific beam
+    /// @param beam The beam index
+    /// @return Mutable reference to the tokens vector
+    [[nodiscard]] VecTokens& getTokensMutable(SizeType32 beam)
+    {
+        return mTokens.at(beam);
+    }
+
     /// @brief Get all tokens (input+output) for all beams
     /// @return A vector of vector of tokens.
     [[nodiscard]] BeamTokens const& getTokens() const

@@ -1772,6 +1780,9 @@ class GenericLlmRequest
 
     LlmRequestState mState{LlmRequestState::kCONTEXT_INIT};
 
+    // current position of the prompt tuning table (only used in chunked prefill mode)
+    SizeType32 mPtableCurrentPosition{0};
+
 protected:
     bool mIsStreaming;

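The new mPtableCurrentPosition field tracks how far into the prompt (multimodal embedding) table a request has progressed as its context is prefilled chunk by chunk. A minimal standalone sketch of that bookkeeping (the cursor struct and advance() helper below are illustrative only, not part of the commit):

```cpp
#include <cstdint>

using SizeType32 = std::int32_t;

// Illustrative only: mirrors the role of GenericLlmRequest::mPtableCurrentPosition.
struct ChunkedPtableCursor
{
    SizeType32 currentPosition{0};

    // After each context chunk, advance past the prompt-table rows that were
    // already copied to the GPU so the next chunk resumes at the right offset.
    void advance(SizeType32 rowsConsumedByChunk)
    {
        currentPosition += rowsConsumedByChunk;
    }
};
```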
New file (PromptTuningBuffers header)

Lines changed: 106 additions & 0 deletions

@@ -0,0 +1,106 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "tensorrt_llm/batch_manager/common.h"
+#include "tensorrt_llm/runtime/bufferManager.h"
+#include "tensorrt_llm/runtime/iTensor.h"
+#include "tensorrt_llm/runtime/modelConfig.h"
+#include "tensorrt_llm/runtime/promptTuningParams.h"
+#include "tensorrt_llm/runtime/worldConfig.h"
+
+namespace tensorrt_llm::batch_manager
+{
+
+class PromptTuningBuffers
+{
+
+public:
+    using SizeType32 = tensorrt_llm::runtime::SizeType32;
+    using ITensor = tensorrt_llm::runtime::ITensor;
+    using TensorPtr = runtime::ITensor::SharedPtr;
+
+    runtime::PromptTuningParams mPromptTuningParams;
+    SizeType32 mMaxPromptVocabSize;
+
+    PromptTuningBuffers(SizeType32 maxBatchSize, runtime::BufferManager const& manager,
+        runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig);
+
+    PromptTuningBuffers(SizeType32 maxBatchSize, runtime::BufferManager const& manager,
+        runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
+        bool promptTableOffloading);
+
+    void validate(std::optional<TensorPtr> const& optReqPromptEmbeddingTable,
+        std::optional<SizeType32> const& optReqPromptVocabSize);
+
+    void fill(RequestVector const& contextRequests, RequestVector const& genRequests,
+        runtime::BufferManager const& manager, bool packed);
+
+    /*
+     * The functions below are specific to chunked prefill mode.
+     *
+     * Chunked ptable with ping-pong buffer implementation
+     * ---------------------------------------------------
+     *
+     * Overview:
+     * The chunked ptable (prompt tuning table) system uses a ping-pong buffer mechanism to
+     * manage large embedding tables efficiently in chunked prefill mode. The table is loaded
+     * chunk by chunk from CPU to GPU memory, enabling support for tables that exceed
+     * available GPU memory.
+     *
+     * Key components:
+     * 1. Ping-pong buffers (mChunkPtableBuffers):
+     *    - Two alternating GPU buffers that store chunks of the embedding table
+     *    - While the current buffer is being processed by the model,
+     *      the next chunk can be asynchronously loaded into the other buffer
+     *    - Managed through mChunkPtableCurrentIndex (toggles between 0 and 1)
+     * 2. Start position tracking (mChunkPtableBufferStartPositions):
+     *    - Mainly used for multi-batch processing
+     *    - Maintains the starting position of each batch's data within each buffer
+     *    - Maintained separately for each ping-pong buffer
+     *
+     * Memory optimization:
+     * - Only two GPU buffers are maintained, regardless of total embedding table size
+     * - Each buffer is limited to contextChunkSize * hiddenSize elements
+     * - Chunk-based processing keeps GPU memory usage bounded
+     */
+
+    bool mPromptTableOffloading;
+
+    bool mChunkPtableInitialized{false};
+    std::optional<std::array<TensorPtr, 2>> mChunkPtableBuffers;
+    std::optional<std::vector<std::vector<SizeType32>>> mChunkPtableBufferStartPositions;
+    size_t mChunkPtableCurrentIndex{0};
+
+    void initializeChunkPtableBuffers(runtime::BufferManager const& manager, runtime::ModelConfig const& modelConfig,
+        SizeType32 contextChunkSize, std::shared_ptr<LlmRequest> const& llmReq);
+
+    void switchChunkPtableBuffer();
+
+    size_t getChunkPtableCurrentIndex();
+
+    [[nodiscard]] TensorPtr& getChunkPtableBuffer(size_t index);
+
+    [[nodiscard]] SizeType32 getChunkPtableBufferSliceSize(size_t index, size_t batchIdx);
+
+    [[nodiscard]] SizeType32 getChunkPtableBufferStartPosition(size_t index, size_t batchIdx);
+
+    void updateBufferStartPosition(size_t index, SizeType32 numRows);
+
+    void clearBufferStartPositions(size_t index);
+};
+
+} // namespace tensorrt_llm::batch_manager
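The comment block above sums up the mechanism: two GPU buffers of at most contextChunkSize * hiddenSize elements each, with the next chunk staged from CPU (via pinned memory, per the commit message) while the current one feeds the model. Below is a minimal sketch of that ping-pong pattern using raw CUDA runtime calls. It is an illustration of the idea only, not the buffers' actual implementation (which goes through BufferManager and ITensor); the float element type, the assumption that the host table is already pinned, and the runChunk() placeholder are all assumptions made for the sake of a self-contained example.

```cpp
// Sketch of the ping-pong chunk upload described above -- illustration only,
// not the repository's implementation. Assumptions: float elements, the host
// table already lives in pinned memory, and runChunk() stands in for running
// the model on one context chunk.
#include <cuda_runtime.h>
#include <algorithm>
#include <cstddef>

void runChunk(float const* /*devicePtableSlice*/, std::size_t /*rows*/, cudaStream_t /*stream*/)
{
    // Placeholder for "run the context chunk with this slice of the prompt table".
}

void uploadPtableInChunks(float const* pinnedHostPtable, std::size_t totalRows, std::size_t hiddenSize,
    std::size_t chunkRows, cudaStream_t computeStream, cudaStream_t copyStream)
{
    std::size_t const chunkElems = chunkRows * hiddenSize;

    // Two GPU buffers: while buffers[cur] is consumed on computeStream,
    // the next chunk can be copied into buffers[1 - cur] on copyStream.
    float* buffers[2] = {nullptr, nullptr};
    cudaMalloc(&buffers[0], chunkElems * sizeof(float));
    cudaMalloc(&buffers[1], chunkElems * sizeof(float));

    cudaEvent_t copyDone[2], computeDone[2];
    for (int i = 0; i < 2; ++i)
    {
        cudaEventCreate(&copyDone[i]);
        cudaEventCreate(&computeDone[i]);
    }

    std::size_t cur = 0;
    for (std::size_t row = 0; row < totalRows; row += chunkRows)
    {
        std::size_t const rows = std::min(chunkRows, totalRows - row);

        // Do not overwrite a buffer until the compute that last read it has finished.
        cudaStreamWaitEvent(copyStream, computeDone[cur], 0);

        // Async H2D copy from pinned memory, so it can overlap with compute on the other buffer.
        cudaMemcpyAsync(buffers[cur], pinnedHostPtable + row * hiddenSize,
            rows * hiddenSize * sizeof(float), cudaMemcpyHostToDevice, copyStream);
        cudaEventRecord(copyDone[cur], copyStream);

        // Compute waits only for its own buffer's copy, then processes the chunk.
        cudaStreamWaitEvent(computeStream, copyDone[cur], 0);
        runChunk(buffers[cur], rows, computeStream);
        cudaEventRecord(computeDone[cur], computeStream);

        cur = 1 - cur; // toggle buffers, analogous to switchChunkPtableBuffer()
    }

    cudaStreamSynchronize(computeStream);
    for (int i = 0; i < 2; ++i)
    {
        cudaEventDestroy(copyDone[i]);
        cudaEventDestroy(computeDone[i]);
    }
    cudaFree(buffers[0]);
    cudaFree(buffers[1]);
}
```

In the class above, switchChunkPtableBuffer() plays the role of the `cur = 1 - cur` toggle, while updateBufferStartPosition() and getChunkPtableBufferStartPosition() track where each batch's rows begin inside the active buffer, as described in the comment block.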

cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h

Lines changed: 6 additions & 4 deletions
@@ -135,6 +135,10 @@ class RuntimeBuffers
 
 public:
     TensorPtr sequenceLengthsDevice;
+    bool promptTableOffloading;
+
+    //! Prompt-Tuning
+    std::unique_ptr<PromptTuningBuffers> promptTuningBuffers;
 
 private:
     //! Runtime

@@ -148,9 +152,6 @@ class RuntimeBuffers
     //! Pipeline-Parallelism
     TensorPtr hiddenStates;
 
-    //! Prompt-Tuning
-    std::unique_ptr<PromptTuningBuffers> promptTuningBuffers;
-
     //! Mrope
     TensorPtr mropeRotaryCosSin;
     TensorPtr mropePositionDeltas;

@@ -259,7 +260,8 @@ class RuntimeBuffers
         runtime::TllmRuntime const& runtime, runtime::ModelConfig const& modelConfig,
         runtime::WorldConfig const& worldConfig, executor::DecodingConfig const& decodingConfig,
         bool gatherGenerationLogits, std::optional<SizeType32> maxNumTokens = std::nullopt,
-        std::optional<std::vector<executor::AdditionalModelOutput>> const& additionalModelOutputs = std::nullopt);
+        std::optional<std::vector<executor::AdditionalModelOutput>> const& additionalModelOutputs = std::nullopt,
+        bool promptTableOffloading = false);
 
     RuntimeBuffers(RuntimeBuffers const& other) = delete;
     RuntimeBuffers& operator=(RuntimeBuffers const& other) = delete;

cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h

Lines changed: 4 additions & 1 deletion
@@ -53,7 +53,7 @@ class TrtGptModelOptionalParams
         std::optional<executor::GuidedDecodingConfig> guidedDecodingConfig = std::nullopt,
         bool isLeaderInOrchMode = false,
         std::optional<std::vector<executor::AdditionalModelOutput>> additionalModelOutputs = std::nullopt,
-        bool gatherGenerationLogits = false)
+        bool gatherGenerationLogits = false, bool promptTableOffloading = false)
         : kvCacheConfig{std::move(kvCacheConfig)}
         , enableTrtOverlap{enableTrtOverlap}
         , deviceIds(std::move(deviceIds))

@@ -75,6 +75,7 @@ class TrtGptModelOptionalParams
         , isLeaderInOrchMode{isLeaderInOrchMode}
         , additionalModelOutputs{std::move(additionalModelOutputs)}
         , gatherGenerationLogits{gatherGenerationLogits}
+        , promptTableOffloading{promptTableOffloading}
     {
         if (guidedDecodingConfig)
         {

@@ -125,6 +126,8 @@ class TrtGptModelOptionalParams
     bool isLeaderInOrchMode;
     std::optional<std::vector<executor::AdditionalModelOutput>> additionalModelOutputs;
     bool gatherGenerationLogits;
+    // Whether to offload the prompt table to CPU and prefetch it to the GPU
+    bool promptTableOffloading;
 };
 
 } // namespace tensorrt_llm::batch_manager

cpp/include/tensorrt_llm/executor/executor.h

Lines changed: 7 additions & 1 deletion
@@ -1408,7 +1408,8 @@ class ExecutorConfig
         std::optional<SpeculativeDecodingConfig> specDecConfig = std::nullopt,
         std::optional<GuidedDecodingConfig> guidedDecodingConfig = std::nullopt,
         std::optional<std::vector<AdditionalModelOutput>> additionalModelOutputs = std::nullopt,
-        bool gatherGenerationLogits = false, bool useVariableBeamWidthSearch = false);
+        bool gatherGenerationLogits = false, bool useVariableBeamWidthSearch = false,
+        bool promptTableOffloading = false);
 
     [[nodiscard]] SizeType32 getMaxBeamWidth() const;
     [[nodiscard]] SchedulerConfig getSchedulerConfig() const;

@@ -1441,6 +1442,7 @@ class ExecutorConfig
     [[nodiscard]] std::optional<std::vector<AdditionalModelOutput>> getAdditionalModelOutputs() const;
     [[nodiscard]] bool getGatherGenerationLogits() const;
     [[nodiscard]] bool getUseVariableBeamWidthSearch() const;
+    [[nodiscard]] bool getPromptTableOffloading() const;
 
     void setMaxBeamWidth(SizeType32 maxBeamWidth);
     void setMaxBatchSize(SizeType32 maxBatchSize);

@@ -1468,6 +1470,7 @@ class ExecutorConfig
     void setAdditionalModelOutputs(std::vector<AdditionalModelOutput> const& additionalModelOutputs);
     void setGatherGenerationLogits(bool gatherGenerationLogits);
     void setUseVariableBeamWidthSearch(bool useVariableBeamWidthSearch);
+    void setPromptTableOffloading(bool promptTableOffloading);
 
 private:
     friend class Serialization;

@@ -1548,6 +1551,9 @@ class ExecutorConfig
 
     /// @brief Controls if Variable-Beam-Width-Search is enabled.
     bool mUseVariableBeamWidthSearch{false};
+
+    /// @brief Controls if prompt table offloading is enabled.
+    bool mPromptTableOffloading{false};
 };
 
 struct KVCacheCreatedData