Commit 1e31450

feat: Add Gemma3 text-only model support
1 parent 3a8443f commit 1e31450

8 files changed: +132 −20 lines

examples/utils.py

+2 −1

@@ -337,7 +337,8 @@ def add_common_args(parser):
     parser.add_argument(
         '--max_attention_window_size',
         type=int,
-        default=None,
+        default=[512, 512, 512, 512, 512, 2048, 512, 512, 512, 512, 512, 2048, 512, 512, 512, 512, 512, 2048, 512, 512, 512, 512, 512, 2048, 512, 512],
+        # default=None,
         nargs="+",
         help=
         'The attention window size that controls the sliding window attention / cyclic kv cache behavior'
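The hard-coded default above encodes Gemma3's interleaved attention schedule: one entry per decoder layer, a short window for the sliding (local) layers and a larger window on every sixth layer, which attends globally. A minimal sketch (not part of the commit) of how such a list can be generated, reusing the same `(layer_idx + 1) % sliding_window_pattern` test that model.py introduces below; the layer count and window sizes are simply read off the default list and are model-specific:

    # Illustrative only: rebuild the per-layer window list from the local/global pattern.
    def per_layer_windows(num_layers=26, pattern=6, local_window=512, global_window=2048):
        return [
            local_window if (layer_idx + 1) % pattern else global_window
            for layer_idx in range(num_layers)
        ]

    windows = per_layer_windows()
    assert windows[:6] == [512, 512, 512, 512, 512, 2048]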

tensorrt_llm/layers/attention.py

+53 −6

@@ -175,6 +175,11 @@ def __init__(self,
         self.embed_positions = None
         self.rotary_inv_freq = None
         self.embed_positions_for_gpt_attention = None
+
+        self.embed_positions_local = None
+        self.rotary_inv_freq_local = None
+        self.embed_positions_for_gpt_attention_local = None
+
         # long rope const parameters
         self.long_rope_embed_positions = None
         self.long_rope_rotary_inv_freq = None
@@ -186,10 +191,16 @@ def fill_attention_const_params_for_rope(
             self,
             embed_positions: Tensor = None,
             rotary_inv_freq: Tensor = None,
-            embed_positions_for_gpt_attention: Tensor = None):
+            embed_positions_for_gpt_attention: Tensor = None,
+            embed_positions_local: Tensor = None,
+            rotary_inv_freq_local: Tensor = None,
+            embed_positions_for_gpt_attention_local: Tensor = None):
         self.embed_positions = embed_positions
         self.rotary_inv_freq = rotary_inv_freq
         self.embed_positions_for_gpt_attention = embed_positions_for_gpt_attention
+        self.embed_positions_local = embed_positions_local
+        self.rotary_inv_freq_local = rotary_inv_freq_local
+        self.embed_positions_for_gpt_attention_local = embed_positions_for_gpt_attention_local
         return self

     def fill_attention_const_params_for_long_rope(
@@ -359,6 +370,7 @@ def __init__(self,
                  dtype=None,
                  position_embedding_type=PositionEmbeddingType.learned_absolute,
                  rotary_embedding_base=10000.0,
+                 rotary_embedding_base_local=1.0,
                  rotary_embedding_scaling=None,
                  rotary_embedding_percentage=1.0,
                  rope_scaling_short_factors=None,
@@ -388,7 +400,8 @@ def __init__(self,
                  cp_size=1,
                  cp_rank=0,
                  max_seqlen_for_logn_scaling=8192,
-                 use_logn_scaling=False):
+                 use_logn_scaling=False,
+                 is_local=False):
         super().__init__()

         self.local_layer_idx = local_layer_idx
@@ -417,6 +430,7 @@ def __init__(self,
         self.cp_group = cp_group
         self.cp_size = cp_size
         self.cp_rank = cp_rank
+        self.is_local = is_local

         self.num_layers = num_layers
         self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
@@ -437,6 +451,7 @@ def __init__(self,
         self.max_distance = max_distance
         self.num_buckets = num_buckets
         self.rotary_embedding_base = rotary_embedding_base
+        self.rotary_embedding_base_local = rotary_embedding_base_local
         self.rotary_embedding_scaling = rotary_embedding_scaling
         self.rotary_embedding_scale_type = RotaryScalingType.none
         self.rotary_embedding_scale = 1.0
@@ -677,6 +692,29 @@ def create_attention_const_params(model_cls, config):
                           dtype='float32',
                           is_buffer=True))

+        rotary_embedding_base_local = getattr(config, 'rope_local_base_freq', None)
+        if rotary_embedding_base_local is not None:
+            embed_positions_local = RopeEmbeddingUtils.create_sinusoidal_positions(
+                max_position_embeddings,
+                rotary_embedding_dim,
+            )
+            rotary_inv_freq_local, embed_positions_for_gpt_attention_local = RopeEmbeddingUtils.create_sinusoidal_positions_for_attention_plugin(
+                max_position_embeddings, rotary_embedding_dim,
+                rotary_embedding_base_local, rotary_embedding_scale,
+                rotary_embedding_scale_type, rotary_embedding_scaling)
+            model_cls.register_parameter(
+                'embed_positions_local',
+                Parameter(embed_positions_local, dtype='float32', is_buffer=True))
+            model_cls.register_parameter(
+                'rotary_inv_freq_local',
+                Parameter(rotary_inv_freq_local, dtype='float32', is_buffer=True))
+            model_cls.register_parameter(
+                'embed_positions_for_gpt_attention_local',
+                Parameter(embed_positions_for_gpt_attention_local,
+                          dtype='float32',
+                          is_buffer=True))
+
+
     @staticmethod
     def fill_attention_params(model_cls, attention_params):
         if model_cls.position_embedding_type.is_rope():
@@ -695,7 +733,10 @@ def fill_attention_params(model_cls, attention_params):
             return attention_params.fill_attention_const_params_for_rope(
                 model_cls.embed_positions.value,
                 model_cls.rotary_inv_freq.value,
-                model_cls.embed_positions_for_gpt_attention.value)
+                model_cls.embed_positions_for_gpt_attention.value,
+                model_cls.embed_positions_local.value,
+                model_cls.rotary_inv_freq_local.value,
+                model_cls.embed_positions_for_gpt_attention_local.value)
         # Fill nothing.
         return attention_params

@@ -1020,6 +1061,9 @@ def compute_cross_kv(encoder_output):
             # Rotary cos/sin cache.
             rotary_cos_sin = getattr(attention_params,
                                      "embed_positions_for_gpt_attention", None)
+            rotary_inv_freq_local = getattr(attention_params, "rotary_inv_freq_local", None)
+            rotary_cos_sin_local = getattr(attention_params,
+                                           "embed_positions_for_gpt_attention_local", None)

             long_rope_rotary_inv_freq = getattr(attention_params,
                                                 "long_rope_rotary_inv_freq",
@@ -1037,6 +1081,9 @@ def compute_cross_kv(encoder_output):
             assert (rotary_inv_freq is not None) and (
                 rotary_cos_sin is not None
             ), "rotary_inv_freq and embed_positions_for_gpt_attention must be provided."
+            assert (rotary_inv_freq_local is not None) and (
+                rotary_cos_sin_local is not None
+            ), "rotary_inv_freq_local and embed_positions_for_gpt_attention_local must be provided."
             if self.position_embedding_type == PositionEmbeddingType.long_rope:
                 assert long_rope_rotary_inv_freq is not None
                 assert long_rope_rotary_cos_sin is not None
@@ -1062,7 +1109,7 @@ def compute_cross_kv(encoder_output):
                 hidden_size_per_head=self.attention_head_size,
                 q_scaling=self.q_scaling,
                 rotary_embedding_dim=self.rotary_embedding_dim,
-                rotary_embedding_base=self.rotary_embedding_base,
+                rotary_embedding_base=self.rotary_embedding_base if not self.is_local else self.rotary_embedding_base_local,
                 rotary_embedding_scale_type=self.rotary_embedding_scale_type,
                 rotary_embedding_short_m_scale=attention_params.short_mscale,
                 rotary_embedding_long_m_scale=attention_params.long_mscale,
@@ -1071,8 +1118,8 @@ def compute_cross_kv(encoder_output):
                 rotary_embedding_original_max_positions=self.
                 original_max_position_embeddings,
                 position_embedding_type=self.position_embedding_type,
-                rotary_inv_freq=rotary_inv_freq,
-                rotary_cos_sin=rotary_cos_sin,
+                rotary_inv_freq=rotary_inv_freq if not self.is_local else rotary_inv_freq_local,
+                rotary_cos_sin=rotary_cos_sin if not self.is_local else rotary_cos_sin_local,
                 kv_orig_quant_scale=kv_orig_quant_scale,
                 kv_quant_orig_scale=kv_quant_orig_scale,
                 attention_output_orig_quant_scale=self.
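The new `*_local` tables registered above hold the same kind of sinusoidal RoPE caches as the existing ones, just computed with a different base frequency, so sliding-window (local) layers rotate positions on a shorter wavelength than global layers. A rough sketch of the quantity involved, assuming the standard RoPE inverse-frequency formula; the two base values are placeholders, not taken from this commit:

    import numpy as np

    # inv_freq[i] = base ** (-2i / rotary_dim); only the base differs between the
    # global table and the local table created in create_attention_const_params.
    def rope_inv_freq(base: float, rotary_dim: int = 128) -> np.ndarray:
        return base ** (-np.arange(0, rotary_dim, 2, dtype=np.float64) / rotary_dim)

    inv_freq_global = rope_inv_freq(base=1_000_000.0)  # e.g. config.rotary_base
    inv_freq_local = rope_inv_freq(base=10_000.0)      # e.g. config.rope_local_base_freq

At attention time the plugin receives either the global or the local pair, selected per layer through the new `is_local` flag (`rotary_inv_freq if not self.is_local else rotary_inv_freq_local`).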

tensorrt_llm/models/__init__.py

+2 −1

@@ -33,7 +33,7 @@
 from .enc_dec.model import DecoderModel, EncoderModel, WhisperEncoder
 from .falcon.config import FalconConfig
 from .falcon.model import FalconForCausalLM, FalconModel
-from .gemma.config import GEMMA2_ARCHITECTURE, GEMMA_ARCHITECTURE, GemmaConfig
+from .gemma.config import GEMMA3_ARCHITECTURE, GEMMA2_ARCHITECTURE, GEMMA_ARCHITECTURE, GemmaConfig
 from .gemma.model import GemmaForCausalLM
 from .gpt.config import GPTConfig
 from .gpt.model import GPTForCausalLM, GPTModel
@@ -183,6 +183,7 @@
     'SkyworkForCausalLM': LLaMAForCausalLM,
     GEMMA_ARCHITECTURE: GemmaForCausalLM,
     GEMMA2_ARCHITECTURE: GemmaForCausalLM,
+    GEMMA3_ARCHITECTURE: GemmaForCausalLM,
     'QWenLMHeadModel': QWenForCausalLM,
     'QWenForCausalLM': QWenForCausalLM,
     'Qwen2ForCausalLM': QWenForCausalLM,
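With the registry entry above, a checkpoint whose config reports the Gemma3 architecture resolves to the existing GemmaForCausalLM implementation. A sketch of that lookup, assuming the dict being edited here is this module's MODEL_MAP registry (the dict's name is not visible in this excerpt):

    # Assumption: MODEL_MAP is the architecture-to-class dict shown in the hunk above.
    from tensorrt_llm.models import GEMMA3_ARCHITECTURE, GemmaForCausalLM, MODEL_MAP

    assert MODEL_MAP[GEMMA3_ARCHITECTURE] is GemmaForCausalLM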

tensorrt_llm/models/gemma/config.py

+31 −4

@@ -12,13 +12,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING, Optional, Union, List

 from tensorrt_llm.functional import PositionEmbeddingType
 from tensorrt_llm.logger import logger
 from tensorrt_llm.mapping import Mapping
 from tensorrt_llm.models.convert_utils import infer_dtype
-from tensorrt_llm.models.modeling_utils import (Gemma2ConfigGroup,
+from tensorrt_llm.models.modeling_utils import (Gemma2ConfigGroup, Gemma3ConfigGroup,
                                                 PretrainedConfig, QuantConfig)

 if TYPE_CHECKING:
@@ -30,6 +30,7 @@

 GEMMA_ARCHITECTURE = "GemmaForCausalLM"
 GEMMA2_ARCHITECTURE = "Gemma2ForCausalLM"
+GEMMA3_ARCHITECTURE = "Gemma3ForCausalLM"


 class GemmaConfig(PretrainedConfig):
@@ -48,6 +49,9 @@ def __init__(
         final_logit_softcapping: Optional[float] = None,
         attn_logit_softcapping: Optional[float] = None,
         mapping: Optional[Union[Mapping, dict]] = None,
+        sliding_window_pattern: int = None,
+        rope_local_base_freq: int = None,
+        sliding_window: int = None,
         **kwargs,
     ):
         use_parallel_embedding = False
@@ -85,17 +89,26 @@ def __init__(
             self.query_pre_attn_scalar = query_pre_attn_scalar
             self.final_logit_softcapping = final_logit_softcapping
             self.attn_logit_softcapping = attn_logit_softcapping
+        elif self.is_gemma_3:
+            self.inter_layernorms = True
+            assert query_pre_attn_scalar is not None, "Gemma3 models must configure `query_pre_attn_scalar`"
+            self.query_pre_attn_scalar = query_pre_attn_scalar
+            self.final_logit_softcapping = final_logit_softcapping
+            self.sliding_window_pattern = sliding_window_pattern
+            self.rope_local_base_freq = rope_local_base_freq
+            self.sliding_window = sliding_window

     GEMMA_ADDED_FIELDS = {
         "rotary_base", "rotary_scaling", "attn_bias", "mlp_bias",
         "inter_layernorms"
     }
     GEMMA2_ADDED_FIELDS = Gemma2ConfigGroup.keys()
+    GEMMA3_ADDED_FIELDS = Gemma3ConfigGroup.keys()
     VERBATIM = {
         "num_hidden_layers", "num_attention_heads", "hidden_size",
         "intermediate_size", "vocab_size", "max_position_embeddings",
         "hidden_act", "use_parallel_embedding"
-    } | GEMMA2_ADDED_FIELDS
+    } | GEMMA2_ADDED_FIELDS | GEMMA3_ADDED_FIELDS

     @property
     def is_gemma_2(self) -> bool:
@@ -106,6 +119,15 @@ def gemma2_config(self):
             return self.get_config_group(Gemma2ConfigGroup)
         return None

+    @property
+    def is_gemma_3(self) -> bool:
+        return self.architecture == GEMMA3_ARCHITECTURE
+
+    def gemma3_config(self):
+        if self.is_gemma_3:
+            return self.get_config_group(Gemma3ConfigGroup)
+        return None
+
     def to_dict(self):
         """Serialize the fields added in GemmaConfig"""

@@ -118,7 +140,11 @@ def to_dict(self):
             **({
                 f: getattr(self, f)
                 for f in self.GEMMA2_ADDED_FIELDS
-            } if self.is_gemma_2 else {})
+            } if self.is_gemma_2 else {}),
+            **({
+                f: getattr(self, f)
+                for f in self.GEMMA3_ADDED_FIELDS
+            } if self.is_gemma_3 else {})
         }

     @classmethod
@@ -148,6 +174,7 @@ def from_hugging_face(
             norm_epsilon=hf_config.rms_norm_eps,
             num_key_value_heads=getattr(hf_config, "num_key_value_heads",
                                         hf_config.num_attention_heads),
+            rotary_base=getattr(hf_config, "rope_theta", 10000.0),
             rotary_scaling=getattr(hf_config, "rotary_scaling", None),
             quantization=quant_config,
             mapping=mapping,
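For orientation, the Gemma3-related fields that GemmaConfig now understands, written as a Hugging Face-style config excerpt. Only the key names come from this diff; the values are illustrative placeholders, and only rope_theta is shown being forwarded (as rotary_base) in the from_hugging_face hunk above:

    hf_config_excerpt = {
        "architectures": ["Gemma3ForCausalLM"],  # selects is_gemma_3
        "query_pre_attn_scalar": 256,            # required; asserted in __init__
        "rope_theta": 1_000_000.0,               # -> rotary_base (global RoPE base)
        "rope_local_base_freq": 10_000.0,        # RoPE base for sliding layers
        "sliding_window": 512,                   # local attention window
        "sliding_window_pattern": 6,             # every 6th layer attends globally
    }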

tensorrt_llm/models/gemma/convert.py

+6 −0

@@ -317,6 +317,10 @@ def rename_to_trt_llm(self, name: str) -> Optional[str]:
              None),  # merged with above
             (r"model.layers.(\d+).self_attn.o_proj.weight",
              r"layers.\1.attention.dense.weight"),
+            (r"model.layers.(\d+).self_attn.q_norm.weight",
+             r"layers.\1.attention.q_layernorm.weight"),
+            (r"model.layers.(\d+).self_attn.k_norm.weight",
+             r"layers.\1.attention.k_layernorm.weight"),
             (r"model.layers.(\d+).mlp.gate_proj.weight",
              r"layers.\1.mlp.fc.weight"),
             (r"model.layers.(\d+).mlp.up_proj.weight",
@@ -795,6 +799,8 @@ def load_gemma_weights(
                 "pre_feedforward_layernorm",
                 "post_feedforward_layernorm",
                 "model.norm.weight",
+                "q_norm.weight",
+                "k_norm.weight",
         )):
             param = param + 1.0  # upcasted to float32 in case of bfloat16
             add_trt_llm_weight(weights, trt_llm_name, param,
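A quick, self-contained check (not part of the commit) that the two new rename rules map the Hugging Face q_norm/k_norm weights onto the q_layernorm/k_layernorm parameters created by qk_layernorm=True in model.py:

    import re

    rules = [
        (r"model.layers.(\d+).self_attn.q_norm.weight",
         r"layers.\1.attention.q_layernorm.weight"),
        (r"model.layers.(\d+).self_attn.k_norm.weight",
         r"layers.\1.attention.k_layernorm.weight"),
    ]

    name = "model.layers.7.self_attn.q_norm.weight"
    for pattern, repl in rules:
        if re.fullmatch(pattern, name):
            print(re.sub(pattern, repl, name))  # layers.7.attention.q_layernorm.weight

The second hunk adds the same norms to the list of names that receive the + 1.0 offset, consistent with how the other Gemma norm weights are already loaded.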

tensorrt_llm/models/gemma/model.py

+19 −4

@@ -24,7 +24,7 @@
 from ..._common import default_net
 from ..._utils import pad_vocab_size
 from ...functional import (AllReduceFusionOp, AllReduceParams, Tensor, cast,
-                           recv, send)
+                           recv, send, LayerNormType)
 from ...layers import (Attention, AttentionMaskType, AttentionParams,
                        ColumnLinear, Embedding, GatedMLP, KeyValueCacheParams,
                        LoraParams, PositionEmbeddingType, RmsNorm)
@@ -56,32 +56,48 @@ def __init__(self, config: GemmaConfig, layer_idx: int):

         q_scaling = 1.0
         max_attn_value = 0.0
+        qk_layernorm = False
+        is_sliding = False
+        rotary_base = config.rotary_base
+        rotary_base_local = None

         gemma2_config = config.gemma2_config()
+        gemma3_config = config.gemma3_config()
         if gemma2_config:
             q_scaling = math.sqrt(
                 gemma2_config.query_pre_attn_scalar) / math.sqrt(
                     config.head_size)
             max_attn_value = config.attn_logit_softcapping or 0.0
+        elif gemma3_config:
+            qk_layernorm = True
+            q_scaling = math.sqrt(
+                gemma3_config.query_pre_attn_scalar) / math.sqrt(
+                    config.head_size)
+            is_sliding = bool((layer_idx + 1) % gemma3_config.sliding_window_pattern)
+            rotary_base_local = config.rope_local_base_freq

         self.attention = Attention(
             local_layer_idx=self.local_layer_idx,
             hidden_size=config.hidden_size,
             num_attention_heads=config.num_attention_heads,
             num_kv_heads=config.num_key_value_heads,
             attention_head_size=config.head_size,
+            qk_layernorm=qk_layernorm,
+            layernorm_type=LayerNormType.RmsNorm,
             max_position_embeddings=config.max_position_embeddings,
             dtype=config.dtype,
             attention_mask_type=AttentionMaskType.causal,
             bias=config.attn_bias,
             position_embedding_type=PositionEmbeddingType.rope_gpt_neox,
-            rotary_embedding_base=config.rotary_base,
+            rotary_embedding_base=rotary_base,
+            rotary_embedding_base_local=rotary_base_local,
             rotary_embedding_scaling=config.rotary_scaling,
             tp_group=config.mapping.tp_group,
             tp_size=config.mapping.tp_size,
             quant_mode=config.quant_mode,
             q_scaling=q_scaling,
             max_attn_value=max_attn_value,
+            is_local=is_sliding,
         )

         mlp_hidden_size = config.hidden_size * 4 if config.intermediate_size is None else config.intermediate_size
@@ -223,8 +239,7 @@ def forward(self,

         if self.mapping.is_first_pp_rank():
             hidden_states = self.vocab_embedding(input_ids, *ptuning_args)
-            hidden_states = cast(hidden_states * math.sqrt(self.hidden_size),
-                                 hidden_states.dtype)
+            hidden_states = cast(hidden_states * math.sqrt(self.hidden_size), hidden_states.dtype)
         else:
             hidden_states = recv(hidden_states, self.mapping.prev_pp_rank())
         hidden_states = self.layers.forward(
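The two Gemma3-specific quantities the decoder layer derives above, worked through with illustrative placeholder values (the real ones come from the checkpoint's config, not from this commit):

    import math

    query_pre_attn_scalar = 256   # placeholder, model-specific
    head_size = 256               # placeholder, model-specific
    sliding_window_pattern = 6    # placeholder, model-specific

    # Same expression as the elif gemma3_config branch; with these placeholders it is 1.0.
    q_scaling = math.sqrt(query_pre_attn_scalar) / math.sqrt(head_size)

    # Same expression as is_sliding: every sliding_window_pattern-th layer is global.
    for layer_idx in range(12):
        kind = "local" if (layer_idx + 1) % sliding_window_pattern else "global"
        print(layer_idx, kind)    # layers 5 and 11 print "global"

Sliding layers pass is_local=True into Attention and therefore pick up rotary_embedding_base_local, while global layers keep the plain rotary_embedding_base.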
