@@ -62,11 +62,14 @@ def setUpClass(cls):
        cls.device = torch.device('cuda')

    def _create_mlp_inputs(self):
+        # Initialize in FP32 first for better numerical stability
        hidden_states = torch.rand(
            size=[self.batch_size, self.seq_len, self.hidden_size],
-            dtype=self.torch_dtype,
-            device='cuda')
+            dtype=torch.float32,
+            device='cuda') * 0.01  # Use smaller scale

+        # Convert to target dtype after initialization
+        hidden_states = hidden_states.to(self.torch_dtype)
        return hidden_states

    def _create_lora_params(self):
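Note: below is a minimal standalone sketch of the init-in-FP32-then-cast pattern applied in this hunk. The helper name make_inputs and the default torch.float16 target dtype are illustrative assumptions, not part of the diff.

import torch

def make_inputs(batch_size, seq_len, hidden_size, target_dtype=torch.float16):
    # Assumed helper (not in the diff): draw uniform values in FP32 and
    # shrink them so downstream matmuls stay well inside the target
    # dtype's representable range.
    x = torch.rand(batch_size, seq_len, hidden_size,
                   dtype=torch.float32, device='cuda') * 0.01
    # Cast to the working dtype only after initialization.
    return x.to(target_dtype)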
@@ -83,35 +86,55 @@ def _create_lora_params(self):

        # Create weights for up projection
        lora_weight_ins_up = [
-            (torch.rand(self.hidden_size, lora_rank, device=self.device).to(
-                self.torch_dtype) * 0.1) for lora_rank in lora_ranks_list
+            # Initialize with FP32 and smaller scale (0.01 instead of 0.1)
+            torch.rand(self.hidden_size,
+                       lora_rank,
+                       device=self.device,
+                       dtype=torch.float32) * 0.01
+            for lora_rank in lora_ranks_list
        ]
        lora_weight_outs_up = [
-            (torch.rand(lora_rank, self.intermediate_size,
-                        device=self.device).to(self.torch_dtype) * 0.1)
+            torch.rand(lora_rank,
+                       self.intermediate_size,
+                       device=self.device,
+                       dtype=torch.float32) *
+            (0.01 / max(lora_rank, 1))  # Scale by rank
            for lora_rank in lora_ranks_list
        ]

        # Create weights for down projection
        lora_weight_ins_down = [
-            (torch.rand(self.intermediate_size, lora_rank,
-                        device=self.device).to(self.torch_dtype) * 0.1)
+            torch.rand(self.intermediate_size,
+                       lora_rank,
+                       device=self.device,
+                       dtype=torch.float32) * 0.01
            for lora_rank in lora_ranks_list
        ]
+        # Apply rank-based scaling to output weights
        lora_weight_outs_down = [
-            (torch.rand(lora_rank, self.hidden_size, device=self.device).to(
-                self.torch_dtype) * 0.1) for lora_rank in lora_ranks_list
+            torch.rand(lora_rank,
+                       self.hidden_size,
+                       device=self.device,
+                       dtype=torch.float32) *
+            (0.01 / max(lora_rank, 1))  # Scale by rank
+            for lora_rank in lora_ranks_list
        ]

-        lora_weight_ins_up = [tmp.contiguous() for tmp in lora_weight_ins_up]
+        # Convert to target dtype after initialization
+        lora_weight_ins_up = [
+            tmp.to(self.torch_dtype).contiguous() for tmp in lora_weight_ins_up
+        ]
        lora_weight_outs_up = [
-            tmp.transpose(1, 0).contiguous() for tmp in lora_weight_outs_up
+            tmp.to(self.torch_dtype).transpose(1, 0).contiguous()
+            for tmp in lora_weight_outs_up
        ]
        lora_weight_ins_down = [
-            tmp.contiguous() for tmp in lora_weight_ins_down
+            tmp.to(self.torch_dtype).contiguous()
+            for tmp in lora_weight_ins_down
        ]
        lora_weight_outs_down = [
-            tmp.transpose(1, 0).contiguous() for tmp in lora_weight_outs_down
+            tmp.to(self.torch_dtype).transpose(1, 0).contiguous()
+            for tmp in lora_weight_outs_down
        ]

        lora_weights_pointers_up = []
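Note: below is a minimal sketch of the rank-based scaling used for the output-side LoRA weights in this hunk. The helper make_lora_pair and its default arguments are illustrative assumptions, not part of the diff.

import torch

def make_lora_pair(in_features, out_features, lora_rank,
                   target_dtype=torch.float16, device='cuda'):
    # Assumed helper (not in the diff).
    # Input-side weight: FP32 init with a small fixed scale.
    w_in = torch.rand(in_features, lora_rank,
                      dtype=torch.float32, device=device) * 0.01
    # Output-side weight: divide the scale by the rank so the magnitude of
    # (x @ w_in) @ w_out does not grow with lora_rank.
    w_out = torch.rand(lora_rank, out_features,
                       dtype=torch.float32,
                       device=device) * (0.01 / max(lora_rank, 1))
    # Cast to the working dtype; keep the output weight transposed and
    # contiguous, matching the layout produced in the test.
    return (w_in.to(target_dtype).contiguous(),
            w_out.to(target_dtype).transpose(1, 0).contiguous())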
@@ -163,10 +186,8 @@ def _setup_vanilla_pytorch_mlp(self):
        return mlp_module

    def test_mlp(self):
-        hidden_states = torch.rand(
-            size=[self.batch_size, self.seq_len, self.hidden_size],
-            dtype=self.torch_dtype,
-            device='cuda')
+        # Use the _create_mlp_inputs method for consistent initialization
+        hidden_states = self._create_mlp_inputs()

        lora_params = self._create_lora_params()