ModelCloud · Qubitium · Feb 21, 2025 · Feb 21, 2025 · Feb 21, 2025 · Feb 21, 2025
diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
@@ -417,6 +417,7 @@ jobs:
             if [ "$gpu_id" -lt 0 ]; then
               echo "http://${{ needs.check-vm.outputs.ip }}/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp&test=${{ matrix.test_script }}&runner=${RUNNER_NAME}&exclusive=${{ github.event.inputs.exclusive-gpu }} returned $gpu_id"
               echo "No available GPU, waiting 5 seconds..."
+              curl -sS "http://${{ needs.check-vm.outputs.ip }}/gpu/status2" || true  # diagnostic only: quiet progress meter, never abort the wait loop
               sleep 5
             else
               echo "Allocated GPU ID: $gpu_id"
@@ -428,6 +429,7 @@ jobs:
           echo "CUDA_VISIBLE_DEVICES=$gpu_id" >> $GITHUB_ENV
           echo "STEP_TIMESTAMP=$timestamp" >> $GITHUB_ENV
           echo "CUDA_VISIBLE_DEVICES set to $gpu_id, timestamp=$timestamp"
+          curl -sS "http://${{ needs.check-vm.outputs.ip }}/gpu/status2" || true  # diagnostic only: a failed status dump must not fail the step
 
       - name: Run tests
         if: ${{ (!github.event.inputs.test_names || contains(github.event.inputs.test_names, matrix.test_script)) && !cancelled() }}
@@ -563,7 +565,7 @@ jobs:
             uv pip install colorlog
           fi
 
-          if [[ "${{ matrix.test_script }}" == "test_sglang.py" ]]; then
+          if [[ "${{ matrix.test_script }}" == "test_sglang" ]]; then
             uv pip install transformers==4.48.3
           fi
 
@@ -616,6 +618,7 @@ jobs:
             if [ "$gpu_id" -lt 0 ]; then
               echo "http://${{ needs.check-vm.outputs.ip }}/gpu/get?id=${{ github.run_id }}&timestamp=$timestamp&test=${{ matrix.test_script }}&runner=${RUNNER_NAME}&exclusive=${{ github.event.inputs.exclusive-gpu }} returned $gpu_id"
               echo "No available GPU, waiting 5 seconds..."
+              curl -sS "http://${{ needs.check-vm.outputs.ip }}/gpu/status2" || true  # diagnostic only: quiet progress meter, never abort the wait loop
               sleep 5
             else
               echo "Allocated GPU ID: $gpu_id"
@@ -627,10 +630,14 @@ jobs:
           echo "CUDA_VISIBLE_DEVICES=$gpu_id" >> $GITHUB_ENV
           echo "STEP_TIMESTAMP=$timestamp" >> $GITHUB_ENV
           echo "CUDA_VISIBLE_DEVICES set to $gpu_id, timestamp=$timestamp"
+          curl -sS "http://${{ needs.check-vm.outputs.ip }}/gpu/status2" || true  # diagnostic only: a failed status dump must not fail the step
 
       - name: Run tests
         if: ${{ (!github.event.inputs.test_names || contains(github.event.inputs.test_names, matrix.test_script)) && !cancelled() }}
         run: |
+          if [[ "${{ matrix.test_script }}" == *ipex* ]]; then
+            export CUDA_VISIBLE_DEVICES=""
+          fi
           if [[ "${{ matrix.test_script }}" == *xpu* ]]; then
             export CUDA_VISIBLE_DEVICES=""
             source /etc/profile.d/pyenv.sh && pyenv activate xpu

diff --git a/tests/test_packing.py b/tests/test_packing.py
@@ -100,15 +100,6 @@ def pack(self, qlinearCls):
         return qlinear
 
     def test_compare_exllama_triton_torch(self):
-        # validate exllama packer
-        exllama_linear = self.pack(ExllamaQuantLinear)
-
-        dequantized_weight, dequantized_qzeros = dequantize_4bits_weight(exllama_linear)
-        dequantized_weight = dequantized_weight.to(torch.float16)
-
-        self.assertTrue(torch.equal(dequantized_weight, self.linear.weight))
-        self.assertTrue(torch.all(dequantized_qzeros == 8))
-
         triton_linear = self.pack(TritonV2QuantLinear)
 
         dequantized_weight, dequantized_qzeros = dequantize_4bits_weight(triton_linear)
@@ -117,10 +108,6 @@ def test_compare_exllama_triton_torch(self):
         self.assertTrue(torch.equal(dequantized_weight, self.linear.weight))
         self.assertTrue(torch.all(dequantized_qzeros == 8))
 
-        self.assertTrue(torch.allclose(exllama_linear.qweight, triton_linear.qweight))
-        self.assertTrue(torch.allclose(exllama_linear.scales, triton_linear.scales))
-        self.assertTrue(torch.allclose(exllama_linear.qzeros, triton_linear.qzeros))
-
         # validate torch packer
         torch_linear = self.pack(TorchQuantLinear)
 
@@ -130,6 +117,6 @@ def test_compare_exllama_triton_torch(self):
         self.assertTrue(torch.equal(dequantized_weight, self.linear.weight))
         self.assertTrue(torch.all(dequantized_qzeros == 8))
 
-        self.assertTrue(torch.allclose(exllama_linear.qweight, torch_linear.qweight))
-        self.assertTrue(torch.allclose(exllama_linear.scales, torch_linear.scales))
-        self.assertTrue(torch.allclose(exllama_linear.qzeros, torch_linear.qzeros))
+        self.assertTrue(torch.allclose(triton_linear.qweight, torch_linear.qweight))
+        self.assertTrue(torch.allclose(triton_linear.scales, torch_linear.scales))
+        self.assertTrue(torch.allclose(triton_linear.qzeros, torch_linear.qzeros))