Commit af27b21

optimize pack (#1153)

* optimize torch packer
* revert change
* optimize exllama pack
* optimize triton pack
* optimize ipex pack
* clean test
* update test codes
* add test_pack_speed

Co-authored-by: CSY <[email protected]>

1 parent e478d58 · commit af27b21

5 files changed, +127 -137 lines changed
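
All four kernels below receive the same refactor: the `while` loops that packed quantized values by threading two hand-advanced counters (a destination index `row`/`col` plus a flat source index `i`) become `for` loops that derive `i` from the loop variable, so every iteration is independent and the per-iteration counter bookkeeping disappears. A minimal standalone sketch of the before/after for the 2/4/8-bit case (shapes and names here are illustrative, not taken from the repo):

    import numpy as np

    bits = 4
    pack_factor = 32 // bits  # eight 4-bit values per 32-bit word

    rng = np.random.default_rng(0)
    intweight = rng.integers(0, 2**bits, size=(64, 16), dtype=np.uint32)

    # Before: while loop, counters advanced by hand.
    qweight_old = np.zeros((intweight.shape[0] // pack_factor, intweight.shape[1]), dtype=np.uint32)
    i = row = 0
    while row < qweight_old.shape[0]:
        for j in range(i, i + pack_factor):
            qweight_old[row] |= intweight[j] << (bits * (j - i))
        i += pack_factor
        row += 1

    # After: for loop, flat source index derived from the loop variable.
    qweight_new = np.zeros_like(qweight_old)
    for row in range(qweight_new.shape[0]):
        i = row * pack_factor
        for j in range(pack_factor):
            qweight_new[row] |= intweight[i + j] << (bits * j)

    assert np.array_equal(qweight_old, qweight_new)  # same bits, simpler loop

Each output row now depends only on `row`, so the loop body can be reasoned about (and in principle parallelized) independently.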

gptqmodel/nn_modules/qlinear/exllama.py (+8, -15)

@@ -180,28 +180,21 @@ def pack(self, linear, scales, zeros, g_idx=None):
         intweight = intweight.t().contiguous()
         intweight = intweight.numpy().astype(np.uint32)
 
-        i = 0
-        row = 0
         qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32)
-        while row < qweight.shape[0]:
-            for j in range(i, i + (32 // self.bits)):
-                qweight[row] |= intweight[j] << (self.bits * (j - i))
-            i += 32 // self.bits
-            row += 1
+        for row in range(qweight.shape[0]):
+            i = row * (32 // self.bits)
+            for j in range(32 // self.bits):
+                qweight[row] |= intweight[i + j] << (self.bits * j)
 
         qweight = qweight.astype(np.int32)
         self.qweight = torch.from_numpy(qweight)
 
         zeros = zeros.numpy().astype(np.uint32)
         qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32)
-        i = 0
-        col = 0
-        while col < qzeros.shape[1]:
-            for j in range(i, i + (32 // self.bits)):
-                qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
-            i += 32 // self.bits
-            col += 1
-
+        for col in range(qzeros.shape[1]):
+            i = col * (32 // self.bits)
+            for j in range(32 // self.bits):
+                qzeros[:, col] |= zeros[:, i + j] << (self.bits * j)
 
         qzeros = qzeros.astype(np.int32)
         self.qzeros = torch.from_numpy(qzeros)
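
The `qzeros` loop is the column-wise twin of the `qweight` loop above. For intuition, the whole loop is equivalent to a reshape-shift-OR over groups of `32 // bits` adjacent columns; a sketch of that equivalence follows (the vectorized form is mine for illustration, not code from the commit):

    import numpy as np

    bits = 4
    pack_factor = 32 // bits

    rng = np.random.default_rng(1)
    zeros = rng.integers(0, 2**bits, size=(8, 32), dtype=np.uint32)

    # Loop form, as in the new pack():
    qzeros_loop = np.zeros((zeros.shape[0], zeros.shape[1] // pack_factor), dtype=np.uint32)
    for col in range(qzeros_loop.shape[1]):
        i = col * pack_factor
        for j in range(pack_factor):
            qzeros_loop[:, col] |= zeros[:, i + j] << (bits * j)

    # Vectorized form: group adjacent columns, shift each lane into place, OR-reduce.
    shifts = np.arange(pack_factor, dtype=np.uint32) * np.uint32(bits)
    grouped = zeros.reshape(zeros.shape[0], -1, pack_factor)  # (rows, out_cols, lanes)
    qzeros_vec = np.bitwise_or.reduce(grouped << shifts, axis=-1)

    assert np.array_equal(qzeros_loop, qzeros_vec)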

gptqmodel/nn_modules/qlinear/ipex.py (+8, -15)

@@ -202,32 +202,25 @@ def pack(self, linear, scales, zeros, g_idx=None):
             self.bias = linear.bias.clone().to(dtype=linear.weight.dtype)
 
         intweight = torch.round((W + scale_zeros[self.g_idx].T) / scales[self.g_idx].T).to(torch.int)
-
         intweight = intweight.t().contiguous()
         intweight = intweight.numpy().astype(np.uint32)
 
-        i = 0
-        row = 0
         qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32)
-        while row < qweight.shape[0]:
-            for j in range(i, i + (32 // self.bits)):
-                qweight[row] |= intweight[j] << (self.bits * (j - i))
-            i += 32 // self.bits
-            row += 1
+        for row in range(qweight.shape[0]):
+            i = row * (32 // self.bits)
+            for j in range(32 // self.bits):
+                qweight[row] |= intweight[i + j] << (self.bits * j)
 
         qweight = qweight.astype(np.int32)
         self.qweight = torch.from_numpy(qweight)
 
         zeros -= 1
         zeros = zeros.numpy().astype(np.uint32)
         qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32)
-        i = 0
-        col = 0
-        while col < qzeros.shape[1]:
-            for j in range(i, i + (32 // self.bits)):
-                qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
-            i += 32 // self.bits
-            col += 1
+        for col in range(qzeros.shape[1]):
+            i = col * (32 // self.bits)
+            for j in range(32 // self.bits):
+                qzeros[:, col] |= zeros[:, i + j] << (self.bits * j)
 
         qzeros = qzeros.astype(np.int32)
         self.qzeros = torch.from_numpy(qzeros)
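
The ipex version gets the identical loop rewrite; the only behavioral difference from the exllama file is the pre-existing `zeros -= 1` line (the GPTQ convention of storing zero-points offset by one, which the dequantization path compensates for). A tiny round trip on a single packed word makes the bit lanes concrete (illustrative only, assuming 4-bit):

    import numpy as np

    bits = 4
    pack_factor = 32 // bits
    vals = np.array([3, 7, 0, 15, 1, 2, 9, 4], dtype=np.uint32)  # eight 4-bit values

    # Pack: value j occupies bit positions [bits * j, bits * (j + 1)).
    word = np.uint32(0)
    for j in range(pack_factor):
        word |= vals[j] << np.uint32(bits * j)

    # Unpack: shift each lane back down and mask off `bits` bits.
    mask = np.uint32((1 << bits) - 1)
    unpacked = np.array([(word >> np.uint32(bits * j)) & mask for j in range(pack_factor)],
                        dtype=np.uint32)
    assert np.array_equal(vals, unpacked)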

gptqmodel/nn_modules/qlinear/torch.py (+42, -62)

@@ -14,7 +14,6 @@
 # limitations under the License.
 
 import math
-
 import numpy as np
 import torch
 import torch.nn as nn
@@ -26,7 +25,6 @@
 
 from ...models._const import DEVICE, PLATFORM
 
-
 logger = setup_logger()
 
 class TorchQuantLinear(BaseQuantLinear):
@@ -62,9 +60,7 @@ def __init__(
 
         self.infeatures = infeatures
         self.outfeatures = outfeatures
-
         self.padded_infeatures = infeatures + (-infeatures % group_size)
-
         self.bits = bits
         self.group_size = group_size if group_size != -1 else infeatures
         self.maxq = 2**self.bits - 1
@@ -99,7 +95,6 @@ def __init__(
         else:
             self.bias = None
 
-        # is performed by unpacking the weights and using torch.matmul
         if self.bits in [2, 4, 8]:
             self.wf = torch.tensor(list(range(0, 32, self.bits)), dtype=torch.int32).unsqueeze(0)
         elif self.bits == 3:
@@ -140,77 +135,61 @@ def pack(self, linear, scales, zeros, g_idx=None):
             self.bias = linear.bias.clone().to(dtype=linear.weight.dtype)
 
         intweight = torch.round((W + scale_zeros[self.g_idx].T) / scales[self.g_idx].T).to(torch.int)
-
         intweight = intweight.t().contiguous()
         intweight = intweight.numpy().astype(np.uint32)
 
-        i = 0
-        row = 0
         qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32)
-        while row < qweight.shape[0]:
-            if self.bits in [2, 4, 8]:
-                for j in range(i, i + (32 // self.bits)):
-                    qweight[row] |= intweight[j] << (self.bits * (j - i))
-                i += 32 // self.bits
-                row += 1
-            elif self.bits == 3:
-                for j in range(i, i + 10):
-                    qweight[row] |= intweight[j] << (3 * (j - i))
-                i += 10
-                qweight[row] |= intweight[i] << 30
-                row += 1
-                qweight[row] |= (intweight[i] >> 2) & 1
-                i += 1
-                for j in range(i, i + 10):
-                    qweight[row] |= intweight[j] << (3 * (j - i) + 1)
-                i += 10
-                qweight[row] |= intweight[i] << 31
+        if self.bits in [2, 4, 8]:
+            bits_div = 32 // self.bits
+            for row in range(qweight.shape[0]):
+                for j in range(bits_div):
+                    qweight[row] |= intweight[row * bits_div + j] << (self.bits * j)
+        elif self.bits == 3:
+            for row in range(qweight.shape[0]):
+                row_offset = row * 10  # Cache row * 10
+                row_offset_plus_10 = row_offset + 10  # Cache row * 10 + 10
+                for j in range(10):
+                    qweight[row] |= intweight[row_offset + j] << (3 * j)
+                qweight[row] |= intweight[row_offset_plus_10] << 30
                 row += 1
-                qweight[row] |= (intweight[i] >> 1) & 0x3
-                i += 1
-                for j in range(i, i + 10):
-                    qweight[row] |= intweight[j] << (3 * (j - i) + 2)
-                i += 10
+                qweight[row] |= (intweight[row_offset_plus_10] >> 2) & 1
+                for j in range(10):
+                    qweight[row] |= intweight[row_offset + j] << (3 * j + 1)
+                qweight[row] |= intweight[row_offset_plus_10] << 31
                 row += 1
+                qweight[row] |= (intweight[row_offset_plus_10] >> 1) & 0x3
+                for j in range(10):
+                    qweight[row] |= intweight[row_offset + j] << (3 * j + 2)
 
-        qweight = qweight.astype(np.int32)
-        self.qweight = torch.from_numpy(qweight)
+        self.qweight = torch.from_numpy(qweight.astype(np.int32))
 
         zeros = zeros.numpy().astype(np.uint32)
         qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32)
-        i = 0
-        col = 0
-        while col < qzeros.shape[1]:
-            if self.bits in [2, 4, 8]:
-                for j in range(i, i + (32 // self.bits)):
-                    qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
-                i += 32 // self.bits
-                col += 1
-            elif self.bits == 3:
-                for j in range(i, i + 10):
-                    qzeros[:, col] |= zeros[:, j] << (3 * (j - i))
-                i += 10
-                qzeros[:, col] |= zeros[:, i] << 30
-                col += 1
-                qzeros[:, col] |= (zeros[:, i] >> 2) & 1
-                i += 1
-                for j in range(i, i + 10):
-                    qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 1)
-                i += 10
-                qzeros[:, col] |= zeros[:, i] << 31
+        if self.bits in [2, 4, 8]:
+            bits_div = 32 // self.bits
+            for col in range(qzeros.shape[1]):
+                for j in range(bits_div):
+                    qzeros[:, col] |= zeros[:, col * bits_div + j] << (self.bits * j)
+        elif self.bits == 3:
+            for col in range(qzeros.shape[1]):
+                col_offset = col * 10  # Cache col * 10
+                col_offset_plus_10 = col_offset + 10  # Cache col * 10 + 10
+                for j in range(10):
+                    qzeros[:, col] |= zeros[:, col_offset + j] << (3 * j)
+                qzeros[:, col] |= zeros[:, col_offset_plus_10] << 30
                 col += 1
-                qzeros[:, col] |= (zeros[:, i] >> 1) & 0x3
-                i += 1
-                for j in range(i, i + 10):
-                    qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 2)
-                i += 10
+                qzeros[:, col] |= (zeros[:, col_offset_plus_10] >> 2) & 1
+                for j in range(10):
+                    qzeros[:, col] |= zeros[:, col_offset + j] << (3 * j + 1)
+                qzeros[:, col] |= zeros[:, col_offset_plus_10] << 31
                 col += 1
+                qzeros[:, col] |= (zeros[:, col_offset_plus_10] >> 1) & 0x3
+                for j in range(10):
+                    qzeros[:, col] |= zeros[:, col_offset + j] << (3 * j + 2)
 
-        qzeros = qzeros.astype(np.int32)
-        self.qzeros = torch.from_numpy(qzeros)
+        self.qzeros = torch.from_numpy(qzeros.astype(np.int32))
 
     def forward(self, x: torch.Tensor):
-        # if infeatures is padded, we need to pad the input as well
         if x.size(-1) != self.padded_infeatures:
             x = F.pad(x, (0, self.padded_infeatures - self.infeatures))
@@ -241,6 +220,7 @@ def _empty_gptq_only_weights(self):
     def dequantize_weight(self, num_itr=1):
         if self.wf.device != self.qzeros.device:
             self.wf = self.wf.to(self.qzeros.device)
+
         if self.bits in [2, 4, 8]:
             zeros = torch.bitwise_right_shift(
                 torch.unsqueeze(self.qzeros, 2).expand(-1, -1, 32 // self.bits),
@@ -293,4 +273,4 @@ def dequantize_weight(self, num_itr=1):
 
         return weights
 
-__all__ = ["TorchQuantLinear"]
+__all__ = ["TorchQuantLinear"]
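
torch.py is the only kernel here with a 3-bit path, and it is the awkward one: 32 three-bit values fill exactly three 32-bit words (96 bits), and the 11th and 22nd values straddle word boundaries, which is why each packed row ends with a shifted tail and the next row begins by picking up the leftover high bits. A standalone sketch of that layout for one 3-row group, following the original while-loop's indexing (illustrative, not the repo's code):

    import numpy as np

    def pack3(vals: np.ndarray) -> np.ndarray:
        """Pack 32 three-bit values (uint32) into three 32-bit words."""
        assert vals.shape == (32,)
        w = np.zeros(3, dtype=np.uint32)
        for j in range(10):                      # values 0..9 -> bits 0..29 of word 0
            w[0] |= vals[j] << (3 * j)
        w[0] |= vals[10] << 30                   # low 2 bits of value 10
        w[1] |= (vals[10] >> 2) & 1              # high bit of value 10
        for j in range(10):                      # values 11..20 -> bits 1..30 of word 1
            w[1] |= vals[11 + j] << (3 * j + 1)
        w[1] |= vals[21] << 31                   # low bit of value 21
        w[2] |= (vals[21] >> 1) & 0x3            # high 2 bits of value 21
        for j in range(10):                      # values 22..31 -> bits 2..31 of word 2
            w[2] |= vals[22 + j] << (3 * j + 2)
        return w

    vals = np.arange(32, dtype=np.uint32) % 8
    packed = pack3(vals)
    # Reassemble value 10 from its two halves to verify the straddle:
    v10 = (packed[0] >> 30) | ((packed[1] & 1) << 2)
    assert v10 == vals[10]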

gptqmodel/nn_modules/qlinear/tritonv2.py (+8, -16)

@@ -139,7 +139,6 @@ def post_init(self):
             self.g_idx = torch.tensor([i // self.group_size for i in range(self.padded_infeatures)], dtype=torch.int32,
                                       device=self.g_idx.device)
 
-
     def pack(self, linear, scales, zeros, g_idx=None):
         W = linear.weight.data.clone()
         if isinstance(linear, nn.Conv2d):
@@ -157,31 +156,24 @@ def pack(self, linear, scales, zeros, g_idx=None):
             self.bias = linear.bias.clone().half()
 
         intweight = torch.round((W + scale_zeros[self.g_idx].T) / scales[self.g_idx].T).to(torch.int)
-
         intweight = intweight.t().contiguous()
         intweight = intweight.numpy().astype(np.uint32)
 
-        i = 0
-        row = 0
         qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32)
-        while row < qweight.shape[0]:
-            for j in range(i, i + (32 // self.bits)):
-                qweight[row] |= intweight[j] << (self.bits * (j - i))
-            i += 32 // self.bits
-            row += 1
+        for row in range(qweight.shape[0]):
+            i = row * (32 // self.bits)
+            for j in range(32 // self.bits):
+                qweight[row] |= intweight[i + j] << (self.bits * j)
 
         qweight = qweight.astype(np.int32)
         self.qweight = torch.from_numpy(qweight)
 
         zeros = zeros.numpy().astype(np.uint32)
         qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32)
-        i = 0
-        col = 0
-        while col < qzeros.shape[1]:
-            for j in range(i, i + (32 // self.bits)):
-                qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
-            i += 32 // self.bits
-            col += 1
+        for col in range(qzeros.shape[1]):
+            i = col * (32 // self.bits)
+            for j in range(32 // self.bits):
+                qzeros[:, col] |= zeros[:, i + j] << (self.bits * j)
 
         qzeros = qzeros.astype(np.int32)
         self.qzeros = torch.from_numpy(qzeros)
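
The commit message also mentions an added `test_pack_speed`; that file is not shown on this page, but a micro-benchmark in its spirit could look like the following (a hypothetical sketch, not the repo's test; the vectorized reference exists only to check the packed bits):

    import time
    import numpy as np

    def pack_rows(intweight: np.ndarray, bits: int) -> np.ndarray:
        """New-style row packer, matching the for-loops in the diffs above."""
        qweight = np.zeros((intweight.shape[0] // 32 * bits, intweight.shape[1]), dtype=np.uint32)
        pf = 32 // bits
        for row in range(qweight.shape[0]):
            i = row * pf
            for j in range(pf):
                qweight[row] |= intweight[i + j] << (bits * j)
        return qweight

    def test_pack_speed():
        bits = 4
        rng = np.random.default_rng(0)
        intweight = rng.integers(0, 2**bits, size=(4096, 4096), dtype=np.uint32)

        t0 = time.perf_counter()
        qweight = pack_rows(intweight, bits)
        print(f"pack_rows: {time.perf_counter() - t0:.3f}s")

        # Bit-exact check against a vectorized reference.
        shifts = np.arange(32 // bits, dtype=np.uint32) * np.uint32(bits)
        grouped = intweight.reshape(-1, 32 // bits, intweight.shape[1])
        ref = np.bitwise_or.reduce(grouped << shifts[None, :, None], axis=1)
        assert np.array_equal(qweight, ref)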
