
Commit cf84811: Add cache quantization

1 parent: fc5b39c

14 files changed (+1156, -16 lines)

README.md

Lines changed: 1 addition & 2 deletions
@@ -6,7 +6,7 @@ This is an **early preview release** of ExLlamaV3. Please note:
 - The framework <u>is not yet fully optimized</u>. Performance is lacking, especially on Ampere, and there may be a significant CPU bottleneck on slower processors until the extension functions are fully built out.
 - AMD GPUs (ROCm) are not yet supported.
 - [FlashAttention-2](https://github.com/Dao-AILab/flash-attention) is currently required. I hope to switch over to [FlashInfer](https://github.com/flashinfer-ai/flashinfer/tree/main) in time, but there are some obstacles to overcome first.
-- A number of important features are yet to be added, such as cache quantization, tensor parallelism and multimodal support.
+- A number of important features are yet to be added, such as tensor parallelism and multimodal support.
 - There are no release builds yet.
 - Integration into [TabbyAPI](https://github.com/theroyallab/tabbyAPI/) is planned when all the core functionality is in place.

@@ -26,7 +26,6 @@ There's much that still needs to be added and/or ported over from ExLlamaV2. I'v
 - Samplers (most notably repetition penalties and min-P are missing)
 - Constrained sampling (JSON filters etc.)
 - Multimodal support
-- Cache quantization
 - LoRA support
 - ROCm support
 - Tensor-parallel inference

doc/cq_humaneval.png

59.5 KB

exllamav3/cache/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1,2 +1,3 @@
 from .cache import Cache, CacheLayer
-from .fp16 import CacheLayer_fp16
+from .fp16 import CacheLayer_fp16
+from .quant import CacheLayer_quant

exllamav3/cache/cache.py

Lines changed: 29 additions & 4 deletions
@@ -16,6 +16,7 @@ def __init__(
         config: Config,
         attention: Attention,
         max_num_tokens: int,
+        **kwargs
     ):
         self.config = config
         self.attention = attention

@@ -30,7 +31,18 @@ def free(self):
         pass

     @abstractmethod
-    def get_kv(self):
+    def get_kv(self, cache_seqlens: torch.Tensor, block_table: torch.Tensor) -> tuple:
+        pass
+
+    @abstractmethod
+    def update_kv(
+        self,
+        cache_seqlens: torch.Tensor,
+        block_table: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        length: int
+    ):
         pass

     @abstractmethod

@@ -45,6 +57,7 @@ def __init__(
         model: Model,
         max_num_tokens: int,
         layer_type: Type[CacheLayer] | None = None,
+        **kwargs
     ):
         """
         Create cache for model

@@ -71,7 +84,7 @@

         self.num_layers = len(self.model.get_cache_layers())
         self.layers = [
-            self.layer_type(self.config, attn, self.max_num_tokens)
+            self.layer_type(self.config, attn, self.max_num_tokens, **kwargs)
             for attn in self.model.get_cache_layers()
         ]
         self.attach_to_model()

@@ -107,8 +120,20 @@ def detach_from_model(self, model: Model | None = None):
             module.cache_layers.remove(layer)


-    def get_layer(self, idx: int) -> tuple:
-        return self.layers[idx].get_kv()
+    def get_layer(self, idx: int, cache_seqlens: torch.Tensor, block_table: torch.Tensor) -> tuple:
+        return self.layers[idx].get_kv(cache_seqlens, block_table)
+
+
+    def update_layer(
+        self,
+        idx: int,
+        cache_seqlens: torch.Tensor,
+        block_table: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        length: int
+    ):
+        return self.layers[idx].update_kv(cache_seqlens, block_table, k, v, length)


     def copy_page(
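
Note: taken together with CacheLayer_quant (added in quant.py further down), the new **kwargs plumbing means a quantized cache is requested by passing the layer type and bit widths to the Cache constructor; everything beyond max_num_tokens is forwarded to each layer's constructor. A minimal sketch, assuming the loading API shown in the project README (Config.from_directory, Model.from_config) and a hypothetical model path:

    from exllamav3 import Config, Model, Cache
    from exllamav3.cache import CacheLayer_quant

    config = Config.from_directory("/path/to/model")     # hypothetical path
    model = Model.from_config(config)
    model.load("cuda:0")

    # Extra keyword arguments are forwarded to CacheLayer_quant via **kwargs
    cache = Cache(
        model,
        max_num_tokens = 16384,            # must be a multiple of PAGE_SIZE
        layer_type = CacheLayer_quant,
        k_bits = 4,                        # 2..8 bits per key element
        v_bits = 4,                        # 2..8 bits per value element
    )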

exllamav3/cache/fp16.py

Lines changed: 15 additions & 7 deletions
@@ -47,16 +47,24 @@ def free(self):


     @override
-    def get_kv(self):
+    def get_kv(self, cache_seqlens: torch.Tensor, block_table: torch.Tensor) -> tuple:
         return self.k, self.v


+    @override
+    def update_kv(
+        self,
+        cache_seqlens: torch.Tensor,
+        block_table: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        length: int
+    ):
+        pass
+
+
     @override
     def copy_page(self, source: CacheLayer_fp16, from_page: int, to_page: int, num_tokens: int):
         assert self.shape == source.shape
-        kd = self.k[to_page, :num_tokens, :, :]
-        vd = self.v[to_page, :num_tokens, :, :]
-        ks = source.k[from_page, :num_tokens, :, :]
-        vs = source.v[from_page, :num_tokens, :, :]
-        kd.copy_(ks, non_blocking = True)
-        vd.copy_(vs, non_blocking = True)
+        self.k[to_page, :num_tokens, :, :].copy_(source.k[from_page, :num_tokens, :, :], non_blocking = True)
+        self.v[to_page, :num_tokens, :, :].copy_(source.v[from_page, :num_tokens, :, :], non_blocking = True)
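
Note: both layer types now share the same interface. get_kv returns fp16 K/V tensors for the attention kernel and update_kv is called after the forward pass; for the fp16 layer update_kv is a no-op, since new keys and values are written directly into the persistent cache tensors, while the quantized layer (quant.py below) dequantizes into scratch tensors in get_kv and re-quantizes them in update_kv. A hypothetical call pattern with illustrative names (the real call sites are in the attention module, which is not part of this diff):

    # cache_seqlens and block_table come from the paged-attention bookkeeping
    k_cache, v_cache = cache.get_layer(layer_idx, cache_seqlens, block_table)
    # ... run attention; new keys/values for this pass end up in k_cache / v_cache ...
    cache.update_layer(layer_idx, cache_seqlens, block_table, k_cache, v_cache, length = num_new_tokens)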

exllamav3/cache/quant.py

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
+from __future__ import annotations
+from typing_extensions import override
+import torch
+import torch.nn.functional as F
+from torch import nn
+from ..constants import PAGE_SIZE
+from ..models import Model, Config
+from .cache import CacheLayer
+from typing import TYPE_CHECKING
+from exllamav3.ext import exllamav3_ext as ext
+if TYPE_CHECKING:
+    from ..modules import Attention
+
+class CacheLayer_quant(CacheLayer):
+
+    def __init__(
+        self,
+        config: Config,
+        attention: Attention,
+        max_num_tokens: int,
+        k_bits: int,
+        v_bits: int,
+    ):
+        super().__init__(config, attention, max_num_tokens)
+
+        assert max_num_tokens % PAGE_SIZE == 0, \
+            f"max_num_tokens must be a multiple of {PAGE_SIZE}."
+        assert (2 <= k_bits <= 8) and (2 <= v_bits <= 8), "quantized cache must be from 2 to 8 bits"
+
+        self.shape = (
+            (max_num_tokens // PAGE_SIZE, PAGE_SIZE, attention.num_kv_heads, attention.head_dim)
+            if attention else None
+        )
+
+        self.k_bits = k_bits
+        self.v_bits = v_bits
+        self.token_dim = attention.num_kv_heads * attention.head_dim
+        self.qshape_k = ((max_num_tokens // PAGE_SIZE, PAGE_SIZE, self.token_dim // 32 * k_bits) if attention else None)
+        self.qshape_v = ((max_num_tokens // PAGE_SIZE, PAGE_SIZE, self.token_dim // 32 * v_bits) if attention else None)
+        self.qshape_s = ((max_num_tokens // PAGE_SIZE, PAGE_SIZE, self.token_dim // 32) if attention else None)
+
+        self.qk = None
+        self.qv = None
+        self.sk = None
+        self.sv = None
+        self.device = None
+
+
+    @override
+    def alloc(self, device: torch.device):
+        self.device = device
+        self.qk = torch.zeros(self.qshape_k, dtype = torch.int, device = device) if self.shape else None
+        self.qv = torch.zeros(self.qshape_v, dtype = torch.int, device = device) if self.shape else None
+        self.sk = torch.zeros(self.qshape_s, dtype = torch.half, device = device) if self.shape else None
+        self.sv = torch.zeros(self.qshape_s, dtype = torch.half, device = device) if self.shape else None
+
+
+    @override
+    def free(self):
+        self.device = None
+        self.qk = None
+        self.qv = None
+        self.sk = None
+        self.sv = None
+
+
+    @override
+    def get_kv(self, cache_seqlens: torch.Tensor, block_table: torch.Tensor):
+        k = torch.empty(self.shape, dtype = torch.half, device = self.device)
+        v = torch.empty(self.shape, dtype = torch.half, device = self.device)
+        ext.dequant_cache_paged(self.qk, self.sk, k, self.qv, self.sv, v, cache_seqlens, block_table, PAGE_SIZE)
+        return k, v
+
+
+    @override
+    def update_kv(
+        self,
+        cache_seqlens: torch.Tensor,
+        block_table: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        length: int
+    ):
+        ext.quant_cache_paged(
+            k, self.qk, self.sk,
+            v, self.qv, self.sv,
+            cache_seqlens, block_table,
+            PAGE_SIZE,
+            length
+        )
+
+
+    @override
+    def copy_page(self, source: CacheLayer_quant, from_page: int, to_page: int, num_tokens: int):
+        assert self.qshape_k == source.qshape_k
+        assert self.qshape_v == source.qshape_v
+        self.qk[to_page, :num_tokens, :].copy_(source.qk[from_page, :num_tokens, :], non_blocking = True)
+        self.qv[to_page, :num_tokens, :].copy_(source.qv[from_page, :num_tokens, :], non_blocking = True)
+        self.sk[to_page, :num_tokens, :].copy_(source.sk[from_page, :num_tokens, :], non_blocking = True)
+        self.sv[to_page, :num_tokens, :].copy_(source.sv[from_page, :num_tokens, :], non_blocking = True)
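
Note: the packed shapes determine the memory saving. Per token, each of K and V stores token_dim = num_kv_heads * head_dim values packed into token_dim // 32 * bits int32 words, plus one fp16 scale per group of 32 values. A back-of-the-envelope sketch of the per-token footprint (illustrative arithmetic, not measurements from this commit):

    # Per-token cache footprint implied by qshape_k / qshape_v / qshape_s
    def quant_cache_bytes_per_token(token_dim: int, k_bits: int, v_bits: int) -> int:
        qk = token_dim // 32 * k_bits * 4     # packed int32 words for K, 4 bytes each
        qv = token_dim // 32 * v_bits * 4     # packed int32 words for V
        scales = 2 * (token_dim // 32) * 2    # one fp16 scale per 32 values, for K and for V
        return qk + qv + scales

    def fp16_cache_bytes_per_token(token_dim: int) -> int:
        return 2 * token_dim * 2              # K and V, 2 bytes per element

    # Example: 8 KV heads x 128 head_dim = 1024
    print(quant_cache_bytes_per_token(1024, 4, 4))   # 1152
    print(fp16_cache_bytes_per_token(1024))          # 4096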

exllamav3/exllamav3_ext/bindings.cpp

Lines changed: 7 additions & 0 deletions
@@ -21,6 +21,8 @@
 #include "generator/sampling_basic.cuh"
 #include "generator/gumbel.cuh"

+#include "cache/q_cache.cuh"
+
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 {
     m.def("stloader_read", &stloader_read, "stloader_read");

@@ -56,4 +58,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)

     m.def("partial_strings_match", &partial_strings_match, "partial_strings_match");
     m.def("count_match_tensor", &count_match_tensor, "count_match_tensor");
+
+    m.def("quant_cache_cont", &quant_cache_cont, "quant_cache_cont");
+    m.def("dequant_cache_cont", &dequant_cache_cont, "dequant_cache_cont");
+    m.def("quant_cache_paged", &quant_cache_paged, "quant_cache_paged");
+    m.def("dequant_cache_paged", &dequant_cache_paged, "dequant_cache_paged");
 }
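
Note: the rounding and bit-packing themselves live in the CUDA kernels declared in cache/q_cache.cuh, which this page does not show. Purely as a mental model (an assumption about the scheme, not the kernel's actual implementation), a group-wise quantizer with one fp16 scale per 32 values, matching the sk/sv shapes above, might look like this in PyTorch:

    import torch

    def quantize_groups(x: torch.Tensor, bits: int):
        # Hypothetical reference: symmetric round-to-nearest per group of 32 values.
        # The real quant_cache_* kernels also pack the codes into int32 words.
        g = x.reshape(-1, 32).float()
        qmax = 2 ** (bits - 1) - 1
        scale = (g.abs().amax(dim = -1, keepdim = True) / qmax).clamp(min = 1e-8)
        q = torch.clamp(torch.round(g / scale), -qmax - 1, qmax)
        return q.to(torch.int8), scale.half()

    def dequantize_groups(q: torch.Tensor, scale: torch.Tensor, shape):
        # Broadcast the per-group scale back over each group of 32 codes
        return (q.float() * scale.float()).reshape(shape).half()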
