dphnAI · AlpinDale · Jul 22, 2025
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -242,6 +242,7 @@ set(APHRODITE_EXT_SRC
   "kernels/quantization/fp8/common.cu"
   "kernels/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
   "kernels/quantization/gguf/gguf_kernel.cu"
+  "kernels/quantization/exl3/exl3_gemm.cu"
   "kernels/quantization/activation_kernels.cu"
   "kernels/cuda_utils_kernels.cu"
   "kernels/prepare_inputs/advance_step.cu"

diff --git a/aphrodite/_custom_ops.py b/aphrodite/_custom_ops.py
@@ -319,6 +319,39 @@ def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
     torch.ops._C.gptq_shuffle(q_weight, q_perm, bit)
 
 
+# exl3: wrappers and fake impls for the torch.ops._C exl3_* custom ops
+def exl3_gemm(input: torch.Tensor, trellis: torch.Tensor, suh: torch.Tensor,
+              svh: torch.Tensor, mcg_mult: int, mul1_mult: int) -> torch.Tensor:
+    return torch.ops._C.exl3_gemm(input, trellis, suh, svh, mcg_mult, mul1_mult)
+
+
+def exl3_reconstruct(trellis: torch.Tensor, in_features: int, out_features: int,
+                     mcg_mult: int, mul1_mult: int) -> torch.Tensor:
+    return torch.ops._C.exl3_reconstruct(trellis, in_features, out_features,
+                                         mcg_mult, mul1_mult)  # passthrough
+
+
+if hasattr(torch.ops._C, "exl3_gemm"):  # gates both registrations below
+    # Fake impls: each only allocates an empty output tensor; no kernel runs.
+    @register_fake("_C::exl3_gemm")
+    def _exl3_gemm_fake(input: torch.Tensor, trellis: torch.Tensor,
+                        suh: torch.Tensor, svh: torch.Tensor, mcg_mult: int,
+                        mul1_mult: int) -> torch.Tensor:
+        batch_size = input.size(0)
+        out_features = svh.size(0)  # output width read from svh's dim 0
+        return torch.empty((batch_size, out_features),
+                           dtype=torch.float32,  # NOTE(review): verify dtype
+                           device=input.device)
+
+    @register_fake("_C::exl3_reconstruct")
+    def _exl3_reconstruct_fake(trellis: torch.Tensor, in_features: int,
+                               out_features: int, mcg_mult: int,
+                               mul1_mult: int) -> torch.Tensor:
+        return torch.empty((in_features, out_features),
+                           dtype=torch.float16,  # NOTE(review): verify dtype
+                           device=trellis.device)
+
+
 # squeezellm
 def squeezellm_gemm(vec: torch.Tensor, mat: torch.Tensor, mul: torch.Tensor,
                     lookup_table: torch.Tensor) -> None:

diff --git a/aphrodite/quantization/__init__.py b/aphrodite/quantization/__init__.py
@@ -30,6 +30,7 @@
     "quark",
     "moe_wna16",
     "torchao",
+    "exl3",
     "fp2",
     "fp3",
     "fp4",
@@ -94,6 +95,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     from .compressed_tensors.compressed_tensors import (  # noqa: E501
         CompressedTensorsConfig)
     from .deepspeedfp import DeepSpeedFPConfig
+    from .exl3 import EXL3Config
     from .experts_int8 import ExpertsInt8Config
     from .fbgemm_fp8 import FBGEMMFp8Config
     from .fp6 import QuantLLMFPConfig
@@ -142,6 +144,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
         "quark": QuarkConfig,
         "moe_wna16": MoeWNA16Config,
         "torchao": TorchAOConfig,
+        "exl3": EXL3Config,
         "fp2": QuantLLMFPConfig,
         "fp3": QuantLLMFPConfig,
         "fp4": QuantLLMFPConfig,