pytorch · namgyu-youn · Sep 21, 2025 · Sep 22, 2025 · Sep 23, 2025 · Sep 24, 2025
diff --git a/docs/source/quantization_overview.rst b/docs/source/quantization_overview.rst
@@ -5,7 +5,7 @@ First we want to lay out the torchao stack::
 
   Quantization Algorithms/Flows: weight only/dynamic/static quantization, hqq, awq, gptq etc.
   ---------------------------------------------------------------------------------------------
-      Quantized Tensors (derived dtypes): Int4Tensor, Int4PreshuffledTensor, Float8Tensor
+      Quantized Tensors (derived dtypes): Int4Tensor, Int4PreshuffledTensor, Int8Tensor, Float8Tensor
   ---------------------------------------------------------------------------------------------
     Quantization Primitive Ops/Efficient Kernels: matmul, quantize, dequantize
   ---------------------------------------------------------------------------------------------
@@ -88,6 +88,10 @@ So in general we structure Tensor subclasses by dervied dtpype and packing forma
      - scaled int4
      - preshuffled (special format to optimize for loading)
      - float8 act + int4 weight dynamic quantization and int4 weight only quantization
+   * - Int8Tensor
+     - scaled int8
+     - plain (no packing needed)
+     - int8 act + int8 weight dynamic quantization and int8 weight only quantization
 
 .. note::
    We don't have granularity specific tensor subclasses, i.e. no Float8RowwiseTensor or Float8BlockwiseTensor, all granularities are implemented in the same Tensor, we typically use a general `block_size` attribute to distinguish between different granularities, and each Tensor is allowed to support only a subset of all possible granularity options.

diff --git a/test/quantization/quantize_/workflows/int8/test_int8_tensor.py b/test/quantization/quantize_/workflows/int8/test_int8_tensor.py
@@ -0,0 +1,140 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+from torch._inductor.utils import run_and_get_code
+from torch.testing._internal import common_utils
+from torch.testing._internal.common_utils import run_tests
+
+from torchao.quantization.quantize_.common import KernelPreference
+from torchao.quantization.quantize_.workflows.int8.int8_tensor import (
+    Int8Tensor,
+    QuantizeTensorToInt8Kwargs,
+)
+from torchao.quantization.utils import compute_error
+from torchao.testing.utils import TorchAOIntegrationTestCase
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+class TestInt8Tensor(TorchAOIntegrationTestCase):
+    """Unit tests for Int8Tensor creation, linear dispatch, and dequantization."""
+
+    def setUp(self):
+        super().setUp()
+        # Fixed seed so the random fixtures are reproducible across runs.
+        torch.manual_seed(42)
+        self.weight_fp = torch.randn(4, 3, dtype=torch.float32)
+        self.input_fp = torch.randn(2, 3, dtype=torch.float32)
+        self.bias = torch.randn(4)
+        # Per-tensor granularity for the (4, 3) weight: one block spans it all.
+        self.block_size = [4, 3]
+
+    def test_creation_and_attributes(self):
+        """Test tensor creation, dtypes, and ranges"""
+        tensor = Int8Tensor.from_hp(self.weight_fp, self.block_size)
+
+        self.assertEqual(tensor.shape, (4, 3))
+        self.assertEqual(tensor.qdata.dtype, torch.int8)
+        self.assertTrue(
+            torch.all(tensor.qdata >= -128) and torch.all(tensor.qdata <= 127)
+        )
+
+    @common_utils.parametrize(
+        "kernel_preference",
+        [KernelPreference.AUTO, KernelPreference.TORCH, KernelPreference.FBGEMM],
+    )
+    def test_kernel_preference(self, kernel_preference):
+        """Test Int8Tensor with different kernels"""
+        tensor = Int8Tensor.from_hp(
+            self.weight_fp, self.block_size, kernel_preference=kernel_preference
+        )
+
+        self.assertEqual(tensor.kernel_preference, kernel_preference)
+
+    def test_linear_operations(self):
+        """Test fp+int8 and int8+int8 linear ops with quantization error check"""
+        weight_q8 = Int8Tensor.from_hp(self.weight_fp, self.block_size)
+        # The activation is (2, 3), so the weight's (4, 3) block does not fit it;
+        # quantize it per-tensor using its own shape as the block.
+        input_q8 = Int8Tensor.from_hp(self.input_fp, list(self.input_fp.shape))
+
+        reference = torch.nn.functional.linear(self.input_fp, self.weight_fp, self.bias)
+        result_fp = torch.nn.functional.linear(self.input_fp, weight_q8, self.bias)
+        result_q8 = torch.nn.functional.linear(input_q8, weight_q8, self.bias)
+
+        self.assertEqual(result_fp.shape, reference.shape)
+        self.assertEqual(result_q8.shape, reference.shape)
+        # assertGreater reports both operands on failure, unlike assertTrue.
+        self.assertGreater(compute_error(result_fp, reference), 10)
+        self.assertGreater(compute_error(result_q8, reference), 10)
+
+    def test_dynamic_quantization(self):
+        """Test linear with a weight configured for dynamic int8 activations"""
+        weight_q8_dynamic = Int8Tensor.from_hp(
+            self.weight_fp,
+            self.block_size,
+            act_quant_kwargs=QuantizeTensorToInt8Kwargs(),
+        )
+
+        reference = torch.nn.functional.linear(self.input_fp, self.weight_fp, self.bias)
+        result_dynamic = torch.nn.functional.linear(
+            self.input_fp, weight_q8_dynamic, self.bias
+        )
+
+        self.assertEqual(result_dynamic.shape, reference.shape)
+
+    def test_expected_kernel_operations(self):
+        """Test Int8Tensor with FBGEMM kernels"""
+
+        # Setup model with Int8Tensor
+        weight_q8 = Int8Tensor.from_hp(
+            self.weight_fp,
+            self.block_size,
+            kernel_preference=KernelPreference.FBGEMM,
+        )
+
+        def model(x):
+            return torch.nn.functional.linear(x, weight_q8, self.bias)
+
+        compiled_model = torch.compile(model)
+
+        output, code = run_and_get_code(compiled_model, self.input_fp)
+
+        self.assertEqual(output.shape, (2, 4))
+        self.assertTrue(len(code) > 0, "Should generate some compiled code")
+
+        # Test dequantization kernel
+        dequant_output = torch.ops.aten.dequantize.self(weight_q8)
+        self.assertEqual(dequant_output.shape, self.weight_fp.shape)
+
+    def test_error_handling_and_dequant(self):
+        """Test input validation and dequantization accuracy"""
+        # Test 1D tensor validation
+        with self.assertRaises((AssertionError, ValueError, RuntimeError)):
+            Int8Tensor.from_hp(torch.randn(5), [1])
+
+        # Test wrong block_size validation
+        with self.assertRaises((AssertionError, ValueError, RuntimeError)):
+            Int8Tensor.from_hp(self.weight_fp, [1])
+
+        # Test dequantization with exact values
+        test_data = torch.tensor([[1.0, -1.0]], dtype=torch.float32)
+        tensor = Int8Tensor.from_hp(test_data, [1, 1])
+
+        dequantized = torch.ops.aten.dequantize.self(tensor)
+        self.assertEqual(dequantized.shape, test_data.shape)
+        self.assertLess(torch.abs(dequantized - test_data).max().item(), 0.1)
+
+
+# @common_utils.parametrize only tags the test; this call generates the concrete
+# per-parameter test methods that unittest actually collects.
+common_utils.instantiate_parametrized_tests(TestInt8Tensor)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/torchao/quantization/__init__.py b/torchao/quantization/__init__.py
@@ -96,6 +96,7 @@
     Int4PreshuffledTensor,
     Int4Tensor,
     Int4TilePackedTo4dTensor,
+    Int8Tensor,
     IntxOpaqueTensor,
     IntxUnpackedToInt8Tensor,
 )
@@ -169,6 +170,7 @@
     "IntxOpaqueTensor",
     "IntxUnpackedToInt8Tensor",
     "Int4TilePackedTo4dTensor",
+    "Int8Tensor",
     "Float8Tensor",
     "Int4OpaqueTensor",
     # smooth quant - subject to change

diff --git a/torchao/quantization/quantize_/common/quantize_tensor_kwargs.py b/torchao/quantization/quantize_/common/quantize_tensor_kwargs.py
@@ -39,7 +39,9 @@ def _choose_quant_func_and_quantize_tensor(
     """
     from torchao.quantization.quantize_.workflows import (
         Float8Tensor,
+        Int8Tensor,
         QuantizeTensorToFloat8Kwargs,
+        QuantizeTensorToInt8Kwargs,
     )
 
     if isinstance(quant_kwargs, QuantizeTensorToFloat8Kwargs):
@@ -52,5 +54,11 @@ def _choose_quant_func_and_quantize_tensor(
             quant_kwargs.hp_value_ub,
             quant_kwargs.kernel_preference,
         )
+    elif isinstance(quant_kwargs, QuantizeTensorToInt8Kwargs):
+        return Int8Tensor.from_hp(
+            tensor,
+            quant_kwargs.block_size or [1, tensor.shape[-1]],
+            kernel_preference=quant_kwargs.kernel_preference,
+        )
 
     raise NotImplementedError(f"Quant kwargs not supported: {quant_kwargs}")
diff --git a/torchao/quantization/quantize_/workflows/__init__.py b/torchao/quantization/quantize_/workflows/__init__.py
@@ -20,6 +20,10 @@
     Int4Tensor,
 )
 from .int4.int4_tile_packed_to_4d_tensor import Int4TilePackedTo4dTensor
+from .int8.int8_tensor import (
+    Int8Tensor,
+    QuantizeTensorToInt8Kwargs,
+)
 from .intx.intx_opaque_tensor import (
     IntxOpaqueTensor,
 )
@@ -36,6 +40,8 @@
     "Int4MarlinSparseTensor",
     "Int4PlainInt32Tensor",
     "Int4TilePackedTo4dTensor",
+    "Int8Tensor",
+    "QuantizeTensorToInt8Kwargs",
     "Float8Tensor",
     "QuantizeTensorToFloat8Kwargs",
     "Int4OpaqueTensor",