From c7ecb1efcdd333c8be50393a6c5f1eb77f0d6085 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Fri, 7 Nov 2025 11:51:47 -0800 Subject: [PATCH 01/22] Add torch2.9 in regression tests --- .github/workflows/regression_test.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/regression_test.yml b/.github/workflows/regression_test.yml index cc474ff9e7..456822e4f2 100644 --- a/.github/workflows/regression_test.yml +++ b/.github/workflows/regression_test.yml @@ -77,6 +77,12 @@ jobs: gpu-arch-type: "cuda" gpu-arch-version: "12.6" dev-requirements-overrides: "" + - name: CUDA 2.9 + runs-on: linux.g5.12xlarge.nvidia.gpu + torch-spec: 'torch==2.9.0' + gpu-arch-type: "cuda" + gpu-arch-version: "12.6" + dev-requirements-overrides: "" - name: CPU 2.6 runs-on: linux.4xlarge @@ -96,6 +102,12 @@ jobs: gpu-arch-type: "cpu" gpu-arch-version: "" dev-requirements-overrides: "" + - name: CPU 2.9 + runs-on: linux.4xlarge + torch-spec: 'torch==2.9.0 --index-url https://download.pytorch.org/whl/cpu' + gpu-arch-type: "cpu" + gpu-arch-version: "" + dev-requirements-overrides: "" uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: From e9f94ba9f56ee0778546c2943e680017b089e6e8 Mon Sep 17 00:00:00 2001 From: Apurva Jain Date: Wed, 12 Nov 2025 14:03:01 -0800 Subject: [PATCH 02/22] Update torch version to 2.9.1 in regression tests --- .github/workflows/regression_test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/regression_test.yml b/.github/workflows/regression_test.yml index 456822e4f2..278b276ada 100644 --- a/.github/workflows/regression_test.yml +++ b/.github/workflows/regression_test.yml @@ -79,7 +79,7 @@ jobs: dev-requirements-overrides: "" - name: CUDA 2.9 runs-on: linux.g5.12xlarge.nvidia.gpu - torch-spec: 'torch==2.9.0' + torch-spec: 'torch==2.9.1' gpu-arch-type: "cuda" gpu-arch-version: "12.6" dev-requirements-overrides: "" @@ -104,7 +104,7 @@ jobs: dev-requirements-overrides: "" - name: CPU 2.9 runs-on: linux.4xlarge - torch-spec: 'torch==2.9.0 --index-url https://download.pytorch.org/whl/cpu' + torch-spec: 'torch==2.9.1 --index-url https://download.pytorch.org/whl/cpu' gpu-arch-type: "cpu" gpu-arch-version: "" dev-requirements-overrides: "" From 886f0a6cb5654a4ceb20924eb7748ca85ddcac1d Mon Sep 17 00:00:00 2001 From: Apurva Jain Date: Wed, 12 Nov 2025 14:04:06 -0800 Subject: [PATCH 03/22] Update torch version from 2.7.0 to 2.7.1 --- .github/workflows/regression_test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/regression_test.yml b/.github/workflows/regression_test.yml index 278b276ada..46928b30cf 100644 --- a/.github/workflows/regression_test.yml +++ b/.github/workflows/regression_test.yml @@ -67,7 +67,7 @@ jobs: dev-requirements-overrides: "" - name: CUDA 2.7 runs-on: linux.g5.12xlarge.nvidia.gpu - torch-spec: 'torch==2.7.0' + torch-spec: 'torch==2.7.1' gpu-arch-type: "cuda" gpu-arch-version: "12.6" dev-requirements-overrides: "" @@ -92,7 +92,7 @@ jobs: dev-requirements-overrides: "" - name: CPU 2.7 runs-on: linux.4xlarge - torch-spec: 'torch==2.7.0 --index-url https://download.pytorch.org/whl/cpu' + torch-spec: 'torch==2.7.1 --index-url https://download.pytorch.org/whl/cpu' gpu-arch-type: "cpu" gpu-arch-version: "" dev-requirements-overrides: "" From 1a9a13f1c4399a41159fc3d9e419512e643b992e Mon Sep 17 00:00:00 2001 From: Apurva Jain Date: Fri, 7 Nov 2025 13:52:55 -0800 Subject: [PATCH 04/22] Move dyn_int8_act_int4_wei_cpu_layout to prototype/dtypes (#3299) --- 
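For downstream code the practical effect of this move is an import-path change; a minimal
migration sketch, assuming this patch is installed (the old location keeps working through the
compatibility stub below, but emits a DeprecationWarning when that stub module is first imported):

    # Deprecated location: still resolves via the stub, warns on first import
    from torchao.dtypes import Int8DynamicActInt4WeightCPULayout

    # Preferred location going forward
    from torchao.prototype.dtypes import Int8DynamicActInt4WeightCPULayout  # noqa: F811
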
docs/source/api_ref_dtypes.rst | 1 + test/dtypes/test_uintx.py | 39 +++ test/integration/test_integration.py | 27 -- test/sparsity/test_sparse_api.py | 27 -- torchao/dtypes/__init__.py | 2 +- torchao/dtypes/affine_quantized_tensor_ops.py | 8 +- .../uintx/dyn_int8_act_int4_wei_cpu_layout.py | 326 +----------------- torchao/prototype/dtypes/__init__.py | 7 +- torchao/prototype/dtypes/uintx/__init__.py | 2 + .../uintx/dyn_int8_act_int4_wei_cpu_layout.py | 318 +++++++++++++++++ 10 files changed, 388 insertions(+), 369 deletions(-) create mode 100644 torchao/prototype/dtypes/uintx/dyn_int8_act_int4_wei_cpu_layout.py diff --git a/docs/source/api_ref_dtypes.rst b/docs/source/api_ref_dtypes.rst index e347dfd2e3..5c73d275eb 100644 --- a/docs/source/api_ref_dtypes.rst +++ b/docs/source/api_ref_dtypes.rst @@ -52,6 +52,7 @@ Prototype BlockSparseLayout CutlassInt4PackedLayout + Int8DynamicActInt4WeightCPULayout .. _NF4Tensor - add after fixing torchao/dtypes/nf4tensor.py:docstring diff --git a/test/dtypes/test_uintx.py b/test/dtypes/test_uintx.py index cb0c88b21c..5d54a80753 100644 --- a/test/dtypes/test_uintx.py +++ b/test/dtypes/test_uintx.py @@ -3,6 +3,9 @@ # # This source code is licensed under the BSD 3-Clause license found in the # LICENSE file in the root directory of this source tree. +import sys +import warnings + import pytest import torch @@ -165,3 +168,39 @@ def test_uintx_model_size(dtype): quantize_(linear[0], UIntXWeightOnlyConfig(dtype)) quantized_size = get_model_size_in_bytes(linear) assert bf16_size * _dtype_to_ratio[dtype] == quantized_size + + +def test_uintx_api_deprecation(): + """ + Test that deprecated uintx APIs trigger deprecation warnings on import. + TODO: Remove this test once the deprecated APIs have been removed. + """ + deprecated_apis = [ + ( + "Int8DynamicActInt4WeightCPULayout", + "torchao.dtypes.uintx.dyn_int8_act_int4_wei_cpu_layout", + ), + ("CutlassInt4PackedLayout", "torchao.dtypes.uintx.cutlass_int4_packed_layout"), + ("BlockSparseLayout", "torchao.dtypes.uintx.block_sparse_layout"), + ] + + for api_name, module_path in deprecated_apis: + # Clear the cache to force re-importing and trigger the warning again + modules_to_clear = [module_path, "torchao.dtypes"] + for mod in modules_to_clear: + if mod in sys.modules: + del sys.modules[mod] + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") # Ensure all warnings are captured + + # Dynamically import the deprecated API + exec(f"from torchao.dtypes import {api_name}") + + assert any( + issubclass(warning.category, DeprecationWarning) + and api_name in str(warning.message) + for warning in w + ), ( + f"Expected deprecation warning for {api_name}, got: {[str(warning.message) for warning in w]}" + ) diff --git a/test/integration/test_integration.py b/test/integration/test_integration.py index 50c2eebe81..70da622c73 100644 --- a/test/integration/test_integration.py +++ b/test/integration/test_integration.py @@ -1948,32 +1948,5 @@ def test_benchmark_model_cpu(self): assert self.run_benchmark_model("cpu") is not None -# TODO: Remove this test once the deprecated API has been removed -def test_cutlass_int4_packed_layout_deprecated(): - import sys - import warnings - - # We need to clear the cache to force re-importing and trigger the warning again. 
- modules_to_clear = [ - "torchao.dtypes.uintx.cutlass_int4_packed_layout", - "torchao.dtypes", - ] - for mod in modules_to_clear: - if mod in sys.modules: - del sys.modules[mod] - - with warnings.catch_warnings(record=True) as w: - from torchao.dtypes import CutlassInt4PackedLayout # noqa: F401 - - warnings.simplefilter("always") # Ensure all warnings are captured - assert any( - issubclass(warning.category, DeprecationWarning) - and "CutlassInt4PackedLayout" in str(warning.message) - for warning in w - ), ( - f"Expected deprecation warning for CutlassInt4PackedLayout, got: {[str(warning.message) for warning in w]}" - ) - - if __name__ == "__main__": unittest.main() diff --git a/test/sparsity/test_sparse_api.py b/test/sparsity/test_sparse_api.py index c9d41a98a9..66cd032a9a 100644 --- a/test/sparsity/test_sparse_api.py +++ b/test/sparsity/test_sparse_api.py @@ -267,33 +267,6 @@ def test_sparse(self, compile): torch.testing.assert_close(reference, sparse_result, rtol=1e-1, atol=1e-1) - # TODO: Remove this test once the deprecated API has been removed - def test_sparse_deprecated(self): - import sys - import warnings - - # We need to clear the cache to force re-importing and trigger the warning again. - modules_to_clear = [ - "torchao.dtypes.uintx.block_sparse_layout", - "torchao.dtypes", - ] - for mod in modules_to_clear: - if mod in sys.modules: - del sys.modules[mod] - - with warnings.catch_warnings(record=True) as w: - from torchao.dtypes import BlockSparseLayout # noqa: F401 - - warnings.simplefilter("always") # Ensure all warnings are captured - self.assertTrue( - any( - issubclass(warning.category, DeprecationWarning) - and "BlockSparseLayout" in str(warning.message) - for warning in w - ), - f"Expected deprecation warning for BlockSparseLayout, got: {[str(w.message) for w in w]}", - ) - common_utils.instantiate_parametrized_tests(TestSemiStructuredSparse) common_utils.instantiate_parametrized_tests(TestQuantSemiSparse) diff --git a/torchao/dtypes/__init__.py b/torchao/dtypes/__init__.py index 252498bc97..354692e794 100644 --- a/torchao/dtypes/__init__.py +++ b/torchao/dtypes/__init__.py @@ -16,7 +16,6 @@ from .uintx import ( Int4CPULayout, Int4XPULayout, - Int8DynamicActInt4WeightCPULayout, MarlinQQQLayout, MarlinQQQTensor, MarlinSparseLayout, @@ -29,6 +28,7 @@ ) from .uintx.block_sparse_layout import BlockSparseLayout from .uintx.cutlass_int4_packed_layout import CutlassInt4PackedLayout +from .uintx.dyn_int8_act_int4_wei_cpu_layout import Int8DynamicActInt4WeightCPULayout from .utils import ( Layout, PlainLayout, diff --git a/torchao/dtypes/affine_quantized_tensor_ops.py b/torchao/dtypes/affine_quantized_tensor_ops.py index e46809059e..3816f9bf1f 100644 --- a/torchao/dtypes/affine_quantized_tensor_ops.py +++ b/torchao/dtypes/affine_quantized_tensor_ops.py @@ -25,10 +25,6 @@ _linear_f16_bf16_act_floatx_weight_check, _linear_f16_bf16_act_floatx_weight_impl, ) -from torchao.dtypes.uintx.dyn_int8_act_int4_wei_cpu_layout import ( - _linear_int8_act_int4_weight_cpu_check, - _linear_int8_act_int4_weight_cpu_impl, -) from torchao.dtypes.uintx.gemlite_layout import ( _linear_fp_act_int4_weight_gemlite_check, _linear_fp_act_int4_weight_gemlite_impl, @@ -94,6 +90,10 @@ _linear_int8_act_int4_weight_cutlass_check, _linear_int8_act_int4_weight_cutlass_impl, ) +from torchao.prototype.dtypes.uintx.dyn_int8_act_int4_wei_cpu_layout import ( + _linear_int8_act_int4_weight_cpu_check, + _linear_int8_act_int4_weight_cpu_impl, +) from torchao.quantization.quant_primitives import ( ZeroPointDomain, 
_dequantize_affine_no_zero_point, diff --git a/torchao/dtypes/uintx/dyn_int8_act_int4_wei_cpu_layout.py b/torchao/dtypes/uintx/dyn_int8_act_int4_wei_cpu_layout.py index 8d0cfaddeb..d66f70e2ee 100644 --- a/torchao/dtypes/uintx/dyn_int8_act_int4_wei_cpu_layout.py +++ b/torchao/dtypes/uintx/dyn_int8_act_int4_wei_cpu_layout.py @@ -3,317 +3,25 @@ # # This source code is licensed under the BSD 3-Clause license found in the # LICENSE file in the root directory of this source tree. -from dataclasses import dataclass -from typing import Tuple -import torch -from torch.utils._python_dispatch import ( - return_and_correct_aliasing, -) +# Backward compatibility stub - imports from the new location +import warnings -from torchao.dtypes.affine_quantized_tensor import ( - AffineQuantizedTensor, - register_layout, +warnings.warn( + "Importing from torchao.dtypes.uintx.dyn_int8_act_int4_wei_cpu_layout is deprecated. " + "Please use 'from torchao.prototype.dtypes import Int8DynamicActInt4WeightCPULayout' instead. " + "This import path will be removed in a future release of torchao. " + "See https://github.com/pytorch/ao/issues/2752 for more details.", + DeprecationWarning, + stacklevel=2, ) -from torchao.dtypes.utils import Layout, PlainLayout, is_device -from torchao.utils import torch_version_at_least -from .int4_cpu_layout import ( - Int4CPUAQTTensorImpl, - _is_float, +from torchao.prototype.dtypes.uintx.dyn_int8_act_int4_wei_cpu_layout import ( # noqa: F401 + DA8W4CPUAQTTensorImpl, # noqa: F401 + Int8DynamicActInt4WeightCPULayout, # noqa: F401 + _aqt_is_int8, # noqa: F401 + _aqt_is_uint4, # noqa: F401 + _aqt_is_uint8, # noqa: F401 + _linear_int8_act_int4_weight_cpu_check, # noqa: F401 + _linear_int8_act_int4_weight_cpu_impl, # noqa: F401 ) - -aten = torch.ops.aten - - -@dataclass(frozen=True) -class Int8DynamicActInt4WeightCPULayout(Layout): - """Layout class for da8w4 CPU layout for affine quantized tensor""" - - pass - - -@register_layout(Int8DynamicActInt4WeightCPULayout) -class DA8W4CPUAQTTensorImpl(Int4CPUAQTTensorImpl): - """TensorImpl for da8w4 CPU layout for affine quantized tensor - It stores the original tensor of dimension [n][k] (int32 dtype) as packed weight of 2-d tensor of - dimension: [n][k / 2] (uint8 dtype) - It is similar to Int4CPUAQTTensorImpl but with a different memory layout of weight data - fields: - packed_weight (torch.Tensor): the 2-d packed tensor in a Int4 CPU layout - scales (torch.Tensor): the scales Tensor used to map between floating point tensor to quantized tensor - qzeros (torch.Tensor): the zero_point Tensor used to map between floating point tensor to quantized tensor - """ - - def __new__( - cls, - packed_weight: torch.Tensor, - scales: torch.Tensor, - qzeros: torch.Tensor, - compensation: torch.Tensor, - transposed: bool, - _layout: Layout, - ): - kwargs = {} - kwargs["device"] = packed_weight.device - kwargs["layout"] = ( - kwargs.get("layout") - if kwargs.get("layout", False) - else packed_weight.layout - ) - kwargs["dtype"] = packed_weight.dtype - kwargs["requires_grad"] = False - shape = packed_weight.shape - return torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs) # type: ignore[attr-defined] - - def __init__( - self, - packed_weight: torch.Tensor, - scales: torch.Tensor, - qzeros: torch.Tensor, - compensation: torch.Tensor, - transposed: bool, - _layout: Layout, - ): - self.packed_weight = packed_weight - self.scales = scales - self.qzeros = qzeros - self.compensation = compensation - self.transposed = transposed - self._layout = _layout - - def 
__tensor_flatten__(self): - return ["packed_weight", "scales", "qzeros", "compensation"], [ - self.transposed, - self._layout, - ] - - @classmethod - def __tensor_unflatten__( - cls, tensor_data_dict, tensor_attributes, outer_size, outer_stride - ): - packed_weight, scales, qzeros, compensation = ( - tensor_data_dict["packed_weight"], - tensor_data_dict["scales"], - tensor_data_dict["qzeros"], - tensor_data_dict["compensation"], - ) - ( - transposed, - _layout, - ) = tensor_attributes - return cls(packed_weight, scales, qzeros, compensation, transposed, _layout) - - @classmethod - def from_plain( - cls, - int_data: torch.Tensor, - scale: torch.Tensor, - zero_point: torch.Tensor, - _layout: Layout, - ): - assert isinstance(_layout, Int8DynamicActInt4WeightCPULayout) - assert int_data.dtype == torch.uint8, "DA8W4 CPU: expects uint8 weight" - assert int_data.shape[1] % 2 == 0, "DA8W4 CPU: expects even number of columns" - if scale.dim() == 1: - scale.unsqueeze_(-1) - scale = scale.to(torch.float) - if zero_point.dim() == 1: - zero_point.unsqueeze_(-1) - - # Pack weight from [N, K] to [N / block_n, K / block_k, block_k, block_n]. - # Pack the inner blocks [block_k, block_n] to VNNI layout if AMX is available. - # Pack scales/qzeros from [N, num_groups] to [N / block_n, num_groups, block_n]. - # Compensation shape = [N / block_n, K / block_k, block_n]. - weight_int4, scales, qzeros, compensation = ( - torch.ops.torchao.da8w4_linear_prepack_cpu(int_data, scale, zero_point) - ) - return cls(weight_int4, scales, qzeros, compensation, False, _layout) - - def _apply_fn_to_data(self, fn): - return self.__class__( - fn(self.packed_weight), - fn(self.scales), - fn(self.qzeros), - fn(self.compensation), - self.transposed, - self._layout, - ) - - @classmethod - def __torch_dispatch__(cls, func, types, args, kwargs): - kwargs = {} if kwargs is None else kwargs - if func is aten.t.default: - """we don't need to repack the weight and just rely on external - shape being changed and record the status of transpose/no-transpose - """ - transposed = DA8W4CPUAQTTensorImpl( - args[0].packed_weight, - args[0].scales, - args[0].qzeros, - args[0].compensation, - not args[0].transposed, - args[0]._layout, - ) - return return_and_correct_aliasing(func, args, kwargs, transposed) - else: - return super().__torch_dispatch__(func, types, args, kwargs) - - __torch_function__ = torch._C._disabled_torch_function_impl - - @property - def block_size(self): - assert len(self.packed_weight.shape) == 2 - weight_shape = self.packed_weight.shape - N = weight_shape[0] - K = weight_shape[1] * 2 - groups = self.scales.numel() // N - group_size = K // groups - return (1, group_size) - - def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - # Unpack weight by linear(eye(K), packed_weight).t() - packed_w_shape = self.packed_weight.shape - if len(packed_w_shape) == 4: - K = packed_w_shape[1] * packed_w_shape[2] - else: - K = packed_w_shape[1] - x = torch.eye(K).to(torch.uint8) - x_scale = torch.ones(K).float() - x_qzero = torch.zeros(K).to(torch.int32) - w_scale = torch.ones_like(self.scales).float() - w_qzero = torch.zeros_like(self.qzeros).to(torch.int8) - plain_weight = torch.ops.torchao.da8w4_linear_cpu.default( - x, - x_scale, - x_qzero, - self.packed_weight, - w_scale, - w_qzero, - self.compensation, - None, # bias - torch.float, # out_dtype - ) - plain_weight = plain_weight.t().contiguous() - plain_weight = plain_weight.to(torch.int8) - - if self.scales.dim() == 2: - assert self.qzeros.dim() == 2 - plain_scales = 
self.scales - plain_qzeros = self.qzeros - else: - assert self.scales.dim() == 3 and self.qzeros.dim() == 3 - packed_shape = self.scales.shape # [Nc, G, block_n] - plain_scales = ( - self.scales.permute([0, 2, 1]).contiguous().view([-1, packed_shape[1]]) - ) - plain_qzeros = ( - self.qzeros.permute([0, 2, 1]).contiguous().view([-1, packed_shape[1]]) - ) - - return plain_weight, plain_scales, plain_qzeros - - -def _aqt_is_uint8(aqt): - """Check if an AffineQuantizedTensor is uint8 quantized Tensor""" - return ( - aqt.tensor_impl.dtype == torch.uint8 - and aqt.quant_min == 0 - and aqt.quant_max == 255 - ) - - -def _aqt_is_int8(aqt): - """Check if an AffineQuantizedTensor is uint8 quantized Tensor""" - return ( - aqt.tensor_impl.dtype == torch.int8 - and aqt.quant_min == -127 - and aqt.quant_max == 127 - ) - - -def _aqt_is_uint4(aqt): - """Check if an AffineQuantizedTensor is uint4 quantized Tensor""" - return ( - aqt.tensor_impl.dtype == torch.uint8 - and aqt.quant_min == 0 - and aqt.quant_max == 15 - ) - - -def _linear_int8_act_int4_weight_cpu_check(input_tensor, weight_tensor, bias): - return ( - torch_version_at_least("2.7.0") - and is_device(input_tensor.device.type, "cpu") - and is_device(weight_tensor.device.type, "cpu") - and (bias is None or is_device(bias.device.type, "cpu")) - and isinstance(input_tensor, AffineQuantizedTensor) - and (_aqt_is_uint8(input_tensor) or _aqt_is_int8(input_tensor)) - and _is_float(input_tensor.dtype) - and isinstance(input_tensor._layout, PlainLayout) - and isinstance(weight_tensor, AffineQuantizedTensor) - and _aqt_is_uint4(weight_tensor) - and _is_float(weight_tensor.dtype) - and isinstance(weight_tensor._layout, Int8DynamicActInt4WeightCPULayout) - ) - - -def _linear_int8_act_int4_weight_cpu_impl(input_tensor, weight_tensor, bias): - assert torch_version_at_least("2.7.0"), ( - f"Requires PyTorch version at least 2.7, but got: {torch.__version__}" - ) - if _aqt_is_int8(input_tensor): - assert torch_version_at_least("2.8.0"), ( - f"Requires PyTorch version at least 2.8, but got: {torch.__version__}" - ) - assert is_device(input_tensor.device.type, "cpu"), ( - f"For CPU device only but got: {input_tensor.device}" - ) - assert weight_tensor.block_size[0] == 1, ( - f"Requires groupwise quantization, got block_size: {weight_tensor.block_size}" - ) - assert input_tensor.shape[-1] == weight_tensor.shape[1], ( - f"need input_tensor shape: {input_tensor.shape} final" - f"dim to match weight_tensor shape: {weight_tensor.shape} second dim " - ) - - act_mat = input_tensor - act = act_mat.tensor_impl.int_data - act_scales = act_mat.tensor_impl.scale - act_qzeros = act_mat.tensor_impl.zero_point - - packed_weight = weight_tensor.tensor_impl.packed_weight - wei_scales = weight_tensor.tensor_impl.scales - wei_qzeros = weight_tensor.tensor_impl.qzeros - compensation = weight_tensor.tensor_impl.compensation - - orig_act_size = act_mat.size() - orig_dtype = act_mat.dtype - - # reshape to 2D - act = act.reshape(-1, act.shape[-1]) - - y = torch.ops.torchao.da8w4_linear_cpu.default( - act.contiguous(), - act_scales, - act_qzeros, - packed_weight, - wei_scales, - wei_qzeros, - compensation, - bias.float() if bias is not None else bias, # requires bias to be float - orig_dtype, # out_dtype - ) - - # remove out_feature padding - orig_out_features = weight_tensor.shape[-2] - y = y[:, :orig_out_features] - y = y.reshape(*orig_act_size[:-1], orig_out_features) - - return y.to(orig_dtype) - - -# Register the concat linear fusion pass -# from ...prototype.inductor.fx_passes import 
register_da8w4_concat_linear_cpu_pass - -# register_da8w4_concat_linear_cpu_pass() diff --git a/torchao/prototype/dtypes/__init__.py b/torchao/prototype/dtypes/__init__.py index 25f139d583..52a5aec425 100644 --- a/torchao/prototype/dtypes/__init__.py +++ b/torchao/prototype/dtypes/__init__.py @@ -4,9 +4,14 @@ # This source code is licensed under the BSD 3-Clause license found in the # LICENSE file in the root directory of this source tree. -from .uintx import BlockSparseLayout, CutlassInt4PackedLayout +from .uintx import ( + BlockSparseLayout, + CutlassInt4PackedLayout, + Int8DynamicActInt4WeightCPULayout, +) __all__ = [ "BlockSparseLayout", "CutlassInt4PackedLayout", + "Int8DynamicActInt4WeightCPULayout", ] diff --git a/torchao/prototype/dtypes/uintx/__init__.py b/torchao/prototype/dtypes/uintx/__init__.py index 53edddb8ac..89c1f3f810 100644 --- a/torchao/prototype/dtypes/uintx/__init__.py +++ b/torchao/prototype/dtypes/uintx/__init__.py @@ -6,8 +6,10 @@ from .block_sparse_layout import BlockSparseLayout from .cutlass_int4_packed_layout import CutlassInt4PackedLayout +from .dyn_int8_act_int4_wei_cpu_layout import Int8DynamicActInt4WeightCPULayout __all__ = [ "BlockSparseLayout", "CutlassInt4PackedLayout", + "Int8DynamicActInt4WeightCPULayout", ] diff --git a/torchao/prototype/dtypes/uintx/dyn_int8_act_int4_wei_cpu_layout.py b/torchao/prototype/dtypes/uintx/dyn_int8_act_int4_wei_cpu_layout.py new file mode 100644 index 0000000000..24cc02e358 --- /dev/null +++ b/torchao/prototype/dtypes/uintx/dyn_int8_act_int4_wei_cpu_layout.py @@ -0,0 +1,318 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. 
+from dataclasses import dataclass +from typing import Tuple + +import torch +from torch.utils._python_dispatch import ( + return_and_correct_aliasing, +) + +from torchao.dtypes.affine_quantized_tensor import ( + AffineQuantizedTensor, + register_layout, +) +from torchao.dtypes.uintx.int4_cpu_layout import ( + Int4CPUAQTTensorImpl, + _is_float, +) +from torchao.dtypes.utils import Layout, PlainLayout, is_device +from torchao.utils import torch_version_at_least + +aten = torch.ops.aten + + +@dataclass(frozen=True) +class Int8DynamicActInt4WeightCPULayout(Layout): + """Layout class for da8w4 CPU layout for affine quantized tensor""" + + pass + + +@register_layout(Int8DynamicActInt4WeightCPULayout) +class DA8W4CPUAQTTensorImpl(Int4CPUAQTTensorImpl): + """TensorImpl for da8w4 CPU layout for affine quantized tensor + It stores the original tensor of dimension [n][k] (int32 dtype) as packed weight of 2-d tensor of + dimension: [n][k / 2] (uint8 dtype) + It is similar to Int4CPUAQTTensorImpl but with a different memory layout of weight data + fields: + packed_weight (torch.Tensor): the 2-d packed tensor in a Int4 CPU layout + scales (torch.Tensor): the scales Tensor used to map between floating point tensor to quantized tensor + qzeros (torch.Tensor): the zero_point Tensor used to map between floating point tensor to quantized tensor + """ + + def __new__( + cls, + packed_weight: torch.Tensor, + scales: torch.Tensor, + qzeros: torch.Tensor, + compensation: torch.Tensor, + transposed: bool, + _layout: Layout, + ): + kwargs = {} + kwargs["device"] = packed_weight.device + kwargs["layout"] = ( + kwargs.get("layout") + if kwargs.get("layout", False) + else packed_weight.layout + ) + kwargs["dtype"] = packed_weight.dtype + kwargs["requires_grad"] = False + shape = packed_weight.shape + return torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs) # type: ignore[attr-defined] + + def __init__( + self, + packed_weight: torch.Tensor, + scales: torch.Tensor, + qzeros: torch.Tensor, + compensation: torch.Tensor, + transposed: bool, + _layout: Layout, + ): + self.packed_weight = packed_weight + self.scales = scales + self.qzeros = qzeros + self.compensation = compensation + self.transposed = transposed + self._layout = _layout + + def __tensor_flatten__(self): + return ["packed_weight", "scales", "qzeros", "compensation"], [ + self.transposed, + self._layout, + ] + + @classmethod + def __tensor_unflatten__( + cls, tensor_data_dict, tensor_attributes, outer_size, outer_stride + ): + packed_weight, scales, qzeros, compensation = ( + tensor_data_dict["packed_weight"], + tensor_data_dict["scales"], + tensor_data_dict["qzeros"], + tensor_data_dict["compensation"], + ) + ( + transposed, + _layout, + ) = tensor_attributes + return cls(packed_weight, scales, qzeros, compensation, transposed, _layout) + + @classmethod + def from_plain( + cls, + int_data: torch.Tensor, + scale: torch.Tensor, + zero_point: torch.Tensor, + _layout: Layout, + ): + assert isinstance(_layout, Int8DynamicActInt4WeightCPULayout) + assert int_data.dtype == torch.uint8, "DA8W4 CPU: expects uint8 weight" + assert int_data.shape[1] % 2 == 0, "DA8W4 CPU: expects even number of columns" + if scale.dim() == 1: + scale.unsqueeze_(-1) + scale = scale.to(torch.float) + if zero_point.dim() == 1: + zero_point.unsqueeze_(-1) + + # Pack weight from [N, K] to [N / block_n, K / block_k, block_k, block_n]. + # Pack the inner blocks [block_k, block_n] to VNNI layout if AMX is available. 
+ # Pack scales/qzeros from [N, num_groups] to [N / block_n, num_groups, block_n]. + # Compensation shape = [N / block_n, K / block_k, block_n]. + weight_int4, scales, qzeros, compensation = ( + torch.ops.torchao.da8w4_linear_prepack_cpu(int_data, scale, zero_point) + ) + return cls(weight_int4, scales, qzeros, compensation, False, _layout) + + def _apply_fn_to_data(self, fn): + return self.__class__( + fn(self.packed_weight), + fn(self.scales), + fn(self.qzeros), + fn(self.compensation), + self.transposed, + self._layout, + ) + + @classmethod + def __torch_dispatch__(cls, func, types, args, kwargs): + kwargs = {} if kwargs is None else kwargs + if func is aten.t.default: + """we don't need to repack the weight and just rely on external + shape being changed and record the status of transpose/no-transpose + """ + transposed = DA8W4CPUAQTTensorImpl( + args[0].packed_weight, + args[0].scales, + args[0].qzeros, + args[0].compensation, + not args[0].transposed, + args[0]._layout, + ) + return return_and_correct_aliasing(func, args, kwargs, transposed) + else: + return super().__torch_dispatch__(func, types, args, kwargs) + + __torch_function__ = torch._C._disabled_torch_function_impl + + @property + def block_size(self): + assert len(self.packed_weight.shape) == 2 + weight_shape = self.packed_weight.shape + N = weight_shape[0] + K = weight_shape[1] * 2 + groups = self.scales.numel() // N + group_size = K // groups + return (1, group_size) + + def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # Unpack weight by linear(eye(K), packed_weight).t() + packed_w_shape = self.packed_weight.shape + if len(packed_w_shape) == 4: + K = packed_w_shape[1] * packed_w_shape[2] + else: + K = packed_w_shape[1] + x = torch.eye(K).to(torch.uint8) + x_scale = torch.ones(K).float() + x_qzero = torch.zeros(K).to(torch.int32) + w_scale = torch.ones_like(self.scales).float() + w_qzero = torch.zeros_like(self.qzeros).to(torch.int8) + plain_weight = torch.ops.torchao.da8w4_linear_cpu.default( + x, + x_scale, + x_qzero, + self.packed_weight, + w_scale, + w_qzero, + self.compensation, + None, # bias + torch.float, # out_dtype + ) + plain_weight = plain_weight.t().contiguous() + plain_weight = plain_weight.to(torch.int8) + + if self.scales.dim() == 2: + assert self.qzeros.dim() == 2 + plain_scales = self.scales + plain_qzeros = self.qzeros + else: + assert self.scales.dim() == 3 and self.qzeros.dim() == 3 + packed_shape = self.scales.shape # [Nc, G, block_n] + plain_scales = ( + self.scales.permute([0, 2, 1]).contiguous().view([-1, packed_shape[1]]) + ) + plain_qzeros = ( + self.qzeros.permute([0, 2, 1]).contiguous().view([-1, packed_shape[1]]) + ) + + return plain_weight, plain_scales, plain_qzeros + + +def _aqt_is_uint8(aqt): + """Check if an AffineQuantizedTensor is uint8 quantized Tensor""" + return ( + aqt.tensor_impl.dtype == torch.uint8 + and aqt.quant_min == 0 + and aqt.quant_max == 255 + ) + + +def _aqt_is_int8(aqt): + """Check if an AffineQuantizedTensor is uint8 quantized Tensor""" + return ( + aqt.tensor_impl.dtype == torch.int8 + and aqt.quant_min == -127 + and aqt.quant_max == 127 + ) + + +def _aqt_is_uint4(aqt): + """Check if an AffineQuantizedTensor is uint4 quantized Tensor""" + return ( + aqt.tensor_impl.dtype == torch.uint8 + and aqt.quant_min == 0 + and aqt.quant_max == 15 + ) + + +def _linear_int8_act_int4_weight_cpu_check(input_tensor, weight_tensor, bias): + return ( + torch_version_at_least("2.7.0") + and is_device(input_tensor.device.type, "cpu") + and 
is_device(weight_tensor.device.type, "cpu")
+        and (bias is None or is_device(bias.device.type, "cpu"))
+        and isinstance(input_tensor, AffineQuantizedTensor)
+        and (_aqt_is_uint8(input_tensor) or _aqt_is_int8(input_tensor))
+        and _is_float(input_tensor.dtype)
+        and isinstance(input_tensor._layout, PlainLayout)
+        and isinstance(weight_tensor, AffineQuantizedTensor)
+        and _aqt_is_uint4(weight_tensor)
+        and _is_float(weight_tensor.dtype)
+        and isinstance(weight_tensor._layout, Int8DynamicActInt4WeightCPULayout)
+    )
+
+
+def _linear_int8_act_int4_weight_cpu_impl(input_tensor, weight_tensor, bias):
+    assert torch_version_at_least("2.7.0"), (
+        f"Requires PyTorch version at least 2.7, but got: {torch.__version__}"
+    )
+    if _aqt_is_int8(input_tensor):
+        assert torch_version_at_least("2.8.0"), (
+            f"Requires PyTorch version at least 2.8, but got: {torch.__version__}"
+        )
+    assert is_device(input_tensor.device.type, "cpu"), (
+        f"For CPU device only but got: {input_tensor.device}"
+    )
+    assert weight_tensor.block_size[0] == 1, (
+        f"Requires groupwise quantization, got block_size: {weight_tensor.block_size}"
+    )
+    assert input_tensor.shape[-1] == weight_tensor.shape[1], (
+        f"need input_tensor shape: {input_tensor.shape} final "
+        f"dim to match weight_tensor shape: {weight_tensor.shape} second dim "
+    )
+
+    act_mat = input_tensor
+    act = act_mat.tensor_impl.int_data
+    act_scales = act_mat.tensor_impl.scale
+    act_qzeros = act_mat.tensor_impl.zero_point
+
+    packed_weight = weight_tensor.tensor_impl.packed_weight
+    wei_scales = weight_tensor.tensor_impl.scales
+    wei_qzeros = weight_tensor.tensor_impl.qzeros
+    compensation = weight_tensor.tensor_impl.compensation
+
+    orig_act_size = act_mat.size()
+    orig_dtype = act_mat.dtype
+
+    # reshape to 2D
+    act = act.reshape(-1, act.shape[-1])
+
+    y = torch.ops.torchao.da8w4_linear_cpu.default(
+        act.contiguous(),
+        act_scales,
+        act_qzeros,
+        packed_weight,
+        wei_scales,
+        wei_qzeros,
+        compensation,
+        bias.float() if bias is not None else bias,  # requires bias to be float
+        orig_dtype,  # out_dtype
+    )
+
+    # remove out_feature padding
+    orig_out_features = weight_tensor.shape[-2]
+    y = y[:, :orig_out_features]
+    y = y.reshape(*orig_act_size[:-1], orig_out_features)
+
+    return y.to(orig_dtype)
+
+
+# Register the concat linear fusion pass
+# from ...prototype.inductor.fx_passes import register_da8w4_concat_linear_cpu_pass
+
+# register_da8w4_concat_linear_cpu_pass()

From 677ed0cabf30e639fde4054338737a28f06473ff Mon Sep 17 00:00:00 2001
From: Jerry Zhang
Date: Fri, 7 Nov 2025 14:12:52 -0800
Subject: [PATCH 05/22] Skip quantization when channels_out / channels_in are not multiples of 16 (#3309)

Summary: The underlying fbgemm conv3d kernel for float8 only supports cases where channels_out and channels_in are both multiples of 16, so for now we skip quantization for shapes that don't satisfy this requirement; support can be expanded later by padding if needed.

Test Plan: python test/quantization/quantize_/workflows/float8/test_float8_tensor.py -k test_fp8_conv_skip_quant
---
 .../workflows/float8/test_float8_tensor.py | 85 ++++++++++++++++---
 torchao/quantization/quant_api.py | 7 ++
 2 files changed, 78 insertions(+), 14 deletions(-)

diff --git a/test/quantization/quantize_/workflows/float8/test_float8_tensor.py b/test/quantization/quantize_/workflows/float8/test_float8_tensor.py
index be5f2361c3..1b91875359 100644
--- a/test/quantization/quantize_/workflows/float8/test_float8_tensor.py
+++ b/test/quantization/quantize_/workflows/float8/test_float8_tensor.py
@@ -17,6 +17,7 @@ from
torchao.quantization import ( Float8DynamicActivationFloat8WeightConfig, + Float8Tensor, Float8WeightOnlyConfig, Granularity, PerBlock, @@ -25,7 +26,6 @@ quantize_, ) from torchao.quantization.quantize_.common import KernelPreference -from torchao.quantization.quantize_.workflows.float8.float8_tensor import Float8Tensor from torchao.quantization.utils import compute_error from torchao.testing.utils import TorchAOIntegrationTestCase from torchao.utils import ( @@ -329,14 +329,13 @@ def _test_fp8_matmul_model( @unittest.skipIf( not is_sm_at_least_100(), "Requires GPU with compute capability >= 10.0" ) + @unittest.skipIf( + not _is_fbgemm_gpu_genai_available(), + "Requires fbgemm_gpu_genai to be installed", + ) @common_utils.parametrize("dtype", [torch.bfloat16, torch.float32]) @common_utils.parametrize("compile", [True, False]) - @common_utils.parametrize("granularity", [PerTensor()]) @common_utils.parametrize("inference_mode", [True, False]) - @common_utils.parametrize( - "kernel_preference", - [KernelPreference.AUTO], - ) # only test for 3D conv for now # Inputs are (N, C_in, C_out, D, H, W) @common_utils.parametrize( @@ -349,19 +348,14 @@ def test_fp8_conv_variants( self, dtype: torch.dtype, compile: bool, - granularity, inference_mode: bool, kernel_preference: KernelPreference, sizes: Tuple, ): - if (not _is_fbgemm_gpu_genai_available()) or (not is_sm_at_least_100()): - return unittest.skip( - "Requires fbgemm_gpu_genai and sm version >= 10.0 to run " - "fbgemm kernel preference test" - ) - - dim = 3 + granularity = PerTensor() + kernel_preference = KernelPreference.AUTO N, C_in, C_out, D, H, W = sizes + dim = 3 kernel_size = 3 # Note: this is channel last memory format @@ -404,6 +398,69 @@ def test_fp8_conv_variants( f"Quantization error is too high got a SQNR of {error}" ) + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf( + not is_sm_at_least_100(), "Requires GPU with compute capability >= 10.0" + ) + @unittest.skipIf( + not _is_fbgemm_gpu_genai_available(), + "Requires fbgemm_gpu_genai to be installed", + ) + @common_utils.parametrize("dtype", [torch.bfloat16, torch.float32]) + # only test for 3D conv for now + # Inputs are (N, C_in, C_out, D, H, W) + @common_utils.parametrize( + "sizes", + [ + (4, 12, 64, 32, 32, 32), + (4, 16, 12, 32, 32, 32), + ], + ) + def test_fp8_conv_skip_quant( + self, + dtype: torch.dtype, + sizes: Tuple, + ): + """Some shapes are not supported so we won't quantize the module + Specifically, we skip quantization when C_in or C_out is not a multiple of 16 + """ + granularity = PerTensor() + kernel_preference = KernelPreference.AUTO + N, C_in, C_out, D, H, W = sizes + dim = 3 + kernel_size = 3 + + # Note: this is channel last memory format + input_tensor = torch.randn(N, C_in, D, H, W, dtype=dtype, device="cuda") + input_tensor = input_tensor.to(memory_format=torch.channels_last_3d) + # Create a linear layer with bfloat16 dtype + model = ToyConvModel( + dim, + C_in, + C_out, + kernel_size, + bias=False, + padding=0, + dtype=dtype, + device="cuda", + ).eval() + + quantized_model = copy.deepcopy(model) + + config = Float8DynamicActivationFloat8WeightConfig( + granularity=granularity, + kernel_preference=kernel_preference, + ) + + _is_conv3d = lambda m, fqn: isinstance(m, torch.nn.Conv3d) + + quantize_(quantized_model, config, filter_fn=_is_conv3d) + assert not isinstance(quantized_model.conv.weight, Float8Tensor) + + output_original = model(input_tensor) + output_quantized = quantized_model(input_tensor) + 
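+        # Quantization was skipped above (the weight is not a Float8Tensor), so the
+        # "quantized" model still runs the original high-precision conv; its output
+        # should therefore match the reference directly rather than within an SQNR
+        # bound as in test_fp8_conv_variants.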
self.assertEqual(output_original, output_quantized)
+
     @common_utils.parametrize("granularity", [PerTensor(), PerRow()])
     @unittest.skipIf(
         not is_sm_at_least_90(),
diff --git a/torchao/quantization/quant_api.py b/torchao/quantization/quant_api.py
index bca3a7cb3e..e3a75bbb3e 100644
--- a/torchao/quantization/quant_api.py
+++ b/torchao/quantization/quant_api.py
@@ -1821,6 +1821,13 @@ def _float8_dynamic_activation_float8_weight_quantize_tensor(weight, config):
         assert isinstance(activation_granularity, PerTensor) and isinstance(
             weight_granularity, PerTensor
         ), "5D tensor only supports per tensor activation and weight quantization"
+
+        # weight dim: (C_out, C_in, K1, K2, K3)
+        # skip quantization when either C_out or C_in
+        # is not a multiple of 16
+        if weight.shape[0] % 16 != 0 or weight.shape[1] % 16 != 0:
+            return weight
+
     elif not _fp8_mm_compat(weight):
         # TODO(future PR): this should really throw an exception instead of silently
         # not doing what the user asked

From 02ecbb786650983a105373c807f83802b50c97aa Mon Sep 17 00:00:00 2001
From: Daniel Vega-Myhre
Date: Fri, 7 Nov 2025 17:14:55 -0800
Subject: [PATCH 06/22] [mxfp8 moe training][BE] add docs showing equivalent convergence to bf16 at scale (#3312)

---
 docs/static/mxfp8_with_loss.png          | Bin 0 -> 47012 bytes
 torchao/prototype/moe_training/README.md |  24 ++++++++++++++++++++++-
 2 files changed, 23 insertions(+), 1 deletion(-)
 create mode 100644 docs/static/mxfp8_with_loss.png

diff --git a/docs/static/mxfp8_with_loss.png b/docs/static/mxfp8_with_loss.png
new file mode 100644
index 0000000000000000000000000000000000000000..47e2967aed57357bfa6ef2ab8d830f8cba7973f7
GIT binary patch
literal 47012
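The early exit added to quant_api.py in PATCH 05/22 above reduces to a divisibility check on
the first two dimensions of the 5D conv3d weight. A hedged sketch, using an illustrative helper
name that is not part of torchao:

    import torch

    def _fp8_conv3d_shape_supported(weight: torch.Tensor) -> bool:
        # weight layout is (C_out, C_in, K1, K2, K3); the fbgemm float8 conv3d kernel
        # requires both channel dimensions to be multiples of 16
        c_out, c_in = weight.shape[0], weight.shape[1]
        return c_out % 16 == 0 and c_in % 16 == 0

    assert not _fp8_conv3d_shape_supported(torch.empty(12, 64, 3, 3, 3))  # C_out=12, skipped
    assert _fp8_conv3d_shape_supported(torch.empty(64, 16, 3, 3, 3))      # quantized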
z^SckPnj+aRs_1~qvFGa+P>E1Hx9#LbFy(WK-{qmv*c0Zm|2X?Sz~~7*W<*)MJ!LGb zNlh(Yb!BG^k09*=V)XI^3m@Hy>eYb*ZaiH>ZdxAx&A;01i1>?KYd{(Diiu0F^s}uk zBFnKQtd&&{)6;)NcvQ3blE1T)NXUD)k&n|!Jz=uPK*AWF&>5j9X7Bv@!8a;39F1rJd|BC3m zd&}VofZO5%FTj0#yk)adFN155go$UyiD=&a{P7toOHsF-k1HZ~-Tn))eNtHBqXPu& z+i_T8A2(K?#NErfwv(*` zEW?i`>ag?vGlyexSlkzn&piB1XZP$Y-Mw`LXOEAkn}d#iIgpo=+YEh-dG{z_L3Wop zXA{H&jX&_U?P){nwQu-y8BFumM$yyOjo$s{DaG1KT1OJ{>m5BQu!oCvtvli41NR#w zIDOw^+2-AoHm^O^Kx0iEqQHksBNPNU76Tv3aj{nsR@Mp_k(aGcN?Ae~P@R_R??9v~ zcFp6t?7{2_xi=e(qZm&dh~^5~4BA&(a~+8b=nq%U`nPM(x6j{KwL?!=+_+(~4*4$A zx}IX#{Z3q$eaFY|=2ea2Y%0mx|*}%j67MxM|o&N1)}7j-qJrf zI0d-&$$0Sdxr{|r6}S4360jMLZhYLW_8TZ$XdFY_WnUl5B<_5PaSSSBq~JyE!6VJipRIrX{>{XlESnbz-1OT=dmWl# z{PgAyzPU1ssEQ(b!27ARq~*e2i^Fl1QS-|eF7GLtWR1}b(dgoi$Ge+bhy2dYT$If+ z?wHGKUf$7kPO>7r%1!Jy4?|-^J-e6dt;F-r3=}K3c;f!c;V~r6WNA>u8vG#98`y^K z+d++)_xW}j_v2Dy-+*G8#%G-&?`V8Z?+qF#*+|`CyX+=r$Cf9}n{0FQcE*=GV{W%s zq!-u>6}6ZvywgFucfToTqdi!LE5jUW+*owIVsCXA zKXMR4Z3l7}MAeZ;LBfPG=)>C2p&4CU^l2KLsM;0>w6sn$#_1Aev*9iI_Q#yG^QT&Z zD5!BL{bhVG!?=7u4Y-(qkXz1;X2591mYVyy^ENGekf@5?^VC_5Ny_I!9TFO}0xvuTY$lyqX?bJ}6cejESP)Y96D!7qLIF-h3{0MFxe zwNJ+?x91x0gaCNh0=8Ujcw%ElaH$?=WoB|=*iCbAtIkZASLdw7(Km9u0N6-laC0#oE@hBF%up$3`6b!Sw? z9|?Zk0`~t{03EL%)Q9-P{rS%4dcqh&KV$^C=3jf={AX*%-d*9E9>m$?>7;> zPkRykmsyYGYuB@7g7&NWx!x~`Y`(iiFy(=D;hY7`f5yhf%7=_I=$OBBstAzwI_nR{ zR&0!NgN!%&QIygJ5?F6vGP}gC0^4szPj~c;9#K7)-TB!&+q0Ki$Fl2;VdwRqDPeqC z+c=e@0k0c|c3{iJX!i5Pz(<}s%d!3%G6e;NySXMu)05>zJ8)x6>O3ZJuZ6t@|`{4o>?=7s4q6Bjmy>!CfoG}F5nsb=zWl3 zb>H{g1+wE$mMG*O7SHud93JLue>!lnRLKJ6hoRN@;aAQ*UqEp`P|q?f8$%_|A1D5@ zubRzeg=y5W4t1E9=I_^dDQMTQrk3UgbDALEA*WW0XzXVlSq|&%1JMihCb&uTO6GGv zGTN^$MwVYT0k!5sE#*3Gv4Y%IoHo7v1^0*0vh9kwe5k-U1cFSdBdL2;r z^9^Oc+}!e1Z{4^ka4RZF;JE&ivq9RtR>jU3Kcuj^IWo+Oy|CR6uW_^5AMEj^;~c5N z-d+hH6gIG%6Je8Sx2!or;#i(nR~D^PXX$j(8%Buim3iFCRtC|Qq9WfduLJe-D5tUq zzO!HeWs6hvad@o{q1|;Q?H?U&&AzMp8{8`>Dx{0Qq7Q0mNijvNLvOx)gV4Ou zahg|Cli+SU@48=R+0k|6;qAV>7cQMXUpCbZM)h2$L2~!s)V&2y+CP2S2XTo<${)m6suhS z*lE=)Ak>MKroq_`BQVPU^(+1oWDsf(F88vh&Em7qFE9U5N09wdM~4VTuhF5Xv^47J z3GPdUp|Xt)V>C!Ct2e(_QzG^f6D44wt3lOj68g_7Q8J z)TVJ0D)x{EkY_2lH3m8Gkwgvy#rHKrd=#o~*?IW*YJE@1pTq)pxK)fK%h! zoe#H4P)`79$_$#jbadVCx`bL=%M~GAJum|slj#YgD_2=E0DWqHabc!E9D@X^0_1V& zIZY(5&nq*b`8=kr)_)_GVbrfwdx( zp>_EG<~B7w_LdPR=ARN+!d$ULo)!HI`)hyAIflSm-pX z;mhWv^i#9*x)eV&e*Q2c*A%E6^gtf6hw(|otWa;O5`qs#Eo%QJ8H@@IMPB}aDtlT6^duQEG;)xA5h{#f^ zu=fA?e(O!;dosoLrI``jHldnNu;Ez;Vb3?V9w9u` zg2upqKl&wc?i<|zKt z_l>u1KRyt)m4!)fdN>(e>|{r0Xer%hv8($G(R27oEEztasi@L_32=to6;S@R5%&4B zXKK6_zr&69Xv!z90}0*ijVQsxy&)3Og+%2UOLhVl2Pi?LDN?@65+y#uxeJRu<-PB< zR9Puah3{xc^xja$GXG7qZ#19hE*whZc|N=D$(0a%Fpt|;N1`z!z0AcqlD-nn zt~^z`?`+`p;hKSLt=ul7@i2|*B5n|UaY}V3?{G(!?%k<(K`H1Gm*d^$1>Dx1QZWIN zx4jX2s>wu>TckjyI%SBFoTP)j~#&u zGlr|mM{+G?Cdy1Vn>axw);Am_400bA^j3IH+nO^+6{-%wruFosD$HZsP^eA4rN{@$ z{v}${=ikoAH_SX!j^3wHED|N76l2v7o<*pqA}$S72ODCB%&pof6k{EG)iK-9@6&!Q zd3cw+t!m#S5fm#MWvBCTpB8}d^ZNdkwzVvEV!_6cs(I-(BN@VRH(Ea#;W|qc9$Y;x zTvB16&l4UTS{5a2NEe=mM=KGn-xX!Sz#^rdb7&TzgIV`o>P|TXDtuu<6K_{!q3WTJ z{;z(??#{NxT2lF@Auvh4Y>u8BgnKMQL?@c{WopK_(M=O~WL=Er6VT-laHk0pwE4ljZg?7j{wv%vsdQ2rLRx z_+#oMMhlX#B;W#(_JXA273aZFR<<%Wa|ii!$ZUhl$(}w=Y%vSXt0JzWk#6xvTwwYf zaZz$pzyx-e;4Tr_#s6{V( z%zfZ_jx!DRc6U~nVvwvb-;z7*iYMFg%>Yg&rB#R!@v>W!&uOM-;7WczIR z^jX?Z3DQ{j=T1_UHevr@lV=t!SbFxe=$ zAec<-lHhyJ$GZ^ulUiO}AQ=ML#Mdrlw_LmB*(3ygp!z%J`In&pFQFM1&nktN`?bEz;izmkuOEri^*FUD21HB?LkNkf=6H+y8I)R)S9%GVS zvm>ul#;wf(Qep)Ofw8#9+(MHFzV(&&(q&bCGJV1kD!iRYzT@sI&nN8PY%)XqF#Th? 
zU#{g;2g;{X4zeul*B-casByCF7|BnQ#TLRx*U{Sl9379Y9Zf`N){XKr&lMVY|?+3 zHOfSbH28ys3-?^AbMPFdiv8S(Di2SZ4)~~T7&J3$pSGIpdL5Vtm`Hr}51#b%%2{hLq9iALh%gS!a1)oA)8 zvPc{@^PUg$XRhg0m~7uC;$% z1q$oM$u|6bh%)w%n*1WN_zB@>xyYskXEc_ZQ#d7m)egR-w;638{sAIX3hg}-nurEu zGVu$miWfCoj0CJT0rMO!H^JZkH$vUcvcJ#VYJSKo1WRTk$SkWd)DNQIZ!jfETkREB z{%TGzY-J=dqTu81auWYq+1}|XgAZie(%7lv3L9m)-(EVzz{+0-kYBA5Broi!iM@7 zv1QIkGr&ooE}5%52VqU0$l6;lyFqN(2Nb6gv z=8mhI{)(ezl^fB`d$8&kYKK-GeFA#zZ!&og7-5Ni|B42(n~^Ysrl_l%de!_@eW&fY zwR|!5uK17~6v?tWj^!9;yRv&!iaxz^BG=zcKU$@o9>C5g&1RBnYq>_IfEvP6)0Hb# z-`0)>z53dM{ErRj(-W{#MjXyO z!D#;JM#xjdu3kP?^jX5D8I2iv#`dX%=m5Q44H^jUVk!?^5HLs%5iv zc~o7bz9HxFK$jC^AYZ-3&)HB=_2g*FibRj&UG^884cn5>URa4(9A`8g9JR+^9rVC| z=1yqd0=IELMYoi0jDx}V`}xlG()JGe4H4xTs>!twvGbHqR#*YrRwu|3fDo7qh z)6_%(q*jB@o?}*%ov~~ ziYp;*Z7y%+z*UQPn6^z%rdazeKd^O=&gT1-BnKs; zYu}ECXVT zSkmW5syhtoX}yF_D|^nYDDMCj5XYZh-7s2~#7Zzyt&pG?qQrw9Y8Cs-b;Q7Z4i>HK}xV&Am zKBqsW;a{1NGn{~S{Lk7OVO>Fdc?^4$h?`JlutcX24&N9G@-2-0@ASE9(QAd2EA20o@>;s~+4|mX&6v9Q(~+S>gcX@|3VHGb zu1pRkwY4&!J>2aN;uBNLi)kBtE6Xcp9kwUAah^=~yYNTK65i}5tLk{TI$?vl#Y`@K z&p1<;&vcg~E)1yYEaXWet&I%cVoC9;>Ud|tsKD8NG6H^HT+M#xMKVRT6q2=>M2;@- zvW!))1ldomeRGa-auyTb#mDPFQR4_>wq{HjuJC`5i!!Z_C@JMI zjenkbD=MPglm#MA>4V?>LA%~c_zNB@OftzVLunKX#u%Mz2{P*h?yY&MY)t90KV1tR z7NLqADbIgDufn`eJ#N2SM@xB9Wou!o9#5iDu{K=Ncpha_B(gH0oNIA0HqGn>nFcrK zFCl+}@GxLuJ+lb=v%)H#543D&s}t{l9gnsjGAz;+CUczL=phy7ao|B!U1z5YKOkEp zalhkF9s1zTgP^}A=B#&s&?ctkZ~86fk}qfM>@&EvcqHI_O#~Kl$31f?t?by)5oCIT z0%z|6-r>)`^p-`Jk<@JT{)VCP!Y@jDZlK@nR4rz0WnpH z;y>6)DJCcO#l^dmqIa(vFhR!O|Ip)1>T$F*b`mZ+nj@MN-0KHJoRz85aig3lpXKdp zsUccwtoG~+6)k<MA#(g^lDBkwLYu7e1QCKL*EC< z8{L3{6)f2=-CWnv3z^V}-NZB(@R^mfle;T8H&jt;sz=%12@1~P0^v66D6r<%ofmKc zj*UdGd;SYg{2@OhI^mD8ODoGlA)O4!!(5ZY=;|6G2Ajc(C?3H(%SMdll9H0Nr7^Jf z&9JRf7FSSjFWg2H0<%^{QGI=S@RRQDz&aLHF|JE4xqm(#Gh^QHQB`j4w(%QFG(vinOD(|!J7^g2z!{NZd`wnXEJKErQCn$y! 
zJE-R5%+w<^?#R(ZGdk1ze$(aIt)KM8{tHnODxBsv5(7%Q-9hAEIZYzW~U)R&A76DXHqEejVB%lye|ObxNi`&T}~DSVy7J=XVhASeD@Bb z)QA)ky{;WOu1VpNR8#&d=cHchZjTB$U5|+v29-|eS`}xw8>Gd8*r3VtSjCmPq7pAr z(hv+5!NeXqS&f^Y*F=Sj0!}HH)T}_!Uc-o{zO^iBPz?uq-$>1b|0O5M-(%4~2xK`1 zqNBO%7-7VO*@f__=2M`jO$~+hW=sY-glLW<1UG(MRiX4H2{K9Jf1#jrIhnOb$;;59 zITvZ|{O_}y^Trl7>*IF!6bea?5U?^UT#1Hw3CX(up~97@7iblUsX^Bh!l`>WGpZ`Z zUsg8)S`{aIf5bl()+tB%ECzod?4)PJ-oIAnYc((4mhI+W>+4Zlf~5iwH=r+iFhbS= z<{X8Kt{in!22~!K5+aq&Rnp_Da<~pAOZbXTRr&f-FElotMQX$=`K){u?RbltOqHNC zHH}McE&wf@G>MbI`x^)fEMc-&%k8RPL4dy4Ws|yj4X(>zfkk`X&G+J1ji+irEK(o$ z>e{+nRmasTGWl2FYs4o&r&)NVl!su5s^k`iJ^1tq{=n@b0_T~`**M{(bDCH`C$^V2 z+(fO(9hvz{<@Rr8wmxTlbSaF7JQ%#<#Lnr~R-e6tlcK`HMvVdJzISvaw#Bt^epgf4jxGDaR{1@)z(sd&)|W3PnMirYZ~D$sKQ607 zpv6X2TT-WO*;7+_R*Z7Drsi?7qXEaw#QaTfsPQF)F$sS=S~&cRx;FQWMe(l`mv6#j zPARm#s6~0!NR5xHdZY^`lYAu*oFyl%p8{Ujp^7w)YjVC>wRE6Z;zfjHwAy17gN;wcnJ9 zsjdCGelcrP;VA`sE(Cq_QE!J@188-DxnFA7u>iDm@%`HF!(pXNfjpfZ4>7=E{n8t) zkkqer@x>xz33fY8LhbL&gq*SU5`dNF(JRYBV!uKKEWanzX zK6IONsehaUC<^g5ACC`f8ug#IynC}%#b(4T8{jL zS|2^V8d>{9D2Qm-7r5r0NjF^W)TYuwz8_9zdcC<$kEUm-5UfTJq>iTRge0xvQUEVD6U!Jc^` zhy?pcLYWIoR`MsSk?UA*v)k}ouMnJSeU#DR?Xt9w6=_JmcGuJl)4$bqLL4GH-1om% z)gE}aCKQU_PB5bN(QT!UhJ^2@AFK19QTFGm`5hjm z^56TRDlHStSZc7Rc`dbm*5I4qQe(m#rJqTh$>M8>{56K`>E~=e?v?^gQ&j0Vw$YQh zDC0ve?k7BO$l`)E`sc)?TvW-d)4FF65iD~;RQgpH8o)9Rwg!yZ*;JaI%rfl+l7hfC znZIDhmQAs|6D3@V4YH?b=lsok_q19K2$G1rkR&bhgXIJPojOcE8 zY_Y=*!hwPw72cy1N%~D1PkRJBj}ev8V0CSaS!pe?@4P-#oxh2o`{eAq^ado7c9%Ne zL5{_v;>j&zNTec5J+!Lg#uYx@id_6 z%vSDzV@StWD}+VeHBgz%Lp<$yv$avr^V9ET=7%2UO|J5l$gN)a_aP|Iu-m-pg;u59 z#y)AMOmnzI%guDee&&3~U4>WD1V1|mQwbGlCG%@>j^6vwU87ho|JD4kSgCSLBAN~D z0{XY!_krhe!ieY}uV!ES+P&~(IMV(w2g3Hr<~s-m5$)#ph8cGM`aqA{KPMW(r8GMA z)wGOUiU>c*=8|({5VuMve3*HF-=R@!2ZdPkgAw{&DmXB+5tcnUNL?-b`?axj8}g$3 zzr>Ec0(>!6Q(Mj!=vqJ->_mKs`7~P}WBx3(tbMws`~lIy6RqZY+_dRL@;)R!GPo?N z;vTxLXx^`IDq3j69Fg-Panp-GK64_Llo2UKFASzRXCBrJDk-KEaG)vFFwsf9&pxOj z6d)aDUOu*=?z9o~nC7c~^3P#f#5i{2Y9yn-xB#5RSc z4)DMij28R$maUtGMAmB^dBQrou|xY^(i>3Ax7?C|Si!GwFDREBicFk+VRwULgG?k5 zB!{#dm-;TC4%D)8V&B~?hCpm;?B0{FbcS{4)Fo%309beF8*MKVn7|1|?2S8i*MI#& z&OF|NUBuhU<$>DCa7PGpziJRvyI6QPXL{-cled9?nB~Xr>I1_xIRL?BUGnDluWups z*=N1C1xat+sATT4%l786$q14paHaP1JJsKUb)>NTvIc?5e7$k`5x`zuQLVI*bHt~{>|H^y#$2KnCC3;Bk zwC|^ZpIPw+fCGz}Jtxx$Xak-Xw z8|$seCRcePm*#sV^Lr!8x93ZFUV6;kasG;h4`WzVSX9`v7iAu)Wu7AA4KJqrP14 zpr_eyD5WVZejX#poD;gF>bq*&idzduc_3|>Gr+HaCER9l?@C624hx|=9H`V?cx85Q zG65)26=gjf=bx47YbjxgKYAAg(5rd;l3;iB)0K+Kjuv=Q8pehme)Gp(TyU`(_I`$L1jqFdL3Eb zPA{g)g$X*9v##eRKme=F0vRf5AE*?zu7Qy=<=qY|%3e!wKe8puCzHAx!hr8nUYO z`aQ#FV_QvHXwsvXWaIP_w@Obwe+bRV#nR~81)_7oO8-2&6n()jWAZzsjMN(hE*D=G zm~A2JBBwY=BP;J6>FZbHJ+Qp4-=lTE~*?-A7sgKx84#jGRFbhNMy zuCXRE&w`oNQsd5u(r^r(X@&;JLdOFrhAG~=4S$rL7m+sw68BnlA{YGR;Vnywj&--% z{$h?1G4-=v#_U@bP3s9j8?gW-H-MbDMHHw$@) z<0xKs@v*d)sEX6gM{ToWFR65)%t%MCgzkDos5%Qac`LN(VoH)~k`hbL@oblDWBOTz zzVSf)$CEl#ZfF`nVHm?yJdbeYQ#IDY64s!P+C`f%Z*pq!%6^eHHHEA#qn)XyN5{&x z=Q7aC8=q*DCn}r@p|k60%L=0a?hXCS=<-q|yk53x8Y>W|-}~L95a#(gE`f2Yo8UeO z#!j=da3xda%1u@KvM14fIi0U_DlZ&?mXZ~hl|>BKydxxw=?8RR9sck%iOt%6mLyIb zWQ96Ek(*YpB9lEOS+h%7*YZV;ZIxL%Kq^smg|=1mx>&R6X_BJf7ZY<_dj|(X4~Hh{ zViTCHO}Nxm2r=R2a4ZxwN!Ipe2(A<70(X9EG?N}U)X9hTbu8=?@`D_`-Wgj?7WeG{ zR>+!Eo<`m0Jl@ull@VSJB{#0z%njasf6tswUTPlWVseqchVR?v1|1b)Ub|*MSQMAC z8&}W;G8fte<1|!z-OM#in9o%@734s5;IJHP{BFg8JwmTF{W^vx>b8%lml0+JzEUN$l*}LndR%&QwY&y8 zaq+-Vtgwp+b&uYx8)&NrlqsabrSIxvmh)#!olAMSjtbL0edxBuCXunUu-nGEu~)>X z(g;%PA}Vp?f-M&GQj_?NOq+2wmo>tA5pGiiD$1cDT*P>ud13O^pqM4-zYAHFbW`w_ 
z(3m=p9j}bb`GDy1P|;su_SO;6Zk~*TJ83Y+!>SnIPQ57ZS#+aldGW!GCGoo~o_&Rbwb)R>+NYuH^RmFc4u$d`&_QYw!tx|xs*PvI}kRLAM%YsWc6IZ8K0 zc7Ho98=J|NK6afz&0;FPevu{lW zV)cxjt$B()oa*OO%(K@vL4RXS%Ggxs=tY`4?hot7#-hZ?S@BkDf{@B7lpDjPzMj** zZr@iA-r5Msb!7=l`0&((e-Rb7&12k)z}X?S;4U~tf-s%}d@2@y zAfAk)FpU}Z8!|6i(SKT{DJpdi+Ss}<1nr7|_Nq-RPbsJfC^x(3xYbCr!kOG%Zc59klOim6TNXiruij9K9|yk9(m`xSo}{-WueucX29)A1L4Hi3|-L{m1QS5l?v!cG$` zXc`IHIM$y#C?m&utCkJ`)-&Rs-?Q*JE)H%D;Z`dp75k6bt>}*LsTT=Uj@&#LwC~nN zqaZzLeo3|>J`SIJMG4gMo$&LQ^{)qXp@Vn(g4G-g5tvp1+V z-z#Rz*_g|D90h?Um!_n-_9yu{c#KbLkAR=#xJ?~=eEhp}tS&{iwA~TKdFygj+~|*9 zryJfdxuPUL9gV+r_p}j!OwpIy{o|~8hS$#CKw7+V{mJY_{O5q%%={T5A|F3}qDF3O zBLt+UA7rckZ!N$UXe9!sAD5FYS6qsw_QuJC#EOMQL4tg60Y;k(Qdp(QmDxfp~u7|8_xwH&7osGrBNK>m14$?Kdp2B*bh?*LE zC4+CtIe0`AY!z*r=Ot}i$PM`N(CG(aVleatToX?_0uyk7Sh&#x&CZvY)a-+IYaN$( zsED2ISI3EtBP00uE`IL&sL-eV&Q=l2IY%vaTl}_>kg+_RAGVyuo~EtvqQ$T%uDQ7R zM?AYOF+0W|i3mvt5o6nKDNHKXK^@3tsq{K#0v<;kvOcZeuPbOXMbo3Bqvu1rkTcNz zv^f`s{>zOJ8k*qC)leGg0%Yi(oBr+rZ2RL!EG+h2v>8_Lw})^aP_7s#tgb{L;L3?s z%K73>9lCFSwtHI+?)vdODMWTwJVBFcMnEg006lLAKBVt#`)O;$QH}=3@Af&me}faI zv(+u^{n_97MfUwcd(qBN9(m2p1MjElGjcNu9F5a8nT8UikOf>11a?8$y}Bc3f6&;B zI(ljfB1)@NFM)C5C29;Vk0(-@S;T_wPYha>5cE2(M`1zF8=BK>@v#>WCd>m8Qp@WZ zium2CuUmc7`eT(AXT9^lXkQx(=4Bk znZ(ehC`>IUQ>G}vos|bhQb%BF7#RWgae-c|{F*{c*-B|$6%-@@niK`+gDp;tA)4Ls zTyrn;WOJ&XNh#6#2EL>r9u7V+3A*WS=tBFx?H@7bUSl}z8S}fB^4wC0ngnX?ez*4w znL_By6H_j(NigRMx@Ixk+&nt8E~?23LbFm$7$Hbc&o;O8h%JclwYIgVBvR$z6l~cz zJn<&q*Ze*=N0&dzU}!@U+UoXQnybc3)=I$tg3x^*FTxiyK2mmw05t_>LNKBs{Qe$M zWPI~(QmRlx9+WK<>l*7PHa^)G@vBPF_2K_8b(K+3{Nb7skj|w;P+B^rL=cef?j@wV zOF+6rT1r4bng!`tx?8$Cq`TqH^1t_->jym^W`6VL^X3c_9F)SlZ}2Yoy>DN;x-wq6 z*KqL&t9r%^4MitXTbMX~zga-Q$5-8O^1M8-&QoG>`q>yME{p>7jj0V$2Z~G}3`42N zKR8K*Mmhv~3fFr)uGR$nu6klqQ_^Wbku6ge&I1UM)rlW)%gQSZ>SVLS!oqqfEuxJ5 zw?$Zey(zvxPWmdkm$1V=(vXWfg@uG5pXOnP_!6z8ueYt!OEjTIV_vxYsdm^|fycVo z$3~0_@dDt2n4A>e$jK>UcYH1+(dwX-2T-57)l*s;DG3DyKnH*k^77G}W2^#YjDK6t z&oP36I{3F!47Gj&q7C+mISK#dvuxqx6w%#{ytP#+C-=4b5izKPd!it`D+lnL`*nMf zwc~8NHOMB}3k^||mXvb2t-r3WfbZb*OBp`lWLo@|1+E1r_6%>MtG7jTl;tb^(1=`I(dbd zZ((j}*RNKlYtLMp#setSsU86nbJsz6XalEf%)$d@#2A0~lzQ4%2!Ck!zKUD+G%@ELEtSB|63vn+PMDPs8iE-pvbtep6mSq*#+<8uSDzk z>B7F`>FWdbWe(d*v@OmjHl@j+idLM}f})aRU0F-zFOja3%V z85dWTkmqR>eY5ZR1@`IU72EiXk|84pbJY}pEURbbY|eK};*yfx{BGQXb9&8=p=7m( z-{oGzLLYz?N}9Df(gQy=4J3IH_syNDtQ0#81HmiPp-`OL4423{vd_nX5Zaf`{xepjsL3DIkLZ}>zl6?<;;d7#{CvnTb(4`VN{mN7TtuK&TM)ntk7ViWNtyx-* zz@_@r^*WFpO@s)|mw|BBMB8S5IBGIq4f2#2@S<7E)=|?t5!EMfpFh5CO4(8+XKe^J=IX?2=cCZiQXkyu6nlUsS)JS3KRvw&D3Cc!YD( z#FN1VOf)0AlJsh)rk(PJZX&v9ur##j&;D(%-HG_&;6%0>dLl1E5|5^@8O9>*orhlYNBNH}tTi1`Dd6p_DR3Bw;H=k@<0T9UJ zjFnDRR)qsdElz)gxT9}fDoKwh+@tAkc-HJ{p0yt=Ftb=aDL%_N?{`uY`NpA7GKIqb zhTOmI70c+34Y53bri{NcI0Nge|PKEjt=IYBPx~j%$+=ACZ zY|y6u1>&HqkjRVyTbt=3udL7u6+8FF4BG-%nvxS=t+&io4_u%zepcmJ{gjuwLlM9+ z`;Ohj>CS+&gZ=#A!5FhQH*ZJ+gM-P>(9tolF#n43>!q@5_zM~cJ~8>@>x+!6jG`JE zw)DprjI__oUY+@7k<}iRro3UK668zHySjnvGJ!bFTeUC#KBzn?y_mmG)Omcj?J<6r zeiELS*7SmBTkn$G;Do{YXA=dg@%zTi--HaMF?Ez@es)!iDoMIxe+`8SA^Ld5VVD)OzRpq>GP$hYV$Nm3fsSEwj<>x?!H6NLS22}jKWK?E?T4zXMG zeGkr$b1qN>5Yv&A^dAt+w~7a*;p%+1;T1DQ=(E_q_i-O*xR1nJe%|OlXjJ;a5EB#a ze9=w8;2Vj3Tw?!)t)*dbbQBfNwsc63-}{Ft6D}nRfp1n??NOXUL6V$E)$9u(3GBPj zT?KpK%F>X~gQ7X16|L(|Q@QWJDooAw&PT7FcA@wFKBqU+7|~H0bQR4bx7S;zJo)(+ z8oqDo3i635ec|)uWS&}aT8_^C)W+7^B!!F{5Kob!1=GOnC0f`0mw~DPF6n(Q?ETjRQcPW3qV!ltj(5=QmRfqM)7&)& zaT3bONJ-2-@4VP`sdSoO8D;OFWxKxoC-0SY@z#5&9~v92$jQ&$J|uWm=l!I-c?mqt zzNNW4TK7t3#L}v7kB{?R_X3*OD*#z>o^CXuo7hyo{bwy=fz++o2!{*atp!L=jc;h+ z4wndw?}o1D*nk(M+% zAP5rvN#yn#{hvx9VODH)k{YL&w=P%d+o8le{d%`Zz#SpwxzUDCTjD^DkxZ_cNu)3` 
zcruw+tb`_UiyXq<{mrtZ;L1~UmjbM~~NaDU-DTp2l%$Pv|s=F|DB zi$d;c`^C+!8GgW_vwS_nlac@+@&u}x#4*TA0P7ug`&}tT4>ms?ops5nN2@W3$*P9L z)PP9I_8?Q7TwIY~;c@Kk>(*LkI z@aRsxyUGaZ_f`q;;);$Ir;W^s#tI@KZX- zAt9-$Y3on-{(fE12Tka&=4L{x7Qm)a=5=)W-8R^zuqdcA6gWhotuvhjN9lqc0f!1P zkRY|oeXEZnOL>zPChoRviE_UwzLYq1| zGb^um>HH(ZoIHZE%dI#7a;NjVnfA>T8J8il@i=`}gnA$__eRem+oDnw@tMhhkG883F(5-@PLyCU3t&q@%Y$MSyiN(SB0rZMqCqmdkZCAd5t#LARH$>2D=7uYG z6$Szz&5{s`+4cl#17#LuZazN8!}FJ{nSw*BnGbqG{y#o*%VaBQRkr>lm;Uva@q4($ zn=g#9uj0a1Jt^U)R8m`$tq)79H=I3TwI zM8BjzFEkCjjL~>JKOuUp`bp;pSK{4kyqEv>+Npn%>s51G0KEVl(d$F+U*A8S*3+Dw zp_2AZgFG`|fzW$Mt3gCx+mv48WmgMyWCtzY@ArrTwlvPi-uXD} zOKhE*p7{n(+g1)s^f>(Edfo+YVPv7<;maQ1kqQH-Z}U{I#VhCzJIm=}&h|7I!2)Q9dz`rV|?#mi9 zrL0_^bR>eUDBg|A+Z&4Rs&U@CEnd~TKXLHcEdWV>%bT=}0>;16nwls;8zB2io9Zvu zD&G;%sfdbI934-1rOAL3E~)8zU}3Q>lM&y}aZbLo_83I3*X$Q*U%p6#iV*P;Xrfh2 z{MI^79bsh=R1_dPZSwzC&StpYU#%;&iz!>QRe44rBc{Vz`yQgC=O*;tF8e(zQKq0F z13MpIPfLv34=(wI`lpn~0AnDSezy5tpmqLI?>#4FF%mm-bl;iK8iYT~*EM_u+A!FD zkKo5a4FqSQ+rY6nqPp=GI#?WrN#Et>-J*Vm1Px{P)x(?Btp8U)l*|x$3QuJ6p|FOV>ZEdNZz8K-f@Y`ZV3*2IO2oY;0S_ExFkF zw@;lK1~ffnRrBFT&OFY4C3gs;BpvuGXX3{sgGA}q9KF06ULy3gjj>JVPMZc#j{f^YgAijL55 z>1u=_C#S$6z$4EYxAEScDAS^rwLLZy>&-HFKhHPRa;mx+hs_z#5PA>;5KNQ}K1y;}4 z>$Z&P?^bH-O+sGx7S37(s3Sxq4r^lt7F|g)ph!>NF-#!qZSGk?^9#WCd+ zwCZ~z8f5D->-q;AR)!s2@+Y9HB!=Z9^|pDLa{B`h4MD%^yXJiX4a2L-?YcS0>RDyt z;HQI|8kPzz=SFR6yyzgJ>=fPl?J@vtdKE6|d@VKR2yCkdtRJwvH>0PLvW%zk5`ls{FJd83Vb^|M&5*{GH*!UgOkBi5oQ9gJSxYf8HZV2xPcnDom%W$j%j@ zJfIEBoZnzr&uiZY+7$9(V#ukdO`bAFF)F$e z*z^w90##Ms`Ohr7_K!5|Je4Ow_^{_YZ2mSW?s-)yQsdq@%anAH+c3vGx?c=9UoZMBUc=6OaPu?3D(v{@N+vwzBqXscjSx-{!g z4L{xN7eND!M+b@cW*pHSS(fL^S(&U1-7?)W}daz1V@CW1D z3I_WPP;8HON|uF(={XO*K;qkz31b;1awE*n65}@Psm-cY2ahQ{{@P!Qsm%H_9C=E- zpq^S}XbOFUfH7fWT!|10nC;1a}#msgX1EicNkoy;yFIsaAcs z)(1mjk^Aa3N@tWYwc*ASv0vMhPer69G>q{G)c{Ls`Wp!|%4;mC&a<5ZieJs8snXmk zH#>>$yvy(;k*5?a#6f2dm=wt{Zf}{1B$|D4?4w@cMdO7dA&Fg=RxrPy6|;;E(Z$oN zlVLYDm@`wo{LWqhcmG9sTe_3RG%`fG9+PAXGl`HN)&UJAf&hE2ZwlB`YWL89TBGHN zEi%Sev`IwEI8Mb3XyWo&W7NT|F_v6}4Mw#LeS=l^yhB!o>8ytq#_Z<1kDyiZzJ$Pt zJZmndvx0w~@!EcqYdpmd@y)F`NgvcvaLTsmW$m;d&Q$s$NvedX85gfLNYJg5d)^ES z2#v7TH;p6wnjY4I>+%q;VaMts_ki$Gz$~aYpOct;m8fhI{)6qy#4bGIP-hWh_F)#r zTpiwmqKxI#f9+T5v{PKHxCh`k`GL2eN9G8=OV@*wj{?|*W>rB%Hu4k05LWB@5(No3Zy&N z89A~=y#X^BS`cXOf1C z=2aM&(Z^kUn*4qnN^dHz&(G{VICzIqjto{#C--2rQHH=L93)vza(g@*31C`8z`lxL z*F^~PK=~%_87&Q&O084VC_v*YEHOrXzlPK^EM_+fioTfUaj>M4Tj6$R8Z%j}OT3L5c^*q9Fq^S4&J43{-lW$y;nKPZX3QUJV4#;$K#Lkyd;w3t z8HZ19q*N0P{nsVE$*^el&gdG~&I)Nfx?Io&F#I^zvmym0YXS4TedH-4gsRtmzwiH9 zm9v)%Ssbhn9BvnJfu@Y2yF5U5pd%;;-F}|h?H~!b1JtvL$9x8|X0cIcD%0cQ7v_?s z$)v`OT~E(#X~BY9zG5!!EzWfTV^mdX`#`mBI(oOyw5(Io(+w&EN5Zs~vpMuxLkmy0dgWcKvCNnl&R2BsfapMEHIZO1tDAq3%U7o5TBk$J z=Vh6{P%RAFT$PB< zx#Iv|(blsjLKKp4t&(UBPEc?l3PE_6iMq0@N7E->95^~C&Z9CZn~gX#OgS%-Q1D{9 z0wYg#BF0^jn~H=PghmqStnE0yF_fn=OtxzduX?Uhg3KY+HY<9Y7B`T>m%N~D6*QJh zJQR;^qtZ|g%1jH0*!#|^M7ULT?_PELCawB>ZfMUJ9%M}{rhYOV=*bsrnZ$JGiOFfm zH{I4N#6W854cuGv*X>;^k#98A(L+~k5ztjX1re{E@OKmE#G&rG zd~bP#pPSbKW>M0**7yl39+TRUbMI+U$!o0g8jetmCDblF_@$JVLl{A=Y#{+%#T{>9 zhJ|@DK3-r3Wm}^E-OxUHj{%j(vWn5xv)Stx3*yS7?qI>>EOOWI>(Vm2avC;DFJ$3k z-yG4n2{JK&Y?%j_GPS!^ZTi>X`8GLr=QJ6t0d4owi#%qz5%Zhnr*i+5_F4Sr+K=+~g zON9Qk_0;hr^~Lm}QcGoj#tNQ1jjBnC{C)$$3I+>`CI2gOpve9BnzW7WHnD4AXo%$T z0eUwLjIPH=l??0s4?L`&K2QDda2TRE-UkiFi2FdjhGEpisIh)B4&LI8gsR>)3x(^x zJJO!QjmYyhYdSwsKY>^av^UeqX`!fF@>t7XkVa0=-)8F-nidx&dCa%wI2pbE@EVVj zGMYtLo|`ngknya)?QL^YO;7Y>?Y9v#M`zX1N<+tBn0pm!R%hrJ=Gmy{{?=4D?T452 zkn@B_zKN|^1G9<*q6uB*{4+x*4uY(L7B=&c%NNMYi}E$dCJ#U7U9*gAeVN}l%lgNX*J-g_O20b$!ann5dnr%dQnsjVm4%b zSQ|9i3jC>3){d5^ygBZy``6ZxYqLp 
z3Za3ggql4UgX=2q5)mWlF5?pZZx`ScIu}!Ho`|Jf{%%_)vSz@|pu63xE?;4A#<^EP z-1l!tlp+v{H(X1T1U%HW37G%n<`wzc2=pq!y|OkdLO_$rQ?|#Gn@9(~aAHfS)nwV- zt|eQNof_OC=uvOOSfRuS3vT-?#VD_2tBA}tS&(_EGN)hia_i2KwIiEEmKrm@AlwvI zWy*776ZCtrnbBy?MYRcbn_4??I*;o&LOV+_X>nbL!yl(bH8aHcg(!t<>I4Lqe&A0v1U)0;g8b+J_33i|!8u(BnrCSalih>oDoH#K zsQP|9n#(A*=95<3#jJ!-w%?yyr@x1Bf7Wb;cU5U8EPF|dqm3wD7(X*Gj$ZCP%#&`h zZN?(Qqe_?A%`#}YZVSU(9)s$!#bP@5e!E_)DK2)NO<@hI;rgnE4-jIlx)8S#W+My9 zY}Q-^aoELEQ#woMylp|1q#x89wNdy{Wh1u$J1 z)YM0FFb6i2OhqsJeap~f^LC_It-O@ssaSY(A>GybzDw!l6M$GJn;QEu?nsfp2*A_wM%b|_QWsWQ9q#r8dES#84#HWxpE3nZr0 zUEkgnKZZBSxaQ~FAK8|i3oLj>)*3@7nMixOzHe+-b&~R#raUV!GZ0-njw8|1*%hau z?(A~U->$usLqo)FfD;6Z$p#j2MIGHHb!~Thg8YD>fB}`MX7Y#TlQrAl5}pS+P9A{RsrbY%|K>BAb*N~yqCdyTb=F_x+v_`zqg);^lLih;%09A;*hJVuBnLm4cH;%ZMy zR&y?;<|*M|EehnBNC!<1q)~0WHS0qd>++b1Bi9=1|55a|h{*P+LFMVX+1Z4W-p02dcYoP?D!`r?r~m%IeA zswu3zqVZOB)%^X>t!fgze)@0O4w|}OSD*{>$PE$~*jPHd24=oAG-)R82_F&>P<5B1 zYM(HIdo*97)ywdT!-zF4*DhSNxfr>qL6^z?z29n^<0#G3+KxehZqN7rzh{2{3{MK@ zt7uw7*hbYx53+1MTB10tN!PnnLvmnFyXV)g3XDZ<*{Og*6p<53zvu{fT$!YR@-r z17wxBEg77B>L^%ur3jTk3=U0h-{X7r;XhF-=nauKdQIEDT{57JFX`~0j3XMEw9$aw zdnp7c;FcbfHbLwA&kLMkHYMT%-6n(QW3g2lXKRh-v5X=Gh!5V1w%>|;fyuoW3j%QB zuBiFDQstb`a#4|E5$Wvj3PN=TAxs9dtw~jl#RNq=A3*rJ%zUCDf7|Is3@ zhIWx;5<0v_bVfi_x##cnEH~T1i9tvZ^^9kpQVSIh68xhqClXmE1n%>0H-A>|4w!-E zOSO*me@gsd+o)}QFl^1Y=LxcYo2mbBo)!mbl{{`xP$A#V_H7|86eItt#$9 zBpZ>{uzt9=ml(F=aI{u`;TOikWq{i62PtyBS1p+feRTa}{3(eu;}C0_F=fE_eL_K}UB=GVH{NtPaG8T{*GU z_9Qm^C{t_hKpSSqCiX^qE6{-64t zzsWRBp8bb)hbm*JQS$G1BX5&kZS=ACoGzU%vv-IbhjAmj;P@Auzx4Gv=m&9_V^-@$ z*0SYWE;$*y<78INxRPovhdgck=zj)$(`lv=m*RpZGUZz}Ykf{oH*p~KPhfvdl|JUw z(Cbz3S+Coc)m6GvIvz>&ie z&(Ce5T16JAfN8vUYQFd5yxE??XS1m6y!!CWpgU%i5)A%Cd_SMZ1(^RO4ZjblCT1KSh6K`h@~ywt%1eq(ROuml%W5s9qp;UQ*?1 zO-r79G_0GfzxeZA7iKALr9i-GEF-Y8<^<)rf*`LiuiYIVu8d9Cn?60K@nQQalc^D> z(2JTLPS7eM3H7+w7wXKfExI^JXR#ivUINr7MGVAB{`8zwSUoN@h z|Jk~rntL#(dZV4agi_lQkX<-yu?1Q##d3tS_x@Fp&MtYz&V4MqV1XK$?5VVuSv9r? 
z-8B>;Qpp?v&~U`(usJKV=~mTP3DC&}LJow(mo<48ix6`YXOSiV_l!>Rxo%ZteZ;M+ zu34S0|2bf{v|%HZ{_jSLg=THqqEq+RF*)u4+e4an!V-2npHhZKLp-0!k5sc@_;Yp{ zfZ5xbn&H3tFf?EeCz8>=c;4_2U& zd(sEn$f%F0&K`^O$Nzqu9Qi5unUj~OJ3ND-pau*KOXr5{_`$7_YZ>?OkW}LcY*yuS zLbd1f4=ak@mzIE57dd~Zf>pp!7@D630K8 zr{WMi^(m&MeCFM?8g~{=4NbjE%K@G)vy=RZ^`sr)@|LRCqM+8p2*x9geL(|Kkr&G0 z#V?iLZ-P*XZ03j89ags`&Z-Eg^k3S~pY}?@%wc25Mv_s@baA9ka`Ut0$*J2*Rt)}Q ztn@Z{j8)+V36Z4NUTcVQIA0fYU{_1j)F%RLZAf|ona3~gq+ZBX_#d}bO5Y&Sd$qTp zVY!}Zk5c38!RyH{o=3?mo=Dg&ua6`D5j>lX;8d9nUn4Dl=qDaLfhXs{T@Ez9MWR zYK8`XCO`E1;Aa;#6PyJWQfzycyNz{q+gDDIZk`Z3TZH?GJu0c-vHfElLdZ@qw0)+R z*B=2U*vjdNnzTG~->$Mgt$nQk#q(o9BbLX~*k0ps!)xZKg39OAa#jRLvEFU7PucAZ z1Noho{uCk+j<5?Qm%!JjYff}zJR(0(S^a;OZVuR@n;FJRSo#A;eoBYGQ|hiM?FGFk zL-nlFKhkH>%}ZXa9aQS8qE&;rKoxaA1xb4vuhAW1Iv*5MyKw}a0!+C>+7bil%{ZRr zXl-knpCcz|4=q%g3X~?eJ7f#vKB9NsyCP`0C3Nm+CSWH9JF<>ycs()|Rn^-SJ8pHds{CM~=;dY;4Y17~IuuXl7UXs1rgh1!neCdi0(ad-~u+b+7t%MWC zWiV|$Fm11g^gGKQD82W@??W`Y6`#IJY`;>`@qBiOwBFLw%?HUu@j?Qt<~q1Qxbjkz zyN;*Ko!~6oks+Ms6YuW3%x~&S&ll_dsu?Pg$*$o7w}p6Xr>G%b_BPH?-Rr7Mjk{fF zMlH5T7}k7IWJH}kiGx+CVfzGWV|E%Okr(E2n|&#AD3?J`$ZUVbrq@S)Km8Nscy_jE<%~jxCtZ7;rsix=717cO_A!~xClP4S&E-?xj3dS+MVp`ph=$|s0SLH0 z4s)H?n9u5J3k_5BCw>xXO3`e1>z<(oI%XEk2Go|T3USYfpr zSW}Dv3`s3{K|Xc%*>XPA`k<9qe07#$;X+iY={#0~^`1O=;Db^pZ%t@V%3EEu=Rn!` zzMjB}mah*G!Qa$=|BmTaTtSZ?+9+FFRP0aZ(6thjmRP|va1)m(8j(90K4rDL!LvG(K%;V8e=j3Ac)WQ6Fak50NGWhc$e0 z8DVTv!8Q^VReT(`57)cFd>llVQzgoWV8;KRA5cz#DW>U-D!x$(QQ`+%qVDewV%Lq% zDO%rwNwYxil}PU9QgVt1+TVp>-|cdalXEnBrA8f?a}hG5!^kLC?lvYYwbp zy~VD$(iR2Y44@;e26ifdom2`k&THY^^R1q!4s!o2n{^So^$P+MMqoA`GR!eYc#>E=*Lt5xgd=ym`nHdpYF%LVIK@idlZ26y52EQt8ai&6#U8RT(GtIP!d1 za;B7^M{tYsBv^DpIyoV;Np`p1B0q@_Y!>c$ zl~Dz(xmoocRBtK+*yTJWP{JW06nmK<6?w$9;{Q+9FRwEKN`tHg+>)87XP=wrD@^Z^p^V#GeupyHZ))|(yv zm+}<+U#Q`hHM!|w&l}*Uw!^&9%Rc>h8e`4ZSn;jnMU1XHl@ss+h{)3|r!S)T{XM;C z=asLJJ><!mHi`R`#QMDc z)Y1~)U{M<;+^F!lgUx;r%AOheX-XUdN4D?^*4~leP0l4(Wf{(JmuuACrlyRAd>c7o zds?#q=IUeP39EnETJm8_!IDY8^!i2r?rwZPLka*JzmlT+TVWtRugbaDb3!5Kp4Cl7 z)z16JXpeIP`A~J)Kk~Mb$~bFs-QwMe!o!;hp5&MbWN(<)H-1Y0XNH1hPyZ*4r+eiN zK686)YJ4yQB^G%518m1Co2?ZQWuV+ch=gTLR7-UA0B6m|nlo3L?)-;XO4j$s?z!?f zve0^putQ)#h+9Yj+c^8_m!wBBcEMnQ6JMKY#}}}<5l!RQVw;%Sl+u}~`tt@emVU9; z<@ut;W}$4DjNDpQ>EV5H>J=@~9-^3!SNcN*<#$T3Ir(eaYpLPULI>ZusCA_Y$;_ zP{Z|+C~$0ZtmQv7ZCIe|h#QWvsEQC)R# zV3wtmD~JDfW}hInMK8g^CS2tiVVrZFS}G zOidi;uDoeSR;h97(}{GAL5=ydJvQfxb2ZF{%1MF5Cqm=CJ6)Fw89$6MhXkov9^o}Z zV1($F3thPlt3Q_VnXzn47A>VS#C+BzGJWLf(xqBQrcxn3>Y)@0wOzsH!Tgpy^XOWT@ACvOpJLWdr zF)30)c#m1~OHtyjCJcyS6qA)SFeenwxq8#~{{))jO@w>!SZs-Hh@U^xp-H^M5?jG< z+R62}Qq56p3I2>G{>4)GOY`w`idx5-XF?gw3boz;D|SrDxYW`A=o1@twpSC1UA@e? z!{W%eVpND-!502if5p@`xopCE&qKF}I@2{0fS!~jy^9Frxsr9lPQxvvKTY#-w>5z0 z{cA}ul3r0LQS=>gPp;iD>k0M~-|1Zp{zUaReiy8GtImle$roal5I&uF*zKnvpzBb= z395u2tmJQnv+d(o-S%%+8xT_gkAx{=8s$qe2(^3_xKnM)R|u1U7$D^+%W>DMD?Ycd zH46=z^lN}}vGR)l=NQ)n1%LZ~0IkQYWneH%+#WR7BIYumjk90Tk-j{1E zB&pvE>+Llx=6MR{)3s``F$I5@2g|etZ<9wOeY5(ahIkkRrF|zL&5@SJE~p>^{gOf6 z8FPW7Ssc=1gUN~>{q$JtN-L%M#G|cGBP%**$zQbyM zNT(6?aVGJ zYidql-p%1EUpMQx27UniuhV}{W_3j%FyFT`ZLTd+JxlJn(CK}r3dj^A@q!qhKXm)! 
z@jp(b#`=39A*j61IZ5!me~1gRC@JlBH&S9wMwC-k2hjEn!8X z`*-cJmKcbpIT!>XX3N1YfJs8yijVnJBgy}p99VFy+bt(=z;8vX?t>f0$Pi=QYmV}*N-O0IoQRp1K_;#7fX7#99IoN#xsZ7vTNtq z41X?+Ub25lCCPPol8v|$=>m37DBw_F$;&e70`^O?h zdSOo8tOjh80lMgIa4_c9y`8T_wh>1!%YGq~EvAp0PxBOVAHA%4bxam0@gs*Q*aL5V zJo?iVI1tYHm|P;K9E*WK7yz|i&@8NL1(+qJ`dnP&E!Vc*HAx8?FQ@Gt!{a2hfPEn{ zgS0yf_)-FviiaN=5fVSO*EUxZc)5+dftWHSBXEJ-$YG2>zl8)}HYBzHy8+>%0IEn(g#+7zDyH1BuKlXb~mB)^L1mLCY{LIDdo5!A+VrUSTC^C6IM3^Ox=>=ii z!s}tJCMON?XFq5P2F4D<0#Vhnur3r})=OVGQo1x$jj1a(u(_Mwno%c8e-rfz$gv1p zct9$J0;deed(cA_{f^m9UH9s|5q z*nhZmm;vpcf+-wQp?Ft^ywY`oeZXxcw~V!z|tA81+hz8tIp9h#Ss1R=nKnSE)e~UiEOe$Umzke&QhkJ4J89nupdNyi&!(855|5K5IN4Cn9W0 zylgEk-2<^iD@Gq_Mdng`-jk?x#oMDqSYKxWjm9TeB#X2S>PbZ#wz>nKe3ubqm=?dn z5-$v6B&>*Z{(W%gcqvjF&!m4(K<#0Zip{9Whu!LkRPP=OULhRZn$u&JhLoC4UyVd^!&aZGi3)mEsL^UjT{d8 z31qrOK|HPjQlCOWFnM8yd`%-nT4mKPrJdb^{t0xELh+N5BG5;pUV^u!7=_{&Q@;w`-UaJij_5=d=v?y9Js2Ro_$7~8$o*?P!Mipag(`>B?}fi3 zRw+*lBxQ|z_(R`G#R2q{fH%F~)~LMlu^E@E|xGj~Sv z)cHz*$As%ii4onumgjb|+ z=&wI4C-p8r9ml@spqU z)bCiGLj72kIlpfz#5okn?dJ@$@&42zesRMRds4)%{sk(4jX>&>aML9ak}-IE&RAC@pZsHgcu1m?qUh@S za;}gkqF`NYjoc5`un+57-V8k^6LwKJIj8{@Ds;SppKiOgCwi6Tj_lMLFgM?@gJv9Y z=#!#y6nV&@@^$C4Kav`Fz$X#$b+D({TsTw#f7^c7+Sz%tvlET{BVT7cFxYgNwcoe# zM^9SB$o<0W_+ZO5bo@W^1|MSe+jN4+SWF`SIvl6*Ly*)4;E72W6G-f=|sb5s!S6;kl0KOCL< z(R_|^E9ZMipeQ1#kyvr1;25#<%OITF6f#;P%pz|Trb#RV3@*MHu45(P`%)O$KCZZ` z3}QIEJVka%f2^*`(inKEbSFAi+?ZO`XJYJZI$VmH$xviHwkRC*tonG103g$h0n@Ps zRi~UHXa!M1!<>W?#%F%YI6V5cXG>LdNJ;y|s876wci4-oY*Er$7}`KNH9J%Vn3*B4 zu~KEQe%q~+KJSqg*$ptLQ&4#U|1(u^Vb1I>eNi#{4Dq)dC0`~T7Qwt1Y6KTCefQw# ztF=U$r3K7%t8lGSiM-O}g$w*nsijv<4pAmGY5dCHMN2*=Zja4l{S2Y(Mta3nrTV%N zS(+YCS;KMCL}s!iSDyo0^DV+B*{!o^k)PG?+WGdF>OTUat-&i@H4hjfv(sPauaU z9{TL;pC|GZ9Ti$aT=6&bv2~eOYq3|b4uji#8Teu%=yTsqR7LP@Vz!37nj14>;RdNh zwI!j&`NR!Q)%p)pPfM$Lh^7xk!La1R~wrhQ1qnlZ6o zugNFc*E=-abE&8&43OJbw$Lv4^rTPTjpE$l?t6ZO!a=Rn`~0ZYPQQ%VU2UZc;i~bx z{-OQKK~-~eO#0VJc<)U0P3Vd>-<#gWR{ZKfGH(vG(?fT9IA6;-YhxHMlQ3)SN{~9gyNdl% z$SDi|9Io)9bp?J)*6cp~th3DL!~Q=L}YkMv4dl=5gF@ zxj^raLD#8wld!VBy#8|((Oc8|JAeu8ptiXSlJ%oPqcI#x5#bvW9#PY~8u$|J=Cv#f zv^Sh#0Q!B^4T(wge5(UtvBdZ>Rz#g>+cQuD_PTR)-!JewBhO29bg~eo9(0S7HB+vK zpV;SjTO+TQh#3u3UlG2=^BWN-Xkw|BND8Pb#|5Ody}GTz=gc&Ot9MN-=!$7&7I>Jk z^Qh1N;C-}WmwYEd{1ZxTnyTx+H>;=Eq_A5A-X`D8}rwD-uP*F2> zVt+W3_OoqdD$HXZrUDEwnVWC(`<-GeKMhZN9t`F3dtLPN#qf1cK!a3j7{y4y%KhB2 zM7F>;1@=17q^t>Vs68g=C+k_)QnB>@r>U=iiYr*Q4HjGi1ef6Mgux}aYk=VH?oROF zAp{E^f(Cb&0S0#sLy*DU-Tuja_x=A_vsh;hoSvRK-CbR~cJDGrv3n28A-!YBtETUp zS7_Ezh24LgorKfs%>5pkO#tyCGxH0I8^1V#O{nI@%n%LG?Jh~M%|+^`(~1TnX|fe) z2xxIfwFrcg05NoK-}Zz0Um+*VG<`xdRy0gT^V5cH$cGHOF>7o6AL*JCHcnu=J4q&R z-HEBpN<;$OY_D-F#K8RBLcHiX%_wqLcUBO>)9naQV=ZtbYf%TqnkAeL3PUje1e|fB z*47uUC6$mhbAB%ec<#k;UdW90SYHu$MD08b8LsQ8H&@_sk>-~MlU>8-GRpEYC);H? z(@r_v1v^Hh2EES*c16WIKR}pY<@#I<(>$k_|2c*a<_U-%h$5AmK_7tkuYf0LF8C?uU_A#W_0D$}0v zW1&SjrkA4vzDa`>XF;vz{m+-Sp#_P*pY0=) z7CeTFsYOMtC^Qpi(LI&)Dl)70XE(JkS@(3zZY9prTxboaYSfb`oYpnpUrL|bpXOg! 
z&Sf+L)(UTV^B(%rEXyC;DX~wa8s<|dhI6aXv762dHQ_x;OVmo&C0~G^FVHbzU+qp^+oW%$ST4 zSaxTm(D2o+biTX*9yh1XqL}xQw`TGy{{sPFvZV(M!5nHb-?Y>70i0W1R>6hXpi6!J zr?whKg5qML1t#WbK0${RU$I|n$QHs;M|b!`+i%%$q|q@PzayT>ln#&8sxD8_W`6{z z3%z=>m6l7hJ}q|XxqYL{-Q)4*Va($NtC3Px@wEp#_oC~7`@0sU5fBF^r5bM?b3-$& zuDyCD)3SqLHg&cNoErPb+tWqY$k(q=g>T%CnzipyVkYw5q3DOvC68~LMK`d)_Ya3u z=D19dEPW4EV!wwrI{#5uDZ=xMs+p4>edI0E1``XuN|2mZ{3>$GT1bK@k&-dVDS-ys zUYlTF6Hx!W(pQ?rtpiin0UagYwH0Ah|uJb+qCfWimS~Z#O2apGNl0~!Vv?*@>+(> zK6P)wkH!aIQrmPkAR;JXO>w=a}Bj@p_KNkp6T3Gs7He7W368~Ro7zAObqvu+K9HDxX%x=m%`Q?;3c zOoFuuwMi3^+UmHSkygEWF2Y)6yBc{D(d-U~z1$nLHZB7)#+wzL zr_Cdv_ik}L3E5$dMp~^MbXiW{3H#y&O7?@+Zmu>qP_o>0@9 z9Dho7iUL?6F=-HmJqFj&Oj^v;Hd_Q7)&RT7L}wz{OlCz=>{&#Mn(nt5U?^R^>?TqN zIn9#S6+t*8HqOra7mHh~Czi^Iog{SX)8kFERe-jRMx zKFK8)V2Zl~lP#%^u{)6dgc38eSREN}eU@|orm>=pB1vIg)hH}EIN_(4p>X8V1DW0K z`k+7?XT-}oM^DxIVc=_#0Dgr8lyuoni9gsjU9K^Y-4taVLg^eSEsbiXwa2mR&Fn3I zP`T-8cbDY@&dIjVi}@dNtvhJqc*jWRub2TVN%MVd{xhem$U>FTgd~B}$y(7#+;rH2 z+|Pn5ZY+zOF^LnIYG<>U6Z6KNUwDJ}iSE^)7tBIvJ-@i7(D!hezsz4~SW)UB3fr8X z-4tDOu?AhYa4I{D5fsU!vhm~90*Ty7_Z7?H0Yf$6N|^39PN?*k*IjX#fgb*1wA}o4 zAQYn)S>}hXh?$P~>e?^f=O4|2GKNGrzNWWJUH9d#JtC)Hi;gZLEla(AI>gR1uinb| zL=UgKcunhHcCl#QMSV4E{B?gN@_^XtH{6*|+-QpGpkV-|m{?!GZ0A5D57r-+T(f(x z{EH_=DnA|n()WmoGF}(0;yw&k#uWq0@v>1q!6>2KR_LHM68n$Zyad1!or!?CNoJ_} zsk6vcBU>){r-oyR7IeC?6*cYn50Xfm$%JN;0fS&Yg(DLNCl3gPa7TTJPw52f!)Lb) zPK-f%T>eVgiuzX!&vN(4#<)IATC5S)2$%UM--wqdQbNLLowD-nx8YkdbMrNQV-f>M z#X#cRC};aW^2%yhwaYA)fOE)QA5PbuPgh?A?kBx>j}MDU5t5U`bwnZjuHPPVi#f7Q z0-$ejFFvhP2Tu`jT`5C19$H+Md5oMBwZPIhbhr3T0Q>Zm_L zyCOP(ItqCTyf*j zpCIL=m{hU6I0|_5)1Ld@eWG{d+eyxOW~f??=-=xul|EN%Qe1X#)J?$s&#jcCg&REo_bTX7s zwVe=Df`9NsXmv~T1JymBdWIenre1)oUgF%iMozTV{W0T)z!;lVf zdpmnfQxfpR9Tw{`Z5WBjn;pVq5mdZOpHvSOFFYm3jsW*^+pxKk!Vuq=haQg7f$b^P$@~yO ze^tOF%msv#g#zD=A$0@`da(KFq~l-n^8;qA5>#SR*4~uzvLN!mnKy8V=zeKS%lsBx^g;UAky3^C~>g1XJHe`YV+*@&B}5!;dGiUCmKH%4YfZ>(w_dC2Isp-3s8^@t?!LVN$^T98={{#mCfESh%fhpMSljFgVpo&`xUe}u>bg&mCgdG}M6B*(hP zNgrU!4H#nw?HA&(kIci<(=d3mIk)wqKce%q&>d~DWGE6VPWYG<4q{wZTX)|qhR9#T z!$m%E!IYnR6{Zz*1hMP3k%tiVtSzgizR zX7oPhm~OQ_s!Jt)OUF}^|I{SLmm_uLW3lJYu6N}vI-&eLL@7~$QMZBEz${pjCln>O zEtp2xw1dKjQNy#~`iO>aB{z{^HZYE3z#k(IK)CSvqztK}CQX#$0~)(9TJjm48dFT3 zbHSjs4cjQEfr2s-G)d$k6+^JL;&(4$d8bE={p6SKODyvp`Vt6FpWr=8BPBI~YJR3D z#c!LueckTSgpldkC4affIZ=9pvv*f+V{^6WC8>c{cW0q4xwa^1RUOZK4C7hmCpm`g z!FPB?Ma6fJg`BOvQl8JSQkq0lAexDv`N`zSI(oqIHO_q|D$@{sN*CCFp%|l<8~Ueo zTp=}6tFOE11*n%t=;9W8GnA%SvT=}Y@s}KvPc4MVk>aQr^e~NWjH*hot|)QuHs0_~ z@Lu5I%(@U*b>B$XYq3hVbILzLh+Rasqq_QUyJ<%2F{eeJiEH+_>@q008{pTiWlL2I zB%3@XgIKGf1Vg{j$~kwY-I8D!dsg)5Pg`#&$oo8qg*= zRz0khw|{i^56N$-x)MG^dUMPX-*N&r%TQHGIQ0&`AjH7Dgi|xnwQvadhAk>8v3ymc z^;`I2*3@9F16$10%L!nM-_!&`U(M;okLXk;e;E*X@0J>%PpV2T5PncDQn7mZ4?+j< zR+fpKj%NLvDmks@YXKJR9i1y>o42~Dk7`+d^B1qj_8Q+S`^q%UMZ}HgzqLH>#Sn?p zj!|k%RwCjSs*f#AfODIJ|6?GeqT8F%lHh;MxP2&uXoQwIWX_Dv78=n{JH6(o-Z6}n zHBOaf5wmz#z-lBC5>NLosTBP+nkGAL5wyTjJpwgZ0GuH4?XTqb?V@V;jpiVvddc)* zT*Zd0UGvzlP zL^~Bh$B_}E2Q zT9@a+dmU7eVl3)hhm2A_FiQ|kHQfkR#^dD-#yGp##*cZ=ZF!Z#_jhkVVHYJ=4jhJsq80&ZyRQKL8*p6T{_!OErE9H~eZe`-)ozna zRmAm+0bg$tP1ItIg)@j)#FLntJl*;kkc(D5TvqqFcq#mLM6YC_&ez8h z2VyuKCV5mMged2QGOxgBnNdovj*HrFkj^tOhV`_FK5NfzTcg-phJtBBjA>@t@QU55 znI0MOq~_Sh^UUM{m79T0x}R^iI1w?l1xi}+D&bHh)$&N;O#=J>Fu$eyZcw`3T3DgI zY2U>5;;Fc})9}-k4bSK}Jd$4dr+D%2WGS^6C4vS!a>kDH;5 zDF%0u{BqP8ad-{Ai>4ojzg;> zn>eEz(AxPErhO7(KcdO3{F?3}Oxf$6E!*w?#5lHbyH(<3n!=gda~y=mu^{hrL1q>7 z4M5HR(4=Ce*qgc(PKNJ98@j@Y?sls;{P7)KSBCg^Dke5xK409CUnOq zkGmlFjP9Pfe5xQ3t10+-e<^aNf$d&{P4@SVk=dQE>8lV*<EO=fRZ#I$Dg5Fx+lQ70tvYY%R 
zx*o)qzdkC{&;0~Ghi4rx0F&4Sh1&VPam4h31}Rl$>aunoz9H%kr_vf}E>#-Ifi~IO zV0d}qe{-A?_-)rd>Q>h_@0J$mi2_%#FA+`J`JrN9RW`VAoMoNS5ofm@|B`rmDQ3I3;`ah959TL@?vDum8Ie2U4%7dQLxYLEcmM3L#KxaIF9qI6U zT7_r)fls6q?lzCQHm}%<9-yG(jlBB+SHUZGH#O4!vR*toMmYE@IqxSK!H`sFfPQL( zM)XdO;AnCbHrt(aKIekUx`Bv<+Eq<<)LKU{&GHjQtGBhs!+fQ0n=w8RVH^w1DPYFF z=`lmsteBllkfe5{%p!|G{!pmV<7$8#eM?Tp<{$$+%*u_VV?e9u)hj}3c_|4kC8rQR zBQ|}1no9>7vQxGodbZz7etWtpqm5){Y1)0CL~}hfTHf|IY)-v$G4NQp-kLjyeRVPp zGHT7%)#vbY!kDYWdu#9OeeE!Sb(E-G&=97;`ey-}FY5wi`yLP3%JGmfRCf5XI#lyp zBdgCaF5HpG7HIUdySsd{Tn<_1qfKi{_Tl+b29zy0hxovn&L%UyafAc&!)l)#M0ucJ z__bIHG>-b*vQ6N@JkGLUoxhtykePG`2EJ7Acmi(fPIV1NpQie3l^*dy^I9J@oVgP| zsiG;EB+l66EXg_l5ls*UC7C!&i!}Pt%~7j>TtP==!|?u#!U<<@*@#tVetl{t>ApgsJ z33dN1&^f#p1;>W5~pnSKPyJ~2c;u|jBu zk0CdHQ02h|*;G(DFqyr?)hPO9ex0Aq*=(sS)9T&h>pso=h`mlx7y~DE7~bj=p;@ zO}23p0;?}gH*1@%G^Vj~qy$<7sN=z3lr7|s1>)3IHq(tKepX0Q>KlFeX0a5>5uSi6 ztEVQV@_@AlfO8&^3bsiW!g}4m$5ivjI}$Z>2JYW_OsM#rNAWW!w2GqjzTn;OPM}PX zOL}5RS*aV-?HqA@Q&(GDWc4PvM{6(d1ec`B7mtcGF^(+hlK8UV-uWw#R&6HaL}&yY z`<<47ePcBPBnxUa{?tMqZHAmh?|(H!u4Fff7?Fmd*@?ps{I)N;kJRA+B@YNZ4m
  • _Oc}O2=c&ccEeaTRz_>rxd$ktm_}(ten54x4oW8qb?}- zqor+ldb1oj0h?z|pQ$oely;~D;g;3<0#GhzV9IG-0c};Fp7Hv;=LHD;%z`(XBTfby zyYV@i{w_c9078}1m)#lH;Y;EUOY9@#z^xAbJUt1Ai!CDUc?myWnCSAv&Bc*`X?Lq5 zn%{Wz?rg=75##xmZaH=UWto3_(mVQQ?qYP7S&dCIvo$6=vzv4g{YZq?y6VguYTqpT zLa9`SAePaa*%1;`thUs{I+}+SKyJoVqV1}TvL|e3!V%XVhPMAqa4=Z`%7TD@$iXVK z%DBHea*;SvL;~8aWphhmmEWpW!c9M;V$PM(EDf?L5RsH)mT22&_B|Ul>w_m>O+zwC z+LqE%OVl7g<38DKK%VtgD|46ji``@iJD%>UshMYwV(fjRF(?eqOKn!?FVm_oHUJ8i z6dp;~^pC|giVx=9%DwAdx<(-T{C!C$oSY~9(Om(QM@D>?GIGum2Zlu){jB)#_z2>1 zV}LnQtRcShpk@fIsWZO?(2^~9=IjBR_?$L**gk{HUSPWxp#-;VgCmZgml?tKDMXaU z>k83tldZgT9a4MSVzfNXI$qq>pX)=!kVD&kO>E?vrOT?kOQ*J=isK`{yictH-0coZ z$1(h^HsU;hAttfRQLyr0JM+STf;rP*_jG6P@Yh$rBDrBQkdT!ml7idBww z<2%b+$Uw2hNB>w@>r`TgF006H8Onha4eLsP>AHlDv@U}I2$uvsq957tt!efbh@J70 z5Bi2LR%chn%D`__Ha<6MZ=p1Q&@!N0D$6eLo@0trq;9P~q?j7{`2*b&C|Kmaio9BM zm$x}X4liZ5|25n)5z^7mR?6sl68b(nWR5vm>Z#~Pss<%cP%Yj8fB3Mpx5s+_1tT6T zWoJ?%;$2Q*>cIfh1P+IicD6jQg;wU85i=6zd?Fg0q(b9s6~!K5F;86juy%W zaU4>WMiI?SM5h-Cv?IAu>)VSj)XTy54L$qMw#NKroBOVSke3x!ie>K}m^NO?mfgG* zwTiX1Xv`MNZyNoWLQZ>ivXfb6ELre&(G(=b4e7HKHx)@N+c3-wwc_oNg@$%r6C*mh zI%%=SG(t8<+^PqvW)C|x4PN7!K%x)nj?h8 z{6R!Ef=I>8pyefd6D zXxXcc`?zS?Umc>&exow{MHJfV(N&Lf`ff>XDM)0*a0PO3m$}4$^Ek(xw$1!kgS1X! zpMERoj$04v(Gb)=b>d6&-5+FGuG8Y}Hc>v62pX$oKBQ(Vow6zk;-RRchts%4F zj$hrgqA~yEKx6xG#Rjr7uRHE4jrW)^E_6$QpHX|Mnp$X@7IB_;>yjgmKbikw>Dd11 z`2X*YSA0;rdd`*=%T3R`<52sM-n*P%KSg?oa#^|-Avn?!{ z52#!vvb;G>c(8JBDs_Aa7cyhI@UjTzKs|3{|K{s|Pbm*ec&b+U2C*^6#%5!yU=~joLw)q#f~{axKR7ofWHH zjHD2Jh5tlV0FW3T-GQK4g8bhw>tCexT$3sTGqPb>{1V#V(aL4ILCy9!*R7{WM6x$y z%=N*X6SCZs1hxX+vH3+AJjXem*n}T{bEdbnUHH=LZQ6n4CzGR_FlzkDqT_o+nh7G~DNJ70`9O6lLFHU!Av^fToyMI9{J= zn}VlED$=_2_(nvW``u28;$$+TtZd1&71npRo_q>cFJu^RoQ8F)+(=K>Q_L?MA6%O~ zvWyBSom2RhP#n7`JngSH?!h^RXQECH_8sF#l`qnZYN7-;7kU6g1Uo(m$WCm|;1&Cx zpHdC&-@k?3mQJ{kMo5+w@jn6SqzdD3P=p>X(2GuL%}+R`X!j}%~LWj>H3 z$B`D8Kc0hv``krBIX5SJ5QJ|oHw@Z+Qgp!^-M!eR@kp|et9YdX!;utb=dK4hLUyD4 z;PCNNX+wW$^cu=!nWy zvW?Z?3uN3H{uH~Es^BAA3X$r>5gq5XeYt8JgzYhaoVqm!N^m;@AcuitMK9k;Yg<`@yhw!E(gk0wa|#__a$0ofCBm1u;OZ79gk|E9u2_-AXKtFR&a8$ zaS|MLcL(2fm#A}R{4%oY;_eZ8lNdN`{IqpLM4q{(^KwQ4m@=4}qgq*fgZyKqmM&{b zdQOJp@r)iv=9k-`S7^dIk=gMo4casqmUI|ix*l2Onq81J2DEnH?hQpVEkWHL=Ms(ctE=Vc z$*<uib8xH=VjVV<*5beSZ7BY(?@2&kgiwvn1VtV z9^PIqyGc-cCD%>tSOiD zrW;e~4)t5Lz_2kDT!K2cX@B}XF9*S!wG+)y`eDF_c@?cp6mC=DbFll+es}#ab_9kL zOl|iHHT~=Tw-GBJSPg?0%ATCBQ97C7tgnYyBJ{vq=yMcb(EiXffV~|V-u9Dbb$muu zRE0e=zp5$Ig8Glc$t3odeqjhb{oR7`V=4>`Yn_!qL|Ij3j+i1ZUvp~G7yBDZ#Mi?PlXe(XM$Y!-?^kA2*`^1e;W zeTm%36$#add57|)e}k1Z_4a~P)^~rJ)T_+|j|dEzsZV;=IXtt$!z1hG{H!H6W1_L!K6v)#Z^A z{8${kQObbze#|C3Ua{9fnMs;CW3#(6s@Y?UWL~d+V{O6i@zS~r7*r1wBt}iw_&pz* zcbXG=#f>>iwZ0ueU+*65n+0{sac5XG>e^6%gg~m+);Yi?Fu!_o*@zK;(2_=8&&Vhf zc@BmnBXYPqJBqQV^(`dFNL0YJXz!85A%Fxcs6svayT%ybxFyf!l}C~So$qr7Lk0^$ z{;|31G1xN4|J+sd`I1)xN3_}>=QvIq`PlgJT2=UAa{~kGQo@kO^X%O7^=rQ&i*jt3z2P+2jP&T4VM*=%eS?;h>- zc^W=;TfeNBc0L>t0>-^tTP&WA0>N32=j>cs?=S+a>lQzRLVQ3XR-s{7+!(a#vre4j zzm58Qxg<2U+S>uSiweGtda3ga(}>~Z#g9Sv*o0<65!Y4D0&^EugAx!) 
ztO8SJaCBK1`uZWWi_mn#q(a($>~LAwx&2A>{p!x)U2OaF^#h=Te?Iu!(SEgCN5wX6V>hyII*#@tgPSv@U$lbMsZ3|UPFcp?D*>~!p>5do}JB*HUbkIXPZAH zWtPaPZa@jeS?IV>vtj|;qlZ?bHbaBhQNR!y3`ta~_6U^FSKj0w+MHVveZliRda(zB z%5%1o{D!3phAkcIKC}+(aSiGlbGD1HCMzSOJAE8Jb{6F{lA}LMk?|ApL7lZ6piWZr zDZQnDg1R3qdiHIrm4OlQBW%wOp1GxS_ca^*_6ZBqi6HE$#}Px%XHT*F8WaL>t4mvX zLes;W5k))qf)u@vf9t2ubQa0#1#PRAgFv~ZJA!ZcM4Rm0MT`G6b_DiYtrS@xsVby& zrZ~2EZpeAk&H6|F`j_yKXlC=?Z|5NSM-2MU%V{LrPCyDc&G4V}9SXQi*PaOMO|8on#Rx^s@vZ*N5Vg@9; zljLY$0g~c<8)p+DF9bc{#u8E5{XCL+d1~8p|C4(Z=Z(@_iES(ORUG;n5#qH(Y|JO<%!nw8^ALcz)uzFoY_G$jt zcJ#m2V>?Lf(Fp9b8UOoU=%lzq_WxKKP_>`_V)DODfYW6DpLsiZssBSh_-BaI!n+r^ YX^ecKoFA3KuYezUX%(r;4<@1i2dkb=i~s-t literal 0 HcmV?d00001 diff --git a/torchao/prototype/moe_training/README.md b/torchao/prototype/moe_training/README.md index 553e50f90d..befa53cc00 100644 --- a/torchao/prototype/moe_training/README.md +++ b/torchao/prototype/moe_training/README.md @@ -6,7 +6,7 @@ This prototype provides: - Using MXFP8 on a B200 GPU, this provides: - **~1.4x - 1.8x speedups** over bfloat16 `torch._grouped_mm` for Llama4 Scout shapes - **~1.19 - 1.6x speedups** over bfloat16 `torch._grouped_mm` for DeepSeekV3 671b shapes - + - These benchmarks use `seq_len=8192`, `local_batch_size=16` (so `total_M = 8192 * 16 = 131,072`). We recommend using a large `total_M` dim to maximize speedup. See [benchmarks](#microbenchmarks) for more details. 2. [TorchTitan](https://github.com/pytorch/torchtitan/tree/main) integration: pretrain DeepSeekV3/Llama4 with MXFP8 grouped GEMMs by adding the flag to your training command: `--model.converters="quantize.grouped_mm.mx" --quantize.grouped_mm.mx.fqns="experts"` @@ -14,6 +14,28 @@ This prototype provides: 3. Model conversion API to swap all `torch._grouped_mm` ops in your model definition to use torchao `_quantize_then_scaled_grouped_mm` under the hood (see [example](#model-conversion-api-example-end-to-end-training) below). +## Equivalent convergence to bfloat16 training baseline + +Training runs on 64 node GB200 cluster with TorchTitan Llama4 Scout show that MXFP8 MoE training has equivalent convergence to bfloat16 training baseline. Infact, after 3,000 steps it finishes with slightly *lower* loss than bfloat16! This is consistent with our scaling experiments with [MXFP8 training for dense models](https://pytorch.org/blog/accelerating-2k-scale-pre-training-up-to-1-28x-with-torchao-mxfp8-and-torchtitan-on-crusoe-b200-cluster/). + +Image + +Training and model configurations for this run: +- Model: Llama4 Scout +- Dataset: C4 +- Sequence length: 8192 +- Local batch size: 1 +- Learning rate: 1e-4 +- LR scheduler warmup steps: 2000 +- Parallelisms (64 nodes of 4 devices each = 256 chips): + - FSDP=256 (on attention layers, shared experts, dense layer FFNs) and 256/4=64 (on routed experts) + - EP=16 (on routed experts) +- Activation checkpointing mode: `none` (ideally this should use selective per op AC but there was a bug at the time preventing us from using it). 
+- `torch.compile` enabled +- `mxfp8` applied to routed experts computation (grouped GEMMs) +- `mxfp8` applied to all linear layers except: `output`, `router.gate`, `attention.wk`, `attention.wv` (Wk and Wv too small to benefit from mxfp8) + + ## Table of Contents - [Examples](#examples) From e4ecec02b81c05169aab4a688bee56afbc312212 Mon Sep 17 00:00:00 2001 From: Apurva Jain Date: Fri, 7 Nov 2025 17:50:35 -0800 Subject: [PATCH 07/22] Move marlin_qqq_tensor to prototype/dtypes (#3307) --- benchmarks/microbenchmarks/utils.py | 2 +- docs/source/api_ref_dtypes.rst | 4 +- test/dtypes/test_uintx.py | 1 + test/quantization/test_marlin_qqq.py | 2 +- torchao/_models/llama/generate.py | 2 +- torchao/dtypes/__init__.py | 8 +- torchao/dtypes/affine_quantized_tensor_ops.py | 8 +- torchao/dtypes/uintx/marlin_qqq_tensor.py | 359 +----------------- torchao/prototype/dtypes/__init__.py | 6 + torchao/prototype/dtypes/uintx/__init__.py | 8 + .../dtypes/uintx/marlin_qqq_tensor.py | 351 +++++++++++++++++ 11 files changed, 397 insertions(+), 354 deletions(-) create mode 100644 torchao/prototype/dtypes/uintx/marlin_qqq_tensor.py diff --git a/benchmarks/microbenchmarks/utils.py b/benchmarks/microbenchmarks/utils.py index d7300a6a81..2c6a443a86 100644 --- a/benchmarks/microbenchmarks/utils.py +++ b/benchmarks/microbenchmarks/utils.py @@ -218,7 +218,7 @@ def string_to_config( ) if "marlin" in quantization: if "qqq" in quantization: - from torchao.dtypes import MarlinQQQLayout + from torchao.prototype.dtypes import MarlinQQQLayout return Int8DynamicActivationInt4WeightConfig( group_size=128, diff --git a/docs/source/api_ref_dtypes.rst b/docs/source/api_ref_dtypes.rst index 5c73d275eb..58ad4ee8a4 100644 --- a/docs/source/api_ref_dtypes.rst +++ b/docs/source/api_ref_dtypes.rst @@ -23,8 +23,6 @@ Layouts and Tensor Subclasses FloatxTensorCoreLayout MarlinSparseLayout UintxLayout - MarlinQQQTensor - MarlinQQQLayout Int4CPULayout CutlassSemiSparseLayout @@ -53,6 +51,8 @@ Prototype BlockSparseLayout CutlassInt4PackedLayout Int8DynamicActInt4WeightCPULayout + MarlinQQQTensor + MarlinQQQLayout .. 
_NF4Tensor - add after fixing torchao/dtypes/nf4tensor.py:docstring diff --git a/test/dtypes/test_uintx.py b/test/dtypes/test_uintx.py index 5d54a80753..0878dfed4d 100644 --- a/test/dtypes/test_uintx.py +++ b/test/dtypes/test_uintx.py @@ -182,6 +182,7 @@ def test_uintx_api_deprecation(): ), ("CutlassInt4PackedLayout", "torchao.dtypes.uintx.cutlass_int4_packed_layout"), ("BlockSparseLayout", "torchao.dtypes.uintx.block_sparse_layout"), + ("MarlinQQQLayout", "torchao.dtypes.uintx.marlin_qqq_tensor"), ] for api_name, module_path in deprecated_apis: diff --git a/test/quantization/test_marlin_qqq.py b/test/quantization/test_marlin_qqq.py index e0733520ff..6f0f0d69ba 100644 --- a/test/quantization/test_marlin_qqq.py +++ b/test/quantization/test_marlin_qqq.py @@ -10,7 +10,7 @@ from torch import nn from torch.testing._internal.common_utils import TestCase, run_tests -from torchao.dtypes import MarlinQQQLayout +from torchao.prototype.dtypes import MarlinQQQLayout from torchao.quantization.marlin_qqq import ( pack_to_marlin_qqq, unpack_from_marlin_qqq, diff --git a/torchao/_models/llama/generate.py b/torchao/_models/llama/generate.py index da1b848bcb..fc3d371139 100644 --- a/torchao/_models/llama/generate.py +++ b/torchao/_models/llama/generate.py @@ -460,7 +460,7 @@ def ffn_or_attn_only(mod, fqn): ) if "marlin" in quantization: if "qqq" in quantization: - from torchao.dtypes import MarlinQQQLayout + from torchao.prototype.dtypes import MarlinQQQLayout quantize_( model, diff --git a/torchao/dtypes/__init__.py b/torchao/dtypes/__init__.py index 354692e794..4c83de7ddd 100644 --- a/torchao/dtypes/__init__.py +++ b/torchao/dtypes/__init__.py @@ -16,19 +16,21 @@ from .uintx import ( Int4CPULayout, Int4XPULayout, - MarlinQQQLayout, - MarlinQQQTensor, MarlinSparseLayout, PackedLinearInt8DynamicActivationIntxWeightLayout, QDQLayout, SemiSparseLayout, TensorCoreTiledLayout, UintxLayout, - to_marlinqqq_quantized_intx, ) from .uintx.block_sparse_layout import BlockSparseLayout from .uintx.cutlass_int4_packed_layout import CutlassInt4PackedLayout from .uintx.dyn_int8_act_int4_wei_cpu_layout import Int8DynamicActInt4WeightCPULayout +from .uintx.marlin_qqq_tensor import ( + MarlinQQQLayout, + MarlinQQQTensor, + to_marlinqqq_quantized_intx, +) from .utils import ( Layout, PlainLayout, diff --git a/torchao/dtypes/affine_quantized_tensor_ops.py b/torchao/dtypes/affine_quantized_tensor_ops.py index 3816f9bf1f..21f13729dd 100644 --- a/torchao/dtypes/affine_quantized_tensor_ops.py +++ b/torchao/dtypes/affine_quantized_tensor_ops.py @@ -39,10 +39,6 @@ _linear_fp_act_uint4_weight_int8_zero_check, _linear_fp_act_uint4_weight_int8_zero_impl, ) -from torchao.dtypes.uintx.marlin_qqq_tensor import ( - _linear_int8_act_int4_weight_marlin_qqq_check, - _linear_int8_act_int4_weight_marlin_qqq_impl, -) from torchao.dtypes.uintx.marlin_sparse_layout import ( _linear_fp_act_int4_weight_sparse_marlin_check, _linear_fp_act_int4_weight_sparse_marlin_impl, @@ -94,6 +90,10 @@ _linear_int8_act_int4_weight_cpu_check, _linear_int8_act_int4_weight_cpu_impl, ) +from torchao.prototype.dtypes.uintx.marlin_qqq_tensor import ( + _linear_int8_act_int4_weight_marlin_qqq_check, + _linear_int8_act_int4_weight_marlin_qqq_impl, +) from torchao.quantization.quant_primitives import ( ZeroPointDomain, _dequantize_affine_no_zero_point, diff --git a/torchao/dtypes/uintx/marlin_qqq_tensor.py b/torchao/dtypes/uintx/marlin_qqq_tensor.py index 04066a6c65..19d16a1e9f 100644 --- a/torchao/dtypes/uintx/marlin_qqq_tensor.py +++ 
b/torchao/dtypes/uintx/marlin_qqq_tensor.py @@ -3,349 +3,24 @@ # # This source code is licensed under the BSD 3-Clause license found in the # LICENSE file in the root directory of this source tree. -import logging -import math -from dataclasses import dataclass -from typing import Optional, Tuple -import torch -from torch.utils._python_dispatch import ( - return_and_correct_aliasing, -) +# Backward compatibility stub - imports from the new location +import warnings -from torchao.dtypes.affine_quantized_tensor import ( - AffineQuantizedTensor, - get_tensor_impl_constructor, - register_layout, -) -from torchao.dtypes.uintx.plain_layout import ( - _aqt_is_int8_reduced_range, +warnings.warn( + "Importing from torchao.dtypes.uintx.marlin_qqq_tensor is deprecated. " + "Please use 'from torchao.prototype.dtypes import MarlinQQQLayout, MarlinQQQTensor' instead. " + "This import path will be removed in a future release of torchao. " + "See https://github.com/pytorch/ao/issues/2752 for more details.", + DeprecationWarning, + stacklevel=2, ) -from torchao.dtypes.utils import AQTTensorImpl, Layout -from torchao.quantization.quant_primitives import ( - ZeroPointDomain, - _choose_qparams_and_quantize_affine_qqq, - _dequantize_affine_qqq, -) - -logger = logging.getLogger(__name__) - -aten = torch.ops.aten - - -class MarlinQQQTensor(AffineQuantizedTensor): - """MarlinQQQ quantized tensor subclass which inherits AffineQuantizedTensor class. - - To see what happens during _choose_qparams_and_quantize_affine_qqq, quantization and dequantization for marlin qqq quantization, - please checkout https://github.com/pytorch/ao/blob/main/torchao/quantization/quant_primitives.py - and check the two quant primitive ops: _choose_qparams_and_quantize_affine_qqq and _dequantize_affine_qqq - """ - - def dequantize(self, output_dtype: Optional[torch.dtype] = None) -> torch.Tensor: - if output_dtype is None: - output_dtype = self.dtype - - int_data, s_group, s_channel = self.tensor_impl.get_plain() - nbits = int(math.log2(self.quant_max - self.quant_min + 1)) - group_size = max(self.block_size) - return _dequantize_affine_qqq( - int_data, s_group, s_channel, nbits, group_size, output_dtype - ) - - @classmethod - def from_hp_to_intx( - cls, - input_float: torch.Tensor, - block_size: Tuple[int, ...], - quant_min: Optional[int] = None, - quant_max: Optional[int] = None, - zero_point_domain: ZeroPointDomain = ZeroPointDomain.INT, - _layout: Optional[Layout] = None, - ): - """Converts a floating point tensor to a Marlin QQQ quantized tensor.""" - if zero_point_domain is None: - raise ValueError("Please use ZeroPointDomain.NONE instead of None") - original_shape = input_float.shape - input_float = _layout.pre_process(input_float) - nbits = int(math.log2(quant_max - quant_min + 1)) - group_size = max(block_size) - data, s_group, s_channel, _ = _choose_qparams_and_quantize_affine_qqq( - input_float, nbits, group_size - ) - tensor_impl_ctr = get_tensor_impl_constructor(type(_layout)) - tensor_impl = tensor_impl_ctr(data, s_group, s_channel, _layout) - return cls( - tensor_impl, - block_size, - original_shape, - quant_min, - quant_max, - zero_point_domain, - dtype=input_float.dtype, - ) - - -@dataclass(frozen=True) -class MarlinQQQLayout(Layout): - """MarlinQQQLayout is a layout class for Marlin QQQ quantization.""" - - pass - - -@register_layout(MarlinQQQLayout) -class MarlinQQQAQTTensorImpl(AQTTensorImpl): - """ - TensorImpl storage class for sparse_qqq layout for affine quantized tensor. 
- - Can only be used with 4 bits quantization for now. - - Original marlin documentation and information: - https://github.com/IST-DASLab/marlin/tree/master - - Marlin qqq information: - https://github.com/HandH1998/QQQ/tree/main - https://arxiv.org/pdf/2406.09904 - - fields: - original_shape (torch.Size): the original shape of the tensor. used to unpack the tensor to the original shape - group_size (int): the group size used to pack the tensor - num_bits (int): the number of bits used to quantize the tensor - """ - - @staticmethod - def __new__( - cls, - int_data: torch.Tensor, - s_group: torch.Tensor, - s_channel: torch.Tensor, - _layout: Layout, - original_shape: torch.Size, - group_size: int, - num_bits: int, - ): - kwargs = {} - kwargs["device"] = int_data.device - kwargs["layout"] = ( - kwargs.get("layout") if kwargs.get("layout", False) else int_data.layout - ) - kwargs["dtype"] = int_data.dtype - kwargs["requires_grad"] = False - shape = int_data.shape - return torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs) # type: ignore[attr-defined] - - def __init__( - self, - int_data: torch.Tensor, - s_group: torch.Tensor, - s_channel: torch.Tensor, - _layout: Layout, - original_shape: torch.Size, - group_size: int, - num_bits: int, - ): - self.int_data = int_data - self.s_group = s_group - self.s_channel = s_channel - self._layout = _layout - self.original_shape = original_shape - self.group_size = group_size - self.num_bits = num_bits - - @classmethod - def __torch_dispatch__(cls, func, types, args, kwargs): - kwargs = {} if kwargs is None else kwargs - - if func is aten.detach.default: - return return_and_correct_aliasing( - func, args, kwargs, args[0]._apply_fn_to_data(torch.detach) - ) - - raise NotImplementedError( - f"MarlinQQQAQTTensorImpl dispatch: attempting to run {func}, this is not supported" - ) - - def __tensor_flatten__(self): - return ["int_data", "s_group", "s_channel"], [ - self._layout, - self.original_shape, - self.group_size, - self.num_bits, - ] - - @classmethod - def __tensor_unflatten__( - cls, tensor_data_dict, tensor_attributes, outer_size, outer_stride - ): - int_data = tensor_data_dict["int_data"] - s_group = tensor_data_dict["s_group"] - s_channel = tensor_data_dict["s_channel"] - _layout, original_shape, group_size, num_bits = tensor_attributes - return cls( - int_data, s_group, s_channel, _layout, original_shape, group_size, num_bits - ) - - def get_plain(self): - from torchao.quantization.marlin_qqq import ( - unpack_from_marlin_qqq, - ) - int_data_expanded, s_group_expanded, s_channel_expanded = ( - unpack_from_marlin_qqq( - self.int_data, - self.s_group, - self.s_channel, - self.original_shape, - self.num_bits, - self.group_size, - ) - ) - int_data_expanded_t = int_data_expanded.t() - s_group_expanded_t = s_group_expanded.t() - s_channel_expanded_t = s_channel_expanded.t() - return int_data_expanded_t, s_group_expanded_t, s_channel_expanded_t - - @classmethod - def from_plain( - cls, - int_data: torch.Tensor, - s_group: torch.Tensor, - s_channel: torch.Tensor, - _layout: Layout, - ): - from torchao.quantization.marlin_qqq import ( - const, - pack_to_marlin_qqq, - ) - - assert isinstance(_layout, MarlinQQQLayout) - - # Linear layers are (in_features, out_features) but the int_data that is reaching this point - # is (out_features, in_features). We need to transpose it to match the expected shape in the marlin code. 
- q_w = int_data.t() - s_group_t = s_group.t() - s_channel_t = s_channel.t() - - if not torch.cuda.get_device_capability()[0] >= 8: - raise ValueError( - f"Can not use Marlin QQQ int4*int8 kernel with a device of compute capability {torch.cuda.get_device_capability()}, the minimum compute capability is 8.0 for Marlin kernel." - ) - - if q_w.dtype != torch.int32: - raise ValueError("Only `torch.int32` weights are supported.") - - in_features, out_features = q_w.shape - # (thread_k, thread_n) - thread_config = [(64, 256), (128, 128), (128, 64), (64, 128)] - if not any( - [ - in_features % thread_k == 0 and out_features % thread_n == 0 - for thread_k, thread_n in thread_config - ] - ): - raise ValueError( - "Not supported `in_features`: {} and `out_features`: {}.".format( - in_features, out_features - ) - ) - - num_bits = 4 if torch.max(q_w) - torch.min(q_w) < 16 else -1 - if num_bits not in [4]: - raise ValueError(f"Only {[4]} bits are supported, got {num_bits}.") - - if s_group.numel() == 0: - group_size = -1 - else: - group_size = in_features // s_group_t.shape[0] - assert group_size <= in_features, ( - "Group size must be less than or equal to in_features." - ) - - if group_size not in const.SUPPORTED_GROUP_SIZES: - raise ValueError( - f"Only {const.SUPPORTED_GROUP_SIZES} group sizes are supported, got {group_size}." - ) - - # Compress quantized weight to marlin format - marlin_qqq_q_w, marlin_qqq_s_group, marlin_qqq_s_channel = pack_to_marlin_qqq( - q_w, s_group_t, s_channel_t, num_bits, group_size - ) - - return cls( - marlin_qqq_q_w, - marlin_qqq_s_group, - marlin_qqq_s_channel, - _layout, - q_w.shape, - group_size, - num_bits, - ) - - def get_layout(self) -> Layout: - return self._layout - - def _apply_fn_to_data(self, fn): - self.int_data = fn(self.int_data) - self.s_group = fn(self.s_group) - self.s_channel = fn(self.s_channel) - return self - - -def _linear_int8_act_int4_weight_marlin_qqq_check(input_tensor, weight_tensor, bias): - return ( - isinstance(input_tensor, AffineQuantizedTensor) - and _aqt_is_int8_reduced_range(input_tensor) - and input_tensor.dtype == torch.float16 - and input_tensor.tensor_impl.scale.dtype == torch.float32 - and len(input_tensor.tensor_impl.scale.shape) == len(input_tensor.shape) - 1 - and isinstance(weight_tensor, AffineQuantizedTensor) - and weight_tensor.tensor_impl.dtype == torch.int32 - and len(weight_tensor.shape) == 2 - and isinstance(weight_tensor._layout, MarlinQQQLayout) - ) - - -def _linear_int8_act_int4_weight_marlin_qqq_impl(input_tensor, weight_tensor, bias): - from torchao.ops import marlin_qqq_gemm - from torchao.quantization.marlin_qqq import marlin_qqq_workspace - - assert isinstance(input_tensor, AffineQuantizedTensor) - assert isinstance(weight_tensor, AffineQuantizedTensor) - - input = input_tensor.tensor_impl.int_data - input_scale = input_tensor.tensor_impl.scale - - w_int4 = weight_tensor.tensor_impl.int_data - s_group = weight_tensor.tensor_impl.s_group - s_channel = weight_tensor.tensor_impl.s_channel - original_shape = weight_tensor.tensor_impl.original_shape - - # Folds batch dimension into the first dimension - input_2d = input.view(-1, input.shape[-1]) - input_scale = input_scale.view(1, -1) - - size_m = input_2d.shape[0] - size_n = s_channel.shape[1] - size_k = input_2d.shape[1] - workspace_qqq = marlin_qqq_workspace(original_shape[1]) - - out = marlin_qqq_gemm( - input_2d, - w_int4, - input_scale, - s_channel, - s_group, - workspace_qqq, - size_m, - size_n, - size_k, - ) - - # Unfold the batch dimension - out = 
out.reshape(input.shape[:-1] + (s_channel.shape[1],)) - - if bias is not None: - out += bias.to(out.dtype) - return out - - -to_marlinqqq_quantized_intx = MarlinQQQTensor.from_hp_to_intx +from torchao.prototype.dtypes.uintx.marlin_qqq_tensor import ( # noqa: F401 + MarlinQQQAQTTensorImpl, # noqa: F401 + MarlinQQQLayout, # noqa: F401 + MarlinQQQTensor, # noqa: F401 + _linear_int8_act_int4_weight_marlin_qqq_check, # noqa: F401 + _linear_int8_act_int4_weight_marlin_qqq_impl, # noqa: F401 + to_marlinqqq_quantized_intx, # noqa: F401 +) diff --git a/torchao/prototype/dtypes/__init__.py b/torchao/prototype/dtypes/__init__.py index 52a5aec425..294c7d0b15 100644 --- a/torchao/prototype/dtypes/__init__.py +++ b/torchao/prototype/dtypes/__init__.py @@ -8,10 +8,16 @@ BlockSparseLayout, CutlassInt4PackedLayout, Int8DynamicActInt4WeightCPULayout, + MarlinQQQLayout, + MarlinQQQTensor, + to_marlinqqq_quantized_intx, ) __all__ = [ "BlockSparseLayout", "CutlassInt4PackedLayout", "Int8DynamicActInt4WeightCPULayout", + "MarlinQQQLayout", + "MarlinQQQTensor", + "to_marlinqqq_quantized_intx", ] diff --git a/torchao/prototype/dtypes/uintx/__init__.py b/torchao/prototype/dtypes/uintx/__init__.py index 89c1f3f810..cd333a90e9 100644 --- a/torchao/prototype/dtypes/uintx/__init__.py +++ b/torchao/prototype/dtypes/uintx/__init__.py @@ -7,9 +7,17 @@ from .block_sparse_layout import BlockSparseLayout from .cutlass_int4_packed_layout import CutlassInt4PackedLayout from .dyn_int8_act_int4_wei_cpu_layout import Int8DynamicActInt4WeightCPULayout +from .marlin_qqq_tensor import ( + MarlinQQQLayout, + MarlinQQQTensor, + to_marlinqqq_quantized_intx, +) __all__ = [ "BlockSparseLayout", "CutlassInt4PackedLayout", "Int8DynamicActInt4WeightCPULayout", + "MarlinQQQLayout", + "MarlinQQQTensor", + "to_marlinqqq_quantized_intx", ] diff --git a/torchao/prototype/dtypes/uintx/marlin_qqq_tensor.py b/torchao/prototype/dtypes/uintx/marlin_qqq_tensor.py new file mode 100644 index 0000000000..04066a6c65 --- /dev/null +++ b/torchao/prototype/dtypes/uintx/marlin_qqq_tensor.py @@ -0,0 +1,351 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. +import logging +import math +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +from torch.utils._python_dispatch import ( + return_and_correct_aliasing, +) + +from torchao.dtypes.affine_quantized_tensor import ( + AffineQuantizedTensor, + get_tensor_impl_constructor, + register_layout, +) +from torchao.dtypes.uintx.plain_layout import ( + _aqt_is_int8_reduced_range, +) +from torchao.dtypes.utils import AQTTensorImpl, Layout +from torchao.quantization.quant_primitives import ( + ZeroPointDomain, + _choose_qparams_and_quantize_affine_qqq, + _dequantize_affine_qqq, +) + +logger = logging.getLogger(__name__) + +aten = torch.ops.aten + + +class MarlinQQQTensor(AffineQuantizedTensor): + """MarlinQQQ quantized tensor subclass which inherits AffineQuantizedTensor class. 
+ + To see what happens during _choose_qparams_and_quantize_affine_qqq, quantization and dequantization for marlin qqq quantization, + please checkout https://github.com/pytorch/ao/blob/main/torchao/quantization/quant_primitives.py + and check the two quant primitive ops: _choose_qparams_and_quantize_affine_qqq and _dequantize_affine_qqq + """ + + def dequantize(self, output_dtype: Optional[torch.dtype] = None) -> torch.Tensor: + if output_dtype is None: + output_dtype = self.dtype + + int_data, s_group, s_channel = self.tensor_impl.get_plain() + nbits = int(math.log2(self.quant_max - self.quant_min + 1)) + group_size = max(self.block_size) + return _dequantize_affine_qqq( + int_data, s_group, s_channel, nbits, group_size, output_dtype + ) + + @classmethod + def from_hp_to_intx( + cls, + input_float: torch.Tensor, + block_size: Tuple[int, ...], + quant_min: Optional[int] = None, + quant_max: Optional[int] = None, + zero_point_domain: ZeroPointDomain = ZeroPointDomain.INT, + _layout: Optional[Layout] = None, + ): + """Converts a floating point tensor to a Marlin QQQ quantized tensor.""" + if zero_point_domain is None: + raise ValueError("Please use ZeroPointDomain.NONE instead of None") + original_shape = input_float.shape + input_float = _layout.pre_process(input_float) + nbits = int(math.log2(quant_max - quant_min + 1)) + group_size = max(block_size) + data, s_group, s_channel, _ = _choose_qparams_and_quantize_affine_qqq( + input_float, nbits, group_size + ) + tensor_impl_ctr = get_tensor_impl_constructor(type(_layout)) + tensor_impl = tensor_impl_ctr(data, s_group, s_channel, _layout) + return cls( + tensor_impl, + block_size, + original_shape, + quant_min, + quant_max, + zero_point_domain, + dtype=input_float.dtype, + ) + + +@dataclass(frozen=True) +class MarlinQQQLayout(Layout): + """MarlinQQQLayout is a layout class for Marlin QQQ quantization.""" + + pass + + +@register_layout(MarlinQQQLayout) +class MarlinQQQAQTTensorImpl(AQTTensorImpl): + """ + TensorImpl storage class for sparse_qqq layout for affine quantized tensor. + + Can only be used with 4 bits quantization for now. + + Original marlin documentation and information: + https://github.com/IST-DASLab/marlin/tree/master + + Marlin qqq information: + https://github.com/HandH1998/QQQ/tree/main + https://arxiv.org/pdf/2406.09904 + + fields: + original_shape (torch.Size): the original shape of the tensor. 
used to unpack the tensor to the original shape + group_size (int): the group size used to pack the tensor + num_bits (int): the number of bits used to quantize the tensor + """ + + @staticmethod + def __new__( + cls, + int_data: torch.Tensor, + s_group: torch.Tensor, + s_channel: torch.Tensor, + _layout: Layout, + original_shape: torch.Size, + group_size: int, + num_bits: int, + ): + kwargs = {} + kwargs["device"] = int_data.device + kwargs["layout"] = ( + kwargs.get("layout") if kwargs.get("layout", False) else int_data.layout + ) + kwargs["dtype"] = int_data.dtype + kwargs["requires_grad"] = False + shape = int_data.shape + return torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs) # type: ignore[attr-defined] + + def __init__( + self, + int_data: torch.Tensor, + s_group: torch.Tensor, + s_channel: torch.Tensor, + _layout: Layout, + original_shape: torch.Size, + group_size: int, + num_bits: int, + ): + self.int_data = int_data + self.s_group = s_group + self.s_channel = s_channel + self._layout = _layout + self.original_shape = original_shape + self.group_size = group_size + self.num_bits = num_bits + + @classmethod + def __torch_dispatch__(cls, func, types, args, kwargs): + kwargs = {} if kwargs is None else kwargs + + if func is aten.detach.default: + return return_and_correct_aliasing( + func, args, kwargs, args[0]._apply_fn_to_data(torch.detach) + ) + + raise NotImplementedError( + f"MarlinQQQAQTTensorImpl dispatch: attempting to run {func}, this is not supported" + ) + + def __tensor_flatten__(self): + return ["int_data", "s_group", "s_channel"], [ + self._layout, + self.original_shape, + self.group_size, + self.num_bits, + ] + + @classmethod + def __tensor_unflatten__( + cls, tensor_data_dict, tensor_attributes, outer_size, outer_stride + ): + int_data = tensor_data_dict["int_data"] + s_group = tensor_data_dict["s_group"] + s_channel = tensor_data_dict["s_channel"] + _layout, original_shape, group_size, num_bits = tensor_attributes + return cls( + int_data, s_group, s_channel, _layout, original_shape, group_size, num_bits + ) + + def get_plain(self): + from torchao.quantization.marlin_qqq import ( + unpack_from_marlin_qqq, + ) + + int_data_expanded, s_group_expanded, s_channel_expanded = ( + unpack_from_marlin_qqq( + self.int_data, + self.s_group, + self.s_channel, + self.original_shape, + self.num_bits, + self.group_size, + ) + ) + int_data_expanded_t = int_data_expanded.t() + s_group_expanded_t = s_group_expanded.t() + s_channel_expanded_t = s_channel_expanded.t() + return int_data_expanded_t, s_group_expanded_t, s_channel_expanded_t + + @classmethod + def from_plain( + cls, + int_data: torch.Tensor, + s_group: torch.Tensor, + s_channel: torch.Tensor, + _layout: Layout, + ): + from torchao.quantization.marlin_qqq import ( + const, + pack_to_marlin_qqq, + ) + + assert isinstance(_layout, MarlinQQQLayout) + + # Linear layers are (in_features, out_features) but the int_data that is reaching this point + # is (out_features, in_features). We need to transpose it to match the expected shape in the marlin code. + q_w = int_data.t() + s_group_t = s_group.t() + s_channel_t = s_channel.t() + + if not torch.cuda.get_device_capability()[0] >= 8: + raise ValueError( + f"Can not use Marlin QQQ int4*int8 kernel with a device of compute capability {torch.cuda.get_device_capability()}, the minimum compute capability is 8.0 for Marlin kernel." 
+ ) + + if q_w.dtype != torch.int32: + raise ValueError("Only `torch.int32` weights are supported.") + + in_features, out_features = q_w.shape + # (thread_k, thread_n) + thread_config = [(64, 256), (128, 128), (128, 64), (64, 128)] + if not any( + [ + in_features % thread_k == 0 and out_features % thread_n == 0 + for thread_k, thread_n in thread_config + ] + ): + raise ValueError( + "Not supported `in_features`: {} and `out_features`: {}.".format( + in_features, out_features + ) + ) + + num_bits = 4 if torch.max(q_w) - torch.min(q_w) < 16 else -1 + if num_bits not in [4]: + raise ValueError(f"Only {[4]} bits are supported, got {num_bits}.") + + if s_group.numel() == 0: + group_size = -1 + else: + group_size = in_features // s_group_t.shape[0] + assert group_size <= in_features, ( + "Group size must be less than or equal to in_features." + ) + + if group_size not in const.SUPPORTED_GROUP_SIZES: + raise ValueError( + f"Only {const.SUPPORTED_GROUP_SIZES} group sizes are supported, got {group_size}." + ) + + # Compress quantized weight to marlin format + marlin_qqq_q_w, marlin_qqq_s_group, marlin_qqq_s_channel = pack_to_marlin_qqq( + q_w, s_group_t, s_channel_t, num_bits, group_size + ) + + return cls( + marlin_qqq_q_w, + marlin_qqq_s_group, + marlin_qqq_s_channel, + _layout, + q_w.shape, + group_size, + num_bits, + ) + + def get_layout(self) -> Layout: + return self._layout + + def _apply_fn_to_data(self, fn): + self.int_data = fn(self.int_data) + self.s_group = fn(self.s_group) + self.s_channel = fn(self.s_channel) + return self + + +def _linear_int8_act_int4_weight_marlin_qqq_check(input_tensor, weight_tensor, bias): + return ( + isinstance(input_tensor, AffineQuantizedTensor) + and _aqt_is_int8_reduced_range(input_tensor) + and input_tensor.dtype == torch.float16 + and input_tensor.tensor_impl.scale.dtype == torch.float32 + and len(input_tensor.tensor_impl.scale.shape) == len(input_tensor.shape) - 1 + and isinstance(weight_tensor, AffineQuantizedTensor) + and weight_tensor.tensor_impl.dtype == torch.int32 + and len(weight_tensor.shape) == 2 + and isinstance(weight_tensor._layout, MarlinQQQLayout) + ) + + +def _linear_int8_act_int4_weight_marlin_qqq_impl(input_tensor, weight_tensor, bias): + from torchao.ops import marlin_qqq_gemm + from torchao.quantization.marlin_qqq import marlin_qqq_workspace + + assert isinstance(input_tensor, AffineQuantizedTensor) + assert isinstance(weight_tensor, AffineQuantizedTensor) + + input = input_tensor.tensor_impl.int_data + input_scale = input_tensor.tensor_impl.scale + + w_int4 = weight_tensor.tensor_impl.int_data + s_group = weight_tensor.tensor_impl.s_group + s_channel = weight_tensor.tensor_impl.s_channel + original_shape = weight_tensor.tensor_impl.original_shape + + # Folds batch dimension into the first dimension + input_2d = input.view(-1, input.shape[-1]) + input_scale = input_scale.view(1, -1) + + size_m = input_2d.shape[0] + size_n = s_channel.shape[1] + size_k = input_2d.shape[1] + workspace_qqq = marlin_qqq_workspace(original_shape[1]) + + out = marlin_qqq_gemm( + input_2d, + w_int4, + input_scale, + s_channel, + s_group, + workspace_qqq, + size_m, + size_n, + size_k, + ) + + # Unfold the batch dimension + out = out.reshape(input.shape[:-1] + (s_channel.shape[1],)) + + if bias is not None: + out += bias.to(out.dtype) + return out + + +to_marlinqqq_quantized_intx = MarlinQQQTensor.from_hp_to_intx From 865583b774166289185ffcd02f4f701c79389357 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Mon, 10 Nov 2025 08:21:38 -0500 Subject: [PATCH 
08/22] Enable `PerRow(axis)` to support axes other than `-1` (#3303) * Update [ghstack-poisoned] * Update [ghstack-poisoned] * Update [ghstack-poisoned] * Update [ghstack-poisoned] * Update [ghstack-poisoned] * Update [ghstack-poisoned] * Update [ghstack-poisoned] * Update [ghstack-poisoned] * Update [ghstack-poisoned] * Update [ghstack-poisoned] --- .../workflows/float8/test_float8_tensor.py | 65 +++++++++++++++++++ test/quantization/test_quant_primitives.py | 25 +++++++ torchao/quantization/granularity.py | 23 ++++--- .../workflows/float8/float8_tensor.py | 4 +- torchao/quantization/utils.py | 6 +- torchao/testing/utils.py | 12 ++-- 6 files changed, 120 insertions(+), 15 deletions(-) diff --git a/test/quantization/quantize_/workflows/float8/test_float8_tensor.py b/test/quantization/quantize_/workflows/float8/test_float8_tensor.py index 1b91875359..4bc106a60f 100644 --- a/test/quantization/quantize_/workflows/float8/test_float8_tensor.py +++ b/test/quantization/quantize_/workflows/float8/test_float8_tensor.py @@ -15,6 +15,7 @@ from torch.testing._internal import common_utils from torch.testing._internal.common_utils import run_tests +from torchao.core.config import config_from_dict, config_to_dict from torchao.quantization import ( Float8DynamicActivationFloat8WeightConfig, Float8Tensor, @@ -634,6 +635,44 @@ def forward(self, x): sqnr = compute_error(original, quantized) self.assertTrue(sqnr > 20) + @unittest.skipIf(not is_sm_at_least_90(), "Nedd sm90+") + @unittest.skipIf(not _is_fbgemm_gpu_genai_available(), "Need fbgemm_gpu_genai") + def test_bmm_weight_in_bkn_layout(self): + # Tests rowwise quantization of a 3d weight stored with shape (B, K, N) + # and contigous with that shape. Since the `K` dimension is not last, we + # need to specify granularity with `PerRow(1)`. 
+ + # only support per row quantization + granularity = [PerRow(), PerRow(1)] + config = Float8DynamicActivationFloat8WeightConfig(granularity=granularity) + + class Model(torch.nn.Module): + def __init__(self, weight): + super().__init__() + self.weight = weight + + def forward(self, x): + return torch.bmm(x, self.weight) + + dtype = torch.bfloat16 + device = "cuda" + + B, M, K, N = 10, 32, 128, 256 + + input = torch.randn(B, M, K, dtype=dtype, device=device) + weight = torch.randn(B, K, N, dtype=dtype, device=device) + m = Model(weight).eval() + original = m(input) + quantize_(m, config, filter_fn=lambda x, fqn: True) + + assert m.weight.scale.shape == (B, 1, N), ( + f"unexpected scale shape {m.weight.scale.shape}" + ) + + quantized = m(input) + sqnr = compute_error(original, quantized) + self.assertTrue(sqnr > 20) + @common_utils.parametrize("granularity", [PerTensor(), PerRow()]) @common_utils.parametrize( "sizes", @@ -1007,6 +1046,32 @@ def test_transpose(self): self.assertEqual(x_fp8.block_size, (1, 512), atol=0, rtol=0) self.assertEqual(x_fp8_t.block_size, (512, 1), atol=0, rtol=0) + def test_per_row_config_before_dim(self): + """ + Test that loading a serialized config of `PerRow` before the `dim` + argument was introduced works properly + """ + + # create a config with PerRow granularity + config = Float8DynamicActivationFloat8WeightConfig( + granularity=PerRow(), + ) + + # serialize it + config_ser = config_to_dict(config) + + # manually modify the serialized config to match v1 + # reference: https://gist.github.com/vkuzo/d347c4f8b8121819483d2d31e79f7335 + del config_ser["_data"]["granularity"][0]["_data"]["dim"] + del config_ser["_data"]["granularity"][1]["_data"]["dim"] + assert len(config_ser["_data"]["granularity"][0]["_data"]) == 0 + assert len(config_ser["_data"]["granularity"][1]["_data"]) == 0 + + # load the modified version, verify that granularity is as expected + config_deser = config_from_dict(config_ser) + assert config_deser.granularity[0].dim == -1 + assert config_deser.granularity[1].dim == -1 + common_utils.instantiate_parametrized_tests(TestFloat8Tensor) diff --git a/test/quantization/test_quant_primitives.py b/test/quantization/test_quant_primitives.py index 5f7895b4ea..cc6b7fff91 100644 --- a/test/quantization/test_quant_primitives.py +++ b/test/quantization/test_quant_primitives.py @@ -10,6 +10,7 @@ import torch +from torchao.quantization.granularity import PerRow from torchao.quantization.quant_primitives import ( MappingType, ZeroPointDomain, @@ -27,6 +28,7 @@ # TODO: remove test for utils? from torchao.quantization.utils import ( _quantize_activation_per_token_absmax, + get_block_size, get_group_qparams_symmetric, groupwise_affine_dequantize_tensor_from_qparams, groupwise_affine_quantize_tensor_from_qparams, @@ -844,6 +846,29 @@ def test_float8_blockwise_scaling(self): torch.testing.assert_close(scale, ref_scale, atol=0, rtol=0) torch.testing.assert_close(data.float(), ref_data.float(), atol=0, rtol=0) + def test_float8_rowwise_scaling_3d_weight_axis_1(self): + """ + Test scaling a weight with shape (B, K, N) and row-major memory layout + across the K dimension. 
+ """ + + B, K, N = 8, 16, 32 + hp_tensor = torch.randn(B, K, N, dtype=torch.float) + + granularity = PerRow(1) + block_size = get_block_size(hp_tensor.shape, granularity) + scale = _choose_scale_float8( + hp_tensor, + float8_dtype=torch.float8_e4m3fn, + block_size=block_size, + hp_value_lb=None, + hp_value_ub=None, + ) + data = _quantize_affine_float8(hp_tensor, scale, torch.float8_e4m3fn) + + assert scale.shape == (B, 1, N) + assert data.shape == (B, K, N) + if __name__ == "__main__": unittest.main() diff --git a/torchao/quantization/granularity.py b/torchao/quantization/granularity.py index d83032d7be..97d9c07b6f 100644 --- a/torchao/quantization/granularity.py +++ b/torchao/quantization/granularity.py @@ -39,12 +39,14 @@ class PerAxis(Granularity): This granularity type calculates different quantization parameters along a specified axis of the tensor. - For example if the input tensor is shape [8, 16] and axis=0, then - the quantization parameters are calculated for each row of the tensor. - Giving a total of 8 quantization parameters. + Examples: + * input_tensor shape [A, B], axis 0 -> scale_shape [A, 1] + * input_tensor shape [A, B], axis 1 -> scale_shape [1, B] + * input_tensor shape [A, B, C], axis 1 -> scale_shape [1, B, 1] Attributes: - axis (int): The axis along which reduction is performed. + axis (int): The axis which is kept, reduction is performed across all + the other axes """ axis: int @@ -76,12 +78,17 @@ class PerRow(Granularity): """ Represents row-wise granularity in quantization. - This is a special case of per-axis quantization and is unique to Float8 matmuls - where the input is quantized with a block_size of (1, ..., input.shape[-1]). And the weight - is quantized with a block_size of (1, weight.shape[1]). + Examples: + * input_tensor shape [A, B], dim 0 -> scale_shape [1, B] + * input_tensor shape [A, B], dim 1 -> scale_shape [A, 1] + * input_tensor shape [A, B], dim -1 -> scale_shape [A, 1] + * input_tensor shape [A, B, C], dim 1 -> scale_shape [A, 1, C] + + Attributes: + dim (int): The dim which is reduced across, all other dims are kept """ - pass + dim: int = -1 @dataclass(frozen=True) diff --git a/torchao/quantization/quantize_/workflows/float8/float8_tensor.py b/torchao/quantization/quantize_/workflows/float8/float8_tensor.py index a9c7af34b3..abb9ddc1f9 100644 --- a/torchao/quantization/quantize_/workflows/float8/float8_tensor.py +++ b/torchao/quantization/quantize_/workflows/float8/float8_tensor.py @@ -179,6 +179,8 @@ def from_hp( and _is_fbgemm_gpu_genai_available() and is_sm_at_least_90() and isinstance(granularity, PerRow) + # fbgemm path only supports quantizing along the last dim + and granularity.dim in (-1, len(hp_tensor.shape) - 1) and float8_dtype == torch.float8_e4m3fn and hp_value_lb is None ): @@ -475,7 +477,7 @@ def _(func, types, args, kwargs): res = torch.ops.fbgemm.f8f8bf16_rowwise_batched( a_data, - b_data.transpose(-2, -1), + b_data.transpose(-2, -1).contiguous(), a_scale, b_scale.transpose(-2, -1), b_scale, diff --git a/torchao/quantization/utils.py b/torchao/quantization/utils.py index db9a5149c3..1a0375f3d2 100644 --- a/torchao/quantization/utils.py +++ b/torchao/quantization/utils.py @@ -723,8 +723,12 @@ def get_block_size( f"Not all shapes in input shape {input_shape} are divisible by block size {block_size}" ) return block_size - elif isinstance(granularity, (PerRow, PerToken)): + elif isinstance(granularity, PerToken): return (1,) * (len(input_shape) - 1) + (input_shape[-1],) + elif isinstance(granularity, PerRow): + block_size = 
[1] * len(input_shape) + block_size[granularity.dim] = input_shape[granularity.dim] + return tuple(block_size) elif isinstance(granularity, PerGroup): assert input_shape[-1] % granularity.group_size == 0, ( f"Last dimension of input {input_shape[-1]} is not divisible by group size {granularity.group_size}" diff --git a/torchao/testing/utils.py b/torchao/testing/utils.py index a1dc40fdd3..10315d45f5 100644 --- a/torchao/testing/utils.py +++ b/torchao/testing/utils.py @@ -444,7 +444,9 @@ def _test_slice_and_copy_similar_to_vllm(self, config: AOBaseConfig): dummy_l = torch.nn.Linear(1024, 1024).to("cuda").to(torch.bfloat16) # making the weight different dummy_l.weight = torch.nn.Parameter( - dummy_l.weight + 2 * torch.randn(1024, 1024, device=device, dtype=dtype), + dummy_l.weight + + 1.0 + + 2 * torch.randn(1024, 1024, device=device, dtype=dtype), requires_grad=False, ) quantize_(dummy_l, config) @@ -456,15 +458,15 @@ def _test_slice_and_copy_similar_to_vllm(self, config: AOBaseConfig): param = l.weight param_data = param.data param_data = param_data.narrow(output_dim, start_idx, shard_size) - orig_value = param_data.qdata[0][0] + orig_values = param_data.qdata[0] loaded_weight = dummy_l.weight loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - # making sure param.data.qdata[0][0] is not the same as loaded_weight.qdata[0][0] - assert not torch.equal(orig_value, loaded_weight.qdata[0][0]) + # making sure param.data.qdata[0] is not the same as loaded_weight.qdata[0] + assert not torch.equal(orig_values, loaded_weight.qdata[0]) param_data.copy_(loaded_weight) # making sure param.data is updated to loaded_weight - assert torch.equal(param_data.qdata[0][0], loaded_weight.qdata[0][0]) + assert torch.equal(param_data.qdata[0], loaded_weight.qdata[0]) if hasattr(param_data, "scale"): assert torch.equal(param_data.scale, loaded_weight.scale) if hasattr(param_data, "zero_point"): From 2c109431bffa1d00315ccfdcb967804dcca29abe Mon Sep 17 00:00:00 2001 From: andrewor14 Date: Mon, 10 Nov 2025 11:47:34 -0500 Subject: [PATCH 09/22] Remove old TORCH_VERSION variables (#3146) * Remove config functions like `int4_weight_only` **Summary:** As a follow-up to https://github.com/pytorch/ao/pull/2994, this commit removes all quantization functions that were used as configs. These functions were deprecated in 0.14.0 and will be removed in the next release, 0.15.0. **Test Plan:** CI [ghstack-poisoned] * Remove old TORCH_VERSION variables **Summary:** As a follow-up to https://github.com/pytorch/ao/pull/2719, which deprecated these variables in 0.13.0, we remove them now in the next release 0.15.0. **Test Plan:** CI [ghstack-poisoned] * Update base for Update on "Remove old TORCH_VERSION variables" **Summary:** As a follow-up to https://github.com/pytorch/ao/pull/2719, which deprecated these variables in 0.13.0, we remove them now in the next release 0.15.0. **Test Plan:** CI [ghstack-poisoned] * Update base for Update on "Remove old TORCH_VERSION variables" **Summary:** As a follow-up to https://github.com/pytorch/ao/pull/2719, which deprecated these variables in 0.13.0, we remove them now in the next release 0.15.0. 
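For reference, a minimal migration sketch for downstream code (not part of this patch): it assumes a caller that previously branched on one of the removed flags, and it uses the `torch_version_at_least` helper that remains in `torchao.utils` (its signature is visible in the hunk below).

```python
# Hedged sketch: replace the removed module-level TORCH_VERSION_* flags with a
# call to torch_version_at_least, which takes a version string and compares it
# against torch.__version__.
from torchao.utils import torch_version_at_least

# Previously (removed in 0.15.0):
#     from torchao.utils import TORCH_VERSION_AT_LEAST_2_6
#     if TORCH_VERSION_AT_LEAST_2_6:
#         use_new_path = True
if torch_version_at_least("2.6.0"):
    use_new_path = True  # hypothetical flag, used only for illustration
else:
    use_new_path = False
```
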
**Test Plan:** CI [ghstack-poisoned] --- test/test_utils.py | 50 ----------------------------------- torchao/utils.py | 66 ---------------------------------------------- 2 files changed, 116 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index b46d600053..0e77388f13 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD 3-Clause license found in the # LICENSE file in the root directory of this source tree. import unittest -import warnings from unittest.mock import patch import torch @@ -37,55 +36,6 @@ def test_torch_version_at_least(self): f"Failed for torch.__version__={torch_version}, comparing with {compare_version}", ) - def test_torch_version_deprecation(self): - """ - Test that TORCH_VERSION_AT_LEAST* and TORCH_VERSION_AFTER* - trigger deprecation warnings on use, not on import. - """ - # Reset deprecation warning state, otherwise we won't log warnings here - warnings.resetwarnings() - - # Importing and referencing should not trigger deprecation warning - with warnings.catch_warnings(record=True) as _warnings: - from torchao.utils import ( - TORCH_VERSION_AFTER_2_2, - TORCH_VERSION_AFTER_2_3, - TORCH_VERSION_AFTER_2_4, - TORCH_VERSION_AFTER_2_5, - TORCH_VERSION_AT_LEAST_2_2, - TORCH_VERSION_AT_LEAST_2_3, - TORCH_VERSION_AT_LEAST_2_4, - TORCH_VERSION_AT_LEAST_2_5, - TORCH_VERSION_AT_LEAST_2_6, - TORCH_VERSION_AT_LEAST_2_7, - TORCH_VERSION_AT_LEAST_2_8, - ) - - deprecated_api_to_name = [ - (TORCH_VERSION_AT_LEAST_2_8, "TORCH_VERSION_AT_LEAST_2_8"), - (TORCH_VERSION_AT_LEAST_2_7, "TORCH_VERSION_AT_LEAST_2_7"), - (TORCH_VERSION_AT_LEAST_2_6, "TORCH_VERSION_AT_LEAST_2_6"), - (TORCH_VERSION_AT_LEAST_2_5, "TORCH_VERSION_AT_LEAST_2_5"), - (TORCH_VERSION_AT_LEAST_2_4, "TORCH_VERSION_AT_LEAST_2_4"), - (TORCH_VERSION_AT_LEAST_2_3, "TORCH_VERSION_AT_LEAST_2_3"), - (TORCH_VERSION_AT_LEAST_2_2, "TORCH_VERSION_AT_LEAST_2_2"), - (TORCH_VERSION_AFTER_2_5, "TORCH_VERSION_AFTER_2_5"), - (TORCH_VERSION_AFTER_2_4, "TORCH_VERSION_AFTER_2_4"), - (TORCH_VERSION_AFTER_2_3, "TORCH_VERSION_AFTER_2_3"), - (TORCH_VERSION_AFTER_2_2, "TORCH_VERSION_AFTER_2_2"), - ] - self.assertEqual(len(_warnings), 0) - - # Accessing the boolean value should trigger deprecation warning - with warnings.catch_warnings(record=True) as _warnings: - for api, name in deprecated_api_to_name: - num_warnings_before = len(_warnings) - if api: - pass - regex = f"{name} is deprecated and will be removed" - self.assertEqual(len(_warnings), num_warnings_before + 1) - self.assertIn(regex, str(_warnings[-1].message)) - class TestTorchAOBaseTensor(unittest.TestCase): def test_print_arg_types(self): diff --git a/torchao/utils.py b/torchao/utils.py index 26191e2482..e123dfe891 100644 --- a/torchao/utils.py +++ b/torchao/utils.py @@ -35,17 +35,6 @@ "is_sm_at_least_100", "is_package_at_least", "DummyModule", - # Deprecated - "TORCH_VERSION_AT_LEAST_2_2", - "TORCH_VERSION_AT_LEAST_2_3", - "TORCH_VERSION_AT_LEAST_2_4", - "TORCH_VERSION_AT_LEAST_2_5", - "TORCH_VERSION_AT_LEAST_2_6", - "TORCH_VERSION_AT_LEAST_2_7", - "TORCH_VERSION_AFTER_2_2", - "TORCH_VERSION_AFTER_2_3", - "TORCH_VERSION_AFTER_2_4", - "TORCH_VERSION_AFTER_2_5", ] @@ -379,61 +368,6 @@ def torch_version_at_least(min_version): return parse_version(torch.__version__) >= parse_version(min_version) -def _deprecated_torch_version_at_least(version_str: str) -> str: - """ - Wrapper for existing TORCH_VERSION_AT_LEAST* variables that will log - a deprecation warning if the variable is used. 
- """ - version_str_var_name = "_".join(version_str.split(".")[:2]) - deprecation_msg = f"TORCH_VERSION_AT_LEAST_{version_str_var_name} is deprecated and will be removed in torchao 0.14.0" - return _BoolDeprecationWrapper( - torch_version_at_least(version_str), - deprecation_msg, - ) - - -def _deprecated_torch_version_after(version_str: str) -> str: - """ - Wrapper for existing TORCH_VERSION_AFTER* variables that will log - a deprecation warning if the variable is used. - """ - bool_value = is_fbcode() or version("torch") >= version_str - version_str_var_name = "_".join(version_str.split(".")[:2]) - deprecation_msg = f"TORCH_VERSION_AFTER_{version_str_var_name} is deprecated and will be removed in torchao 0.14.0" - return _BoolDeprecationWrapper(bool_value, deprecation_msg) - - -class _BoolDeprecationWrapper: - """ - A deprecation wrapper that logs a warning when the given bool value is accessed. - """ - - def __init__(self, bool_value: bool, msg: str): - self.bool_value = bool_value - self.msg = msg - - def __bool__(self): - warnings.warn(self.msg) - return self.bool_value - - def __eq__(self, other): - return bool(self) == bool(other) - - -# Deprecated, use `torch_version_at_least` directly instead -TORCH_VERSION_AT_LEAST_2_8 = _deprecated_torch_version_at_least("2.8.0") -TORCH_VERSION_AT_LEAST_2_7 = _deprecated_torch_version_at_least("2.7.0") -TORCH_VERSION_AT_LEAST_2_6 = _deprecated_torch_version_at_least("2.6.0") -TORCH_VERSION_AT_LEAST_2_5 = _deprecated_torch_version_at_least("2.5.0") -TORCH_VERSION_AT_LEAST_2_4 = _deprecated_torch_version_at_least("2.4.0") -TORCH_VERSION_AT_LEAST_2_3 = _deprecated_torch_version_at_least("2.3.0") -TORCH_VERSION_AT_LEAST_2_2 = _deprecated_torch_version_at_least("2.2.0") -TORCH_VERSION_AFTER_2_5 = _deprecated_torch_version_after("2.5.0.dev") -TORCH_VERSION_AFTER_2_4 = _deprecated_torch_version_after("2.4.0.dev") -TORCH_VERSION_AFTER_2_3 = _deprecated_torch_version_after("2.3.0.dev") -TORCH_VERSION_AFTER_2_2 = _deprecated_torch_version_after("2.2.0.dev") - - class _ConfigDeprecationWrapper: """ A deprecation wrapper that directs users from a deprecated "config function" From 36e8d0b7dc906b2799a20b4ed7d64d3f0f0bd95c Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Mon, 10 Nov 2025 15:34:28 -0800 Subject: [PATCH 10/22] Add per tensor fp8 conv2d support (#3315) Summary: Add fp8 conv2d support, using the same conv3d kernels, by setting the D dimension to 1. 1. unsqueeze both input and weight in dim 2 ( the D dimension) 2. call fp8 conv3d op from fbgemm `torch.ops.fbgemm.f8f8bf16_conv` 3. 
assert D dimension shape to be 1 and call sequeeze at dim 2: res.squeeze(2) to remove the D dimension Test Plan: python test/quantization/quantize_/workflows/float8/test_float8_tensor.py -k test_unsqueeze_conv2d_weight python test/quantization/quantize_/workflows/float8/test_float8_tensor.py -k test_fp8_conv_variants --- .../workflows/float8/test_float8_tensor.py | 153 ++++++++++++++---- torchao/quantization/quant_api.py | 14 +- .../workflows/float8/float8_tensor.py | 97 ++++++++++- 3 files changed, 218 insertions(+), 46 deletions(-) diff --git a/test/quantization/quantize_/workflows/float8/test_float8_tensor.py b/test/quantization/quantize_/workflows/float8/test_float8_tensor.py index 4bc106a60f..df11b71e66 100644 --- a/test/quantization/quantize_/workflows/float8/test_float8_tensor.py +++ b/test/quantization/quantize_/workflows/float8/test_float8_tensor.py @@ -87,6 +87,8 @@ def __init__( ) if dim == 3: self.conv = self.conv.to(memory_format=torch.channels_last_3d) + elif dim == 2: + self.conv = self.conv.to(memory_format=torch.channels_last) def forward(self, x): return self.conv(x) @@ -337,12 +339,14 @@ def _test_fp8_matmul_model( @common_utils.parametrize("dtype", [torch.bfloat16, torch.float32]) @common_utils.parametrize("compile", [True, False]) @common_utils.parametrize("inference_mode", [True, False]) - # only test for 3D conv for now - # Inputs are (N, C_in, C_out, D, H, W) + # test for 2D/3D conv + # Inputs are (N, C_in, C_out, (D, H, W) or + # (N, C_in, C_out, (H, W) @common_utils.parametrize( "sizes", [ - (4, 16, 64, 32, 32, 32), + (4, 16, 64, (32, 32, 32)), + (4, 16, 64, (32, 32)), ], ) def test_fp8_conv_variants( @@ -350,20 +354,28 @@ def test_fp8_conv_variants( dtype: torch.dtype, compile: bool, inference_mode: bool, - kernel_preference: KernelPreference, sizes: Tuple, ): + torch.compiler.reset() granularity = PerTensor() kernel_preference = KernelPreference.AUTO - N, C_in, C_out, D, H, W = sizes - dim = 3 + + N, C_in, C_out, spatial_dims = sizes + dim = len(spatial_dims) + convs = {1: torch.nn.Conv1d, 2: torch.nn.Conv2d, 3: torch.nn.Conv3d} + assert dim in convs, f"Unsupported dim: {dim}" + conv_class = convs[dim] + kernel_size = 3 # Note: this is channel last memory format - input_tensor = torch.randn(N, C_in, D, H, W, dtype=dtype, device="cuda") - input_tensor = input_tensor.to(memory_format=torch.channels_last_3d) + input_tensor = torch.randn(N, C_in, *spatial_dims, dtype=dtype, device="cuda") + if dim == 3: + input_tensor = input_tensor.to(memory_format=torch.channels_last_3d) + else: + assert dim == 2 + input_tensor = input_tensor.to(memory_format=torch.channels_last) - # Create a linear layer with bfloat16 dtype model = ToyConvModel( dim, C_in, @@ -382,9 +394,9 @@ def test_fp8_conv_variants( kernel_preference=kernel_preference, ) - _is_conv3d = lambda m, fqn: isinstance(m, torch.nn.Conv3d) + _is_conv = lambda m, fqn: isinstance(m, conv_class) - quantize_(quantized_model, config, filter_fn=_is_conv3d) + quantize_(quantized_model, config, filter_fn=_is_conv) if compile: quantized_model = torch.compile(quantized_model, fullgraph=True) @@ -408,13 +420,16 @@ def test_fp8_conv_variants( "Requires fbgemm_gpu_genai to be installed", ) @common_utils.parametrize("dtype", [torch.bfloat16, torch.float32]) - # only test for 3D conv for now - # Inputs are (N, C_in, C_out, D, H, W) + # test for 2D/3D conv + # Inputs are (N, C_in, C_out, (D, H, W) or + # (N, C_in, C_out, (H, W) @common_utils.parametrize( "sizes", [ - (4, 12, 64, 32, 32, 32), - (4, 16, 12, 32, 32, 32), + (4, 12, 64, 
(32, 32, 32)), + (4, 16, 12, (32, 32, 32)), + (4, 12, 64, (32, 32)), + (4, 16, 12, (32, 32)), ], ) def test_fp8_conv_skip_quant( @@ -427,14 +442,23 @@ def test_fp8_conv_skip_quant( """ granularity = PerTensor() kernel_preference = KernelPreference.AUTO - N, C_in, C_out, D, H, W = sizes - dim = 3 + + N, C_in, C_out, spatial_dims = sizes + + dim = len(spatial_dims) + convs = {1: torch.nn.Conv1d, 2: torch.nn.Conv2d, 3: torch.nn.Conv3d} + assert dim in convs, f"Unsupported dim: {dim}" + conv_class = convs[dim] + kernel_size = 3 # Note: this is channel last memory format - input_tensor = torch.randn(N, C_in, D, H, W, dtype=dtype, device="cuda") - input_tensor = input_tensor.to(memory_format=torch.channels_last_3d) - # Create a linear layer with bfloat16 dtype + input_tensor = torch.randn(N, C_in, *spatial_dims, dtype=dtype, device="cuda") + if dim == 3: + input_tensor = input_tensor.to(memory_format=torch.channels_last_3d) + else: + input_tensor = input_tensor.to(memory_format=torch.channels_last) + model = ToyConvModel( dim, C_in, @@ -453,9 +477,9 @@ def test_fp8_conv_skip_quant( kernel_preference=kernel_preference, ) - _is_conv3d = lambda m, fqn: isinstance(m, torch.nn.Conv3d) + _is_conv = lambda m, fqn: isinstance(m, conv_class) - quantize_(quantized_model, config, filter_fn=_is_conv3d) + quantize_(quantized_model, config, filter_fn=_is_conv) assert not isinstance(quantized_model.conv.weight, Float8Tensor) output_original = model(input_tensor) @@ -832,7 +856,6 @@ def test_index_select(self): ], ) def test_unsqueeze_operation(self, granularity, sizes): - """Test aten.unsqueeze.default operation on Float8Tensor""" config = Float8DynamicActivationFloat8WeightConfig(granularity=granularity) dtype = torch.bfloat16 device = "cuda" @@ -845,7 +868,7 @@ def test_unsqueeze_operation(self, granularity, sizes): original_weight = linear.weight original_shape = original_weight.shape - # Test unsqueeze operation at dim=0 (only supported dimension) + # Test unsqueeze operation at dim=0 unsqueezed_weight = original_weight.unsqueeze(0) # Verify the unsqueezed tensor has correct shape @@ -887,22 +910,84 @@ def test_unsqueeze_operation(self, granularity, sizes): self.assertEqual(unsqueezed_dequant, expected_dequant) - @common_utils.parametrize("granularity", [PerTensor(), PerRow()]) - def test_unsqueeze_error_cases(self, granularity): - """Test error cases for aten.unsqueeze.default operation""" + def test_unsqueeze_conv2d_weight(self): + granularity = PerTensor() config = Float8DynamicActivationFloat8WeightConfig(granularity=granularity) dtype = torch.bfloat16 device = "cuda" + N, C_in, C_out, spatial_dims = 4, 16, 64, (32, 32) + dim = len(spatial_dims) + kernel_size = 3 - # Create a linear layer and quantize it - linear = torch.nn.Linear(128, 256, bias=False, dtype=dtype, device=device) - quantize_(linear, config) + input_tensor = torch.randn(N, C_in, *spatial_dims, dtype=dtype, device=device) + input_tensor = input_tensor.to(memory_format=torch.channels_last) + model = ToyConvModel( + dim, + C_in, + C_out, + kernel_size, + bias=False, + padding=0, + dtype=dtype, + device=device, + ).eval() + + quantized_model = copy.deepcopy(model) + + config = Float8DynamicActivationFloat8WeightConfig( + granularity=granularity, + ) + + _is_conv = lambda m, fqn: isinstance(m, torch.nn.Conv2d) - weight = linear.weight + quantize_(quantized_model, config, filter_fn=_is_conv) - # Test that unsqueezing on unsupported dimensions raises an error - with self.assertRaisesRegex(AssertionError, "Only dim == 0 is supported"): - 
weight.unsqueeze(1) # dim=1 should not be supported + original_weight = quantized_model.conv.weight + original_shape = original_weight.shape + + # Test unsqueeze operation at dim=2 + unsqueezed_weight = original_weight.unsqueeze(2) + + # Verify the unsqueezed tensor has correct shape + original_shape_list = list(original_shape) + expected_shape = original_shape_list[:2] + [1] + original_shape_list[2:] + scale_shape_list = list(original_weight.scale.shape) + expected_scale_shape = scale_shape_list[:2] + [1] + scale_shape_list[2:] + + self.assertEqual(unsqueezed_weight.shape, torch.Size(expected_shape)) + # Verify qdata and scale shapes + expected_qdata_shape = expected_shape + + self.assertEqual( + unsqueezed_weight.qdata.shape, torch.Size(expected_qdata_shape) + ) + self.assertEqual( + unsqueezed_weight.scale.shape, torch.Size(expected_scale_shape) + ) + + # Verify block_size is correctly updated + expected_block_size = [] + for i in range(len(expected_shape)): + expected_block_size.append(expected_shape[i] // expected_scale_shape[i]) + + self.assertEqual(unsqueezed_weight.block_size, expected_block_size) + + # Test that metadata is preserved + self.assertEqual(unsqueezed_weight.mm_config, original_weight.mm_config) + self.assertEqual( + unsqueezed_weight.act_quant_kwargs, original_weight.act_quant_kwargs + ) + self.assertEqual( + unsqueezed_weight.kernel_preference, original_weight.kernel_preference + ) + self.assertEqual(unsqueezed_weight.dtype, original_weight.dtype) + + # Test numerical correctness + original_dequant = original_weight.dequantize() + unsqueezed_dequant = unsqueezed_weight.dequantize() + expected_dequant = original_dequant.unsqueeze(2) + + self.assertEqual(unsqueezed_dequant, expected_dequant) @common_utils.parametrize("granularity", [PerTensor(), PerRow()]) @common_utils.parametrize("slice_dim", [0, 1, 2]) diff --git a/torchao/quantization/quant_api.py b/torchao/quantization/quant_api.py index e3a75bbb3e..09c2edcd9f 100644 --- a/torchao/quantization/quant_api.py +++ b/torchao/quantization/quant_api.py @@ -1816,13 +1816,19 @@ def _float8_dynamic_activation_float8_weight_quantize_tensor(weight, config): _check_hardware_support(granularity) activation_granularity, weight_granularity = granularity - if weight.dim() == 5: - # weights for conv3d + # Note: right now we assume it's weights of conv2d and conv3d purely based + # on the dimension of weight, currently there is no conflict with linear 2d + # and moe weights 3d + # if we need to support conv1d, which also has 3d weight, we may have to + # pass around the module as well to distinguish between conv1d and 3d moe weight + if weight.dim() in [4, 5]: + # weights for conv2d or 3d assert isinstance(activation_granularity, PerTensor) and isinstance( weight_granularity, PerTensor - ), "5D tensor only supports per tensor activation and weight quantization" + ), "4D/5D tensor only supports per tensor activation and weight quantization" - # weight dim: (C_out, C_in, K1, K2, K3) + # conv3d weight dim: (C_out, C_in, K1, K2, K3) + # conv2d weight dim: (C_out, C_in, K1, K2) # skip quantization when either C_out or C_in # is not a multiple of 16 if weight.shape[0] % 16 != 0 or weight.shape[1] % 16 != 0: diff --git a/torchao/quantization/quantize_/workflows/float8/float8_tensor.py b/torchao/quantization/quantize_/workflows/float8/float8_tensor.py index abb9ddc1f9..733d7a17a5 100644 --- a/torchao/quantization/quantize_/workflows/float8/float8_tensor.py +++ b/torchao/quantization/quantize_/workflows/float8/float8_tensor.py @@ -539,6 
+539,7 @@ def _quantize_and_scaled_conv3d( # move C_in to last dim # after permute: (C_out, K1, K2, K3, C_in) + weight_qdata = weight_tensor.qdata.permute([0, 2, 3, 4, 1]) assert act_qdata.is_contiguous() and weight_qdata.is_contiguous(), ( @@ -574,10 +575,71 @@ def _(func, types, args, kwargs): groups, ) = args assert not transposed, "transposed conv is not supported currently" - assert tuple(output_padding) == (0, 0, 0), ( - f"Only (0, 0, 0) is supported for `output_padding`, got: f{output_padding}" - ) + dim = len(output_padding) + assert dim in [2, 3], "Only 2d or 3d convs are supported" assert groups == 1, f"Only 1 is supported for `groups`, got: {groups}" + + if dim == 2: + assert input_tensor.is_contiguous( + memory_format=torch.channels_last + ) and weight_tensor.qdata.is_contiguous(memory_format=torch.channels_last), ( + "Please make sure both activation and weights are in the `channels_last` memory_format" + ) + # (N, C, H, W) --> (N, C, 1, H, W) + input_tensor = input_tensor.unsqueeze(2) + weight_tensor = weight_tensor.unsqueeze(2) + assert tuple(output_padding) == (0, 0), ( + f"Only (0, 0) is supported for `output_padding`, got: f{output_padding}" + ) + padding = [0, *padding] + stride = [1, *stride] + dilation = [1, *dilation] + res = _quantize_and_scaled_conv3d( + input_tensor, + weight_tensor, + bias, + stride, + padding, + dilation, + ) + assert res.shape[2] == 1 + res = res.squeeze(2) + return res + else: + assert input_tensor.is_contiguous( + memory_format=torch.channels_last_3d + ) and weight_tensor.qdata.is_contiguous(memory_format=torch.channels_last_3d), ( + "Please make sure both activation and weights are in the `channels_last_3d` memory_format" + ) + assert tuple(output_padding) == (0, 0, 0), ( + f"Only (0, 0, 0) is supported for `output_padding`, got: f{output_padding}" + ) + return _quantize_and_scaled_conv3d( + input_tensor, + weight_tensor, + bias, + stride, + padding, + dilation, + ) + + +@implements(aten.conv3d.default) +def _(func, types, args, kwargs): + ( + input_tensor, + weight_tensor, + bias, + stride, + padding, + dilation, + groups, + ) = fill_defaults(args, 7, [None, [1, 1, 1], [0, 0, 0], [1, 1, 1], 1]) + assert input_tensor.is_contiguous( + memory_format=torch.channels_last_3d + ) and weight_tensor.qdata.is_contiguous(memory_format=torch.channels_last_3d), ( + "Please make sure both activation and weights are in the `channels_last_3d` memory_format" + ) return _quantize_and_scaled_conv3d( input_tensor, weight_tensor, @@ -588,7 +650,7 @@ def _(func, types, args, kwargs): ) -@implements(aten.conv3d.default) +@implements(aten.conv2d.default) def _(func, types, args, kwargs): ( input_tensor, @@ -598,9 +660,26 @@ def _(func, types, args, kwargs): padding, dilation, groups, - ) = fill_defaults(args, 7, [None, [1, 1, 1], [0, 0, 0], [1, 1, 1], 1]) - assert groups == 1, f"Only 1 is supported for `groups`, got: {groups}" - return _quantize_and_scaled_conv3d( + ) = fill_defaults(args, 7, [None, [1, 1], [0, 0], [1, 1], 1]) + # (N, C, H, W) --> (N, C, 1, H, W) + # memory_format of both tensors should be torch.channels_last + # and it should be preserved with unsqueeze(2) (becoming torch.channels_last_3d) + assert input_tensor.is_contiguous( + memory_format=torch.channels_last + ) and weight_tensor.qdata.is_contiguous(memory_format=torch.channels_last), ( + "Please make sure both activation and weights are in the `channels_last` memory_format" + ) + input_tensor = input_tensor.unsqueeze(2) + weight_tensor = weight_tensor.unsqueeze(2) + + assert 
input_tensor.is_contiguous( + memory_format=torch.channels_last_3d + ) and weight_tensor.qdata.is_contiguous(memory_format=torch.channels_last_3d) + + padding = [0, *padding] + stride = [1, *stride] + dilation = [1, *dilation] + res = _quantize_and_scaled_conv3d( input_tensor, weight_tensor, bias, @@ -608,6 +687,9 @@ def _(func, types, args, kwargs): padding, dilation, ) + assert res.shape[2] == 1 + res = res.squeeze(2) + return res @implements(aten.slice.Tensor) @@ -839,7 +921,6 @@ def _(func, types, args, kwargs): @implements(aten.unsqueeze.default) def _(func, types, args, kwargs): self, dim = args - assert dim == 0, f"Only dim == 0 is supported, got: {dim}" qdata = self.qdata.unsqueeze(dim=dim) scale = self.scale.unsqueeze(dim=dim) block_size = [] From bab6ce5ed17b8e18126a473f0d2fe7e90faea788 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Mon, 10 Nov 2025 16:16:16 -0800 Subject: [PATCH 11/22] Pin pytest==8.4.2 (#3321) Signed-off-by: Huy Do --- dev-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 600d5001cf..ef00257bb7 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,5 +1,5 @@ # Test utilities -pytest +pytest==8.4.2 unittest-xml-reporting parameterized packaging From 8bce9b1af0b4fae84314525a89c03f96a64d589e Mon Sep 17 00:00:00 2001 From: namgyu-youn Date: Wed, 12 Nov 2025 03:01:57 +0900 Subject: [PATCH 12/22] Update common used toy linear model (#3275) * build common used toy linear model Co-authored-by: Jerry Zhang * update model to use direct input * revert unit test skip --- test/sparsity/test_fast_sparse_training.py | 18 +----- torchao/testing/model_architectures.py | 68 ++++++++++++++++++++-- 2 files changed, 66 insertions(+), 20 deletions(-) diff --git a/test/sparsity/test_fast_sparse_training.py b/test/sparsity/test_fast_sparse_training.py index 424306f897..a9f57bb5a5 100644 --- a/test/sparsity/test_fast_sparse_training.py +++ b/test/sparsity/test_fast_sparse_training.py @@ -15,22 +15,10 @@ swap_linear_with_semi_sparse_linear, swap_semi_sparse_linear_with_linear, ) +from torchao.testing.model_architectures import ToyTwoLinearModel from torchao.utils import is_fbcode -class ToyModel(nn.Module): - def __init__(self): - super().__init__() - self.linear1 = nn.Linear(128, 256, bias=False) - self.linear2 = nn.Linear(256, 128, bias=False) - - def forward(self, x): - x = self.linear1(x) - x = torch.nn.functional.relu(x) - x = self.linear2(x) - return x - - class TestRuntimeSemiStructuredSparsity(TestCase): @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") @unittest.skipIf(is_fbcode(), "broken in fbcode") @@ -41,7 +29,7 @@ def test_runtime_weight_sparsification(self): input = torch.rand((128, 128)).half().cuda() grad = torch.rand((128, 128)).half().cuda() - model = ToyModel().half().cuda() + model = ToyTwoLinearModel(128, 256, 128, device="cuda", dtype=torch.float16) model_c = copy.deepcopy(model) for name, mod in model.named_modules(): @@ -89,7 +77,7 @@ def test_runtime_weight_sparsification_compile(self): input = torch.rand((128, 128)).half().cuda() grad = torch.rand((128, 128)).half().cuda() - model = ToyModel().half().cuda() + model = ToyTwoLinearModel(128, 256, 128, device="cuda", dtype=torch.float16) model_c = copy.deepcopy(model) for name, mod in model.named_modules(): diff --git a/torchao/testing/model_architectures.py b/torchao/testing/model_architectures.py index 8f41a8464c..4100a3cd76 100644 --- a/torchao/testing/model_architectures.py +++ 
b/torchao/testing/model_architectures.py @@ -11,14 +11,72 @@ import torch.nn.functional as F +class ToySingleLinearModel(torch.nn.Module): + def __init__( + self, + input_dim, + output_dim, + dtype, + device, + has_bias=False, + ): + super().__init__() + self.dtype = dtype + self.device = device + self.linear1 = torch.nn.Linear( + input_dim, output_dim, bias=has_bias, dtype=dtype, device=device + ) + + def example_inputs(self, batch_size=1): + return ( + torch.randn( + batch_size, + self.linear1.in_features, + dtype=self.dtype, + device=self.device, + ), + ) + + def forward(self, x): + x = self.linear1(x) + return x + + # TODO: Refactor torchao and tests to use these models -class ToyLinearModel(torch.nn.Module): - def __init__(self, k=64, n=32, dtype=torch.bfloat16): +class ToyTwoLinearModel(torch.nn.Module): + def __init__( + self, + input_dim, + hidden_dim, + output_dim, + dtype, + device, + has_bias=False, + ): super().__init__() - self.linear1 = torch.nn.Linear(k, n, bias=False).to(dtype) + self.dtype = dtype + self.device = device + self.linear1 = torch.nn.Linear( + input_dim, hidden_dim, bias=has_bias, dtype=dtype, device=device + ) + self.linear2 = torch.nn.Linear( + hidden_dim, output_dim, bias=has_bias, dtype=dtype, device=device + ) + + # Note: Tiny-GEMM kernel only uses BF16 inputs + def example_inputs(self, batch_size=1): + return ( + torch.randn( + batch_size, + self.linear1.in_features, + dtype=self.dtype, + device=self.device, + ), + ) def forward(self, x): x = self.linear1(x) + x = self.linear2(x) return x @@ -179,8 +237,8 @@ def create_model_and_input_data( m, k, n (int): dimensions of the model and input data """ if model_type == "linear": - model = ToyLinearModel(k, n, high_precision_dtype).to(device) - input_data = torch.randn(m, k, device=device, dtype=high_precision_dtype) + model = ToySingleLinearModel(k, n, device=device, dtype=high_precision_dtype) + input_data = model.example_inputs(batch_size=m)[0] elif "ln_linear" in model_type: # Extract activation type from model_type string match = re.search(r"ln_linear_?(\w+)?", model_type) From 4a102c25ecb954a62383a9f7b193becc29553991 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Tue, 11 Nov 2025 18:58:51 -0500 Subject: [PATCH 13/22] Use conda libgcc-ng 11.2 (#3327) * Remove devtoolset install * Update regression_test.yml * Update regression_test.yml --- .github/workflows/regression_test.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/regression_test.yml b/.github/workflows/regression_test.yml index 46928b30cf..149a7b07da 100644 --- a/.github/workflows/regression_test.yml +++ b/.github/workflows/regression_test.yml @@ -117,11 +117,8 @@ jobs: gpu-arch-version: ${{ matrix.gpu-arch-version }} submodules: recursive script: | - conda create -n venv python=3.10 -y + conda create -n venv python=3.10 libgcc-ng=11.2.0 libstdcxx-ng=11.2.0 -y conda activate venv - echo "::group::Install newer objcopy that supports --set-section-alignment" - dnf install -y gcc-toolset-10-binutils - export PATH=/opt/rh/gcc-toolset-10/root/usr/bin/:$PATH python -m pip install --upgrade pip pip install ${{ matrix.torch-spec }} sed -i '${{ matrix.dev-requirements-overrides }}' dev-requirements.txt From 5c3e652e1a0fe0483f6b761774cc74608050677b Mon Sep 17 00:00:00 2001 From: Apurva Jain Date: Tue, 11 Nov 2025 18:43:15 -0800 Subject: [PATCH 14/22] Move gemlite layout to prototype/dtypes (#3313) --- torchao/dtypes/affine_quantized_tensor_ops.py | 8 +- torchao/dtypes/uintx/gemlite_layout.py | 461 
+----------------- torchao/prototype/dtypes/__init__.py | 2 + torchao/prototype/dtypes/uintx/__init__.py | 2 + .../prototype/dtypes/uintx/gemlite_layout.py | 452 +++++++++++++++++ torchao/quantization/autoquant.py | 2 +- torchao/quantization/quant_api.py | 2 +- 7 files changed, 480 insertions(+), 449 deletions(-) create mode 100644 torchao/prototype/dtypes/uintx/gemlite_layout.py diff --git a/torchao/dtypes/affine_quantized_tensor_ops.py b/torchao/dtypes/affine_quantized_tensor_ops.py index 21f13729dd..6c7216ab12 100644 --- a/torchao/dtypes/affine_quantized_tensor_ops.py +++ b/torchao/dtypes/affine_quantized_tensor_ops.py @@ -25,10 +25,6 @@ _linear_f16_bf16_act_floatx_weight_check, _linear_f16_bf16_act_floatx_weight_impl, ) -from torchao.dtypes.uintx.gemlite_layout import ( - _linear_fp_act_int4_weight_gemlite_check, - _linear_fp_act_int4_weight_gemlite_impl, -) from torchao.dtypes.uintx.int4_cpu_layout import ( _linear_fp_act_uint4_weight_cpu_check, _linear_fp_act_uint4_weight_cpu_impl, @@ -90,6 +86,10 @@ _linear_int8_act_int4_weight_cpu_check, _linear_int8_act_int4_weight_cpu_impl, ) +from torchao.prototype.dtypes.uintx.gemlite_layout import ( + _linear_fp_act_int4_weight_gemlite_check, + _linear_fp_act_int4_weight_gemlite_impl, +) from torchao.prototype.dtypes.uintx.marlin_qqq_tensor import ( _linear_int8_act_int4_weight_marlin_qqq_check, _linear_int8_act_int4_weight_marlin_qqq_impl, diff --git a/torchao/dtypes/uintx/gemlite_layout.py b/torchao/dtypes/uintx/gemlite_layout.py index 8a8f2309c9..c75c7fe1b1 100644 --- a/torchao/dtypes/uintx/gemlite_layout.py +++ b/torchao/dtypes/uintx/gemlite_layout.py @@ -3,450 +3,25 @@ # # This source code is licensed under the BSD 3-Clause license found in the # LICENSE file in the root directory of this source tree. -from dataclasses import dataclass -from typing import Dict, Optional, Tuple -import torch -from torch.utils._python_dispatch import ( - is_traceable_wrapper_subclass, - return_and_correct_aliasing, -) +# Backward compatibility stub - imports from the new location +import warnings -from torchao.dtypes.affine_quantized_tensor import ( - AffineQuantizedTensor, - register_layout, +warnings.warn( + "Importing from torchao.dtypes.uintx.gemlite_layout is deprecated. " + "Please use 'from torchao.prototype.dtypes import GemlitePackedLayout' instead. " + "This import path will be removed in a future release of torchao. 
" + "See https://github.com/pytorch/ao/issues/2752 for more details.", + DeprecationWarning, + stacklevel=2, ) -from torchao.dtypes.uintx.tensor_core_tiled_layout import TensorCoreTiledAQTTensorImpl -from torchao.dtypes.utils import Layout -from torchao.utils import fill_defaults - -try: - import gemlite -except: - gemlite = None - -aten = torch.ops.aten - - -def _same_metadata( - self: "GemliteAQTTensorImpl", - src: "GemliteAQTTensorImpl", -) -> bool: - kwargs_match = len(self.gemlite_kwargs) == len(src.gemlite_kwargs) - for k, v in self.gemlite_kwargs.items(): - if k in [ - "in_features", - "out_features", - "packing_bitwidth", - "elements_per_sample", - ]: - kwargs_match = kwargs_match and (v == src.gemlite_kwargs[k]) - - return ( - isinstance(self, GemliteAQTTensorImpl) - and isinstance(src, GemliteAQTTensorImpl) - and self.shape == src.shape - and self.packed_weight.shape == src.packed_weight.shape - and self.scale.shape == src.scale.shape - and self.zero_point.shape == src.zero_point.shape - and kwargs_match - and type(self._layout) == type(src._layout) - ) - - -def get_gemlite_quant_kwargs(bit_width, group_size, dtype): - from torchao.quantization.quant_primitives import MappingType, ZeroPointDomain - - kwargs = {} - if bit_width != 8: - kwargs["mapping_type"] = MappingType.ASYMMETRIC - kwargs["block_size"] = (1, group_size) - kwargs["target_dtype"] = torch.uint8 - kwargs["eps"] = 1e-6 - kwargs["quant_min"] = 0 - kwargs["quant_max"] = (2**bit_width) - 1 - kwargs["eps"] = 1e-6 - kwargs["zero_point_dtype"] = dtype - kwargs["zero_point_domain"] = ZeroPointDomain.FLOAT - elif bit_width == 8: - kwargs["mapping_type"] = MappingType.SYMMETRIC - kwargs["block_size"] = (1, group_size) - kwargs["target_dtype"] = torch.int8 - kwargs["quant_min"] = -128 - kwargs["quant_max"] = 127 - kwargs["eps"] = 1e-5 - kwargs["zero_point_dtype"] = None - kwargs["zero_point_domain"] = ZeroPointDomain.NONE - return kwargs - - -def get_gemlite_aqt_kwargs( - weight, - group_size=64, - bit_width=4, - packing_bitwidth=None, - mode="weight_only", - use_hqq=True, -): - if gemlite is None: - raise ImportError( - "Unable to import 'gemlite'. Please ensure it is installed correctly. 
You can install it with: pip install gemlite" - ) - - assert bit_width in [ - 4, - 8, - ], f"gemlite only works with bit_width 4,8 but got {bit_width}" - - assert weight.dtype in [torch.float16, torch.bfloat16], ( - f"gemlite only works with dtype torch.float16 or torch.bfloat16 but got {weight.dtype}" - ) - assert group_size in [32, 64, 128, 256, 512, 1024, None] - assert group_size is None or bit_width != 8, ( - "gemlite only works with group_size=None for bit_width=8" - ) - assert packing_bitwidth in [8, 16, 32, None], ( - f"Invalid packing bitwidth, got {packing_bitwidth}" - ) - - assert mode in ["weight_only", "dynamic"], ( - f"Invalid mode: should be either weight_only or dynamic, got {mode}" - ) - - out_features, in_features = weight.shape - group_size = in_features if group_size is None else group_size - - aqt_kwargs = get_gemlite_quant_kwargs(bit_width, group_size, weight.dtype) - aqt_kwargs["_layout"] = GemlitePackedLayout( - group_size=group_size, - bit_width=bit_width, - packing_bitwidth=packing_bitwidth, - mode=mode, - ) - aqt_kwargs["use_hqq"] = use_hqq - return aqt_kwargs - - -@dataclass(frozen=True) -class GemlitePackedLayout(Layout): - group_size: Optional[int] = 128 - bit_width: int = 4 - packing_bitwidth: Optional[int] = None - mode: Optional[str] = "weight_only" - - -@register_layout(GemlitePackedLayout) -class GemliteAQTTensorImpl(TensorCoreTiledAQTTensorImpl): - def __new__( - cls, - packed_weight: torch.Tensor, - scale: torch.Tensor, - zero_point: torch.Tensor, - gemlite_kwargs: Dict, - _layout: Layout, - ): - kwargs = {} - kwargs["device"] = packed_weight.device - kwargs["layout"] = ( - kwargs.get("layout") - if kwargs.get("layout", False) - else packed_weight.layout - ) - kwargs["dtype"] = packed_weight.dtype - kwargs["requires_grad"] = False - shape = packed_weight.shape - return torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs) # type: ignore[attr-defined] - - def __init__( - self, - packed_weight: torch.Tensor, - scale: torch.Tensor, - zero_point: torch.Tensor, - gemlite_kwargs: Dict, - _layout: Layout, - ): - self.packed_weight = packed_weight - self.scale = scale - self.zero_point = zero_point - self.gemlite_kwargs = gemlite_kwargs - self._layout = _layout - - def __tensor_flatten__(self): - return ["packed_weight", "scale", "zero_point"], [ - self._layout, - self.gemlite_kwargs, - ] - - @classmethod - def __tensor_unflatten__( - cls, tensor_data_dict, tensor_attributes, outer_size, outer_stride - ): - packed_weight, scale, zero_point = ( - tensor_data_dict["packed_weight"], - tensor_data_dict["scale"], - tensor_data_dict["zero_point"], - ) - _layout, gemlite_kwargs = tensor_attributes - return cls(packed_weight, scale, zero_point, gemlite_kwargs, _layout) - - @classmethod - def from_plain( - cls, - int_data: torch.Tensor, - scale: torch.Tensor, - zero_point: Optional[torch.Tensor], - _layout: Layout, - ): - assert isinstance(_layout, GemlitePackedLayout), ( - f"GemliteAQTTensorImpl only works with GemliteLinearTriton but got {_layout}" - ) - device = int_data.device - if device.type != "cuda": - int_data = ( - int_data.cuda() - ) # We need int_data on cuda device because of Triton packing - - group_size, bit_width = _layout.group_size, _layout.bit_width - out_features, in_features = int_data.shape - packing_bitwidth = _layout.packing_bitwidth - mode = _layout.mode - - if bit_width == 8 and group_size == in_features: - processor = ( - gemlite.helper.A8W8_int8_dynamic - if mode == "dynamic" - else gemlite.helper.A16W8 - ) - gemlite_linear = 
processor(device=int_data.device).from_weights( - int_data, scales=scale, bias=None - ) - else: - processor = ( - gemlite.helper.A8Wn_dynamic - if mode == "dynamic" - else gemlite.helper.A16Wn - ) - gemlite_linear = processor( - device=int_data.device, packing_bitwidth=packing_bitwidth - ).from_weights( - int_data, scale, zero_point, bit_width, group_size, bias=None - ) - - meta_args = gemlite_linear.get_meta_args() - gemlite_kwargs = { - "in_features": in_features, - "out_features": out_features, - "packing_bitwidth": packing_bitwidth, - "data_contiguous": gemlite_linear.data_contiguous, - "elements_per_sample": gemlite_linear.elements_per_sample, - "W_group_mode": gemlite_linear.W_group_mode, - "meta_args": meta_args, - } - - packed_weight, scale, zero_point = gemlite_linear.get_tensor_args() - packed_weight = packed_weight.to(device) - if zero_point is None: - zero_point = torch.tensor( - [[]], device=packed_weight.device, dtype=torch.int32 - ) - - return cls(packed_weight, scale, zero_point, gemlite_kwargs, _layout) - - def to(self, *args, **kwargs): - kwargs = self._get_to_kwargs(*args, **kwargs) - device = kwargs["device"] - return self.__class__( - self.packed_weight.to(device), - self.scale.to(device), - self.zero_point.to(device), - self.gemlite_kwargs, - self._layout, - ) - def _apply_fn_to_data(self, fn): - return self.__class__( - fn(self.packed_weight), - fn(self.scale), - fn(self.zero_point), - self.gemlite_kwargs, - self._layout, - ) - - def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - device = self.packed_weight.device - int_data = ( - ( - gemlite.bitpack.unpack_over_rows( - self.packed_weight.cuda(), - W_nbits=self._layout.bit_width, - num_output_rows=self.gemlite_kwargs["in_features"], - dtype=torch.uint8, - ) - ) - .to(device) - .t() - ) - - # Preserve col-row major layout - if self.gemlite_kwargs["data_contiguous"]: - int_data = int_data.contiguous() - - # Handle FMA mode: W_q * s + z -> (W_q - z) * s - if self.gemlite_kwargs["W_group_mode"] == 4: - scale_min_val = 1e-8 - scale = self.scale.clone().float() - scale[torch.logical_and(scale >= 0, scale.abs() <= scale_min_val)] = ( - scale_min_val - ) - scale[ - torch.logical_and(scale < 0, scale.abs() <= scale_min_val) - ] = -scale_min_val - zero_point = (-self.zero_point.float() / scale).clamp_(-100, 100) - zero_point = zero_point.to(self.scale.dtype) - else: - zero_point = self.zero_point - - scale = self.scale.t().contiguous() - zero_point = zero_point.t().contiguous() - - return int_data, scale, zero_point - - @classmethod - def __torch_dispatch__(cls, func, types, args, kwargs): - kwargs = {} if kwargs is None else kwargs - - # we don't handle transpose operations and just ignore them. In practice the only - # reason a transpsoe should occur is because the functional linear - # op can decompose into e.g. 
transpose + addmm so since we want - # to use the gemlite matmul kernel, which expects teh weight to be passed in as is, - # we ignore the transpose - if func is aten.detach.default or func is aten.t.default: - return return_and_correct_aliasing( - func, args, kwargs, args[0]._apply_fn_to_data(torch.detach) - ) - - if func is aten.clone.default: - return return_and_correct_aliasing( - func, args, kwargs, args[0]._apply_fn_to_data(torch.clone) - ) - - if func is aten.slice.Tensor: - self, dim, start, end, step = fill_defaults(args, 5, [0, None, None, 1]) - assert step == 1, "Only step == 1 is supported in slicing right now" - - if dim in [0, 1]: - # data in self is transposed, meaning forward() performs x @ W_deq not x @ W_deq.T - dim = 1 - dim - packed_weight = self.packed_weight - scale = self.scale - zero_point = self.zero_point - - gemlite_kwargs = self.gemlite_kwargs.copy() - orig_shape = [ - gemlite_kwargs["in_features"], - gemlite_kwargs["out_features"], - ] - elements_per_sample = gemlite_kwargs["elements_per_sample"] - data_len = orig_shape[dim] - scale_len = scale.shape[dim] - ratio = data_len / scale_len - start_scale = int(start / ratio) - end_scale = int(end / ratio) - - # For packing only the K dimension. This should be flipped for N-dim packing. - div = elements_per_sample if dim == 0 else 1 - packed_weight = aten.slice.Tensor( - packed_weight, dim, start // div, end // div, step - ) - - # Update in_features/out_features - gemlite_kwargs["in_features"] = ( - packed_weight.shape[0] * elements_per_sample - ) - gemlite_kwargs["out_features"] = packed_weight.shape[1] - - scale = aten.slice.Tensor(scale, dim, start_scale, end_scale, step) - if zero_point is not None and zero_point.numel() > 0: - zero_point = aten.slice.Tensor( - zero_point, dim, start_scale, end_scale, step - ) - else: - zero_point = None - - sliced = GemliteAQTTensorImpl( - packed_weight, scale, zero_point, gemlite_kwargs, self._layout - ) - return return_and_correct_aliasing(func, args, kwargs, sliced) - - else: - raise NotImplementedError( - f"GemliteAQTTensorImpl dispatch: attempting to run {func}, with dim={dim}, that is not supported" - ) - - elif func is aten.copy_.default: - self = args[0] - src = args[1] - - # Handle zero_point = None with symmetric quant - if self.zero_point is None: - self.zero_point = torch.tensor( - [[]], device=self.packed_weight.device, dtype=torch.int32 - ) - - if src.zero_point is None: - src.zero_point = torch.tensor( - [[]], device=src.packed_weight.device, dtype=torch.int32 - ) - - if _same_metadata(self, src): - self_tensors = self.__tensor_flatten__()[0] - for tensor_name in self_tensors: - getattr(self, tensor_name).copy_(getattr(src, tensor_name)) - for key in self.gemlite_kwargs: - self.gemlite_kwargs[key] = src.gemlite_kwargs[key] - return - raise ValueError( - f"Not supported args for copy_ due to metadata mismatch: {args[0], args[1]}" - ) - - raise NotImplementedError( - f"GemliteAQTTensorImpl dispatch: attempting to run {func}, this is not supported" - ) - - __torch_function__ = torch._C._disabled_torch_function_impl - - def get_layout(self) -> Layout: - return self._layout - - @property - def block_size(self): - return (1, self._layout.group_size) - - -def _linear_fp_act_int4_weight_gemlite_impl(input_tensor, weight_tensor, bias=None): - if hasattr(weight_tensor, "tensor_impl"): - weight_impl = weight_tensor.tensor_impl - else: - weight_impl = weight_tensor - - return gemlite.core.forward_functional( - x=input_tensor, - bias=bias, - tensor_args=( - 
weight_impl.packed_weight, - weight_impl.scale, - weight_impl.zero_point, - ), - meta_args=weight_impl.gemlite_kwargs["meta_args"], - ) - - -def _linear_fp_act_int4_weight_gemlite_check(input_tensor, weight_tensor, bias): - return ( - # input is native fp16 tensor - not is_traceable_wrapper_subclass(input_tensor) - # and input_tensor.dtype in [torch.float16, torch.bfloat16] - # weight is gemlite layout - and isinstance(weight_tensor, AffineQuantizedTensor) - and isinstance(weight_tensor._layout, GemlitePackedLayout) - ) +from torchao.prototype.dtypes.uintx.gemlite_layout import ( # noqa: F401 + GemliteAQTTensorImpl, # noqa: F401 + GemlitePackedLayout, # noqa: F401 + _linear_fp_act_int4_weight_gemlite_check, # noqa: F401 + _linear_fp_act_int4_weight_gemlite_impl, # noqa: F401 + _same_metadata, # noqa: F401 + get_gemlite_aqt_kwargs, # noqa: F401 + get_gemlite_quant_kwargs, # noqa: F401 +) diff --git a/torchao/prototype/dtypes/__init__.py b/torchao/prototype/dtypes/__init__.py index 294c7d0b15..7ad78dbed6 100644 --- a/torchao/prototype/dtypes/__init__.py +++ b/torchao/prototype/dtypes/__init__.py @@ -7,6 +7,7 @@ from .uintx import ( BlockSparseLayout, CutlassInt4PackedLayout, + GemlitePackedLayout, Int8DynamicActInt4WeightCPULayout, MarlinQQQLayout, MarlinQQQTensor, @@ -20,4 +21,5 @@ "MarlinQQQLayout", "MarlinQQQTensor", "to_marlinqqq_quantized_intx", + "GemlitePackedLayout", ] diff --git a/torchao/prototype/dtypes/uintx/__init__.py b/torchao/prototype/dtypes/uintx/__init__.py index cd333a90e9..56b1eed50a 100644 --- a/torchao/prototype/dtypes/uintx/__init__.py +++ b/torchao/prototype/dtypes/uintx/__init__.py @@ -7,6 +7,7 @@ from .block_sparse_layout import BlockSparseLayout from .cutlass_int4_packed_layout import CutlassInt4PackedLayout from .dyn_int8_act_int4_wei_cpu_layout import Int8DynamicActInt4WeightCPULayout +from .gemlite_layout import GemlitePackedLayout from .marlin_qqq_tensor import ( MarlinQQQLayout, MarlinQQQTensor, @@ -20,4 +21,5 @@ "MarlinQQQLayout", "MarlinQQQTensor", "to_marlinqqq_quantized_intx", + "GemlitePackedLayout", ] diff --git a/torchao/prototype/dtypes/uintx/gemlite_layout.py b/torchao/prototype/dtypes/uintx/gemlite_layout.py new file mode 100644 index 0000000000..8a8f2309c9 --- /dev/null +++ b/torchao/prototype/dtypes/uintx/gemlite_layout.py @@ -0,0 +1,452 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. 
+from dataclasses import dataclass +from typing import Dict, Optional, Tuple + +import torch +from torch.utils._python_dispatch import ( + is_traceable_wrapper_subclass, + return_and_correct_aliasing, +) + +from torchao.dtypes.affine_quantized_tensor import ( + AffineQuantizedTensor, + register_layout, +) +from torchao.dtypes.uintx.tensor_core_tiled_layout import TensorCoreTiledAQTTensorImpl +from torchao.dtypes.utils import Layout +from torchao.utils import fill_defaults + +try: + import gemlite +except: + gemlite = None + +aten = torch.ops.aten + + +def _same_metadata( + self: "GemliteAQTTensorImpl", + src: "GemliteAQTTensorImpl", +) -> bool: + kwargs_match = len(self.gemlite_kwargs) == len(src.gemlite_kwargs) + for k, v in self.gemlite_kwargs.items(): + if k in [ + "in_features", + "out_features", + "packing_bitwidth", + "elements_per_sample", + ]: + kwargs_match = kwargs_match and (v == src.gemlite_kwargs[k]) + + return ( + isinstance(self, GemliteAQTTensorImpl) + and isinstance(src, GemliteAQTTensorImpl) + and self.shape == src.shape + and self.packed_weight.shape == src.packed_weight.shape + and self.scale.shape == src.scale.shape + and self.zero_point.shape == src.zero_point.shape + and kwargs_match + and type(self._layout) == type(src._layout) + ) + + +def get_gemlite_quant_kwargs(bit_width, group_size, dtype): + from torchao.quantization.quant_primitives import MappingType, ZeroPointDomain + + kwargs = {} + if bit_width != 8: + kwargs["mapping_type"] = MappingType.ASYMMETRIC + kwargs["block_size"] = (1, group_size) + kwargs["target_dtype"] = torch.uint8 + kwargs["eps"] = 1e-6 + kwargs["quant_min"] = 0 + kwargs["quant_max"] = (2**bit_width) - 1 + kwargs["eps"] = 1e-6 + kwargs["zero_point_dtype"] = dtype + kwargs["zero_point_domain"] = ZeroPointDomain.FLOAT + elif bit_width == 8: + kwargs["mapping_type"] = MappingType.SYMMETRIC + kwargs["block_size"] = (1, group_size) + kwargs["target_dtype"] = torch.int8 + kwargs["quant_min"] = -128 + kwargs["quant_max"] = 127 + kwargs["eps"] = 1e-5 + kwargs["zero_point_dtype"] = None + kwargs["zero_point_domain"] = ZeroPointDomain.NONE + return kwargs + + +def get_gemlite_aqt_kwargs( + weight, + group_size=64, + bit_width=4, + packing_bitwidth=None, + mode="weight_only", + use_hqq=True, +): + if gemlite is None: + raise ImportError( + "Unable to import 'gemlite'. Please ensure it is installed correctly. 
You can install it with: pip install gemlite" + ) + + assert bit_width in [ + 4, + 8, + ], f"gemlite only works with bit_width 4,8 but got {bit_width}" + + assert weight.dtype in [torch.float16, torch.bfloat16], ( + f"gemlite only works with dtype torch.float16 or torch.bfloat16 but got {weight.dtype}" + ) + assert group_size in [32, 64, 128, 256, 512, 1024, None] + assert group_size is None or bit_width != 8, ( + "gemlite only works with group_size=None for bit_width=8" + ) + assert packing_bitwidth in [8, 16, 32, None], ( + f"Invalid packing bitwidth, got {packing_bitwidth}" + ) + + assert mode in ["weight_only", "dynamic"], ( + f"Invalid mode: should be either weight_only or dynamic, got {mode}" + ) + + out_features, in_features = weight.shape + group_size = in_features if group_size is None else group_size + + aqt_kwargs = get_gemlite_quant_kwargs(bit_width, group_size, weight.dtype) + aqt_kwargs["_layout"] = GemlitePackedLayout( + group_size=group_size, + bit_width=bit_width, + packing_bitwidth=packing_bitwidth, + mode=mode, + ) + aqt_kwargs["use_hqq"] = use_hqq + return aqt_kwargs + + +@dataclass(frozen=True) +class GemlitePackedLayout(Layout): + group_size: Optional[int] = 128 + bit_width: int = 4 + packing_bitwidth: Optional[int] = None + mode: Optional[str] = "weight_only" + + +@register_layout(GemlitePackedLayout) +class GemliteAQTTensorImpl(TensorCoreTiledAQTTensorImpl): + def __new__( + cls, + packed_weight: torch.Tensor, + scale: torch.Tensor, + zero_point: torch.Tensor, + gemlite_kwargs: Dict, + _layout: Layout, + ): + kwargs = {} + kwargs["device"] = packed_weight.device + kwargs["layout"] = ( + kwargs.get("layout") + if kwargs.get("layout", False) + else packed_weight.layout + ) + kwargs["dtype"] = packed_weight.dtype + kwargs["requires_grad"] = False + shape = packed_weight.shape + return torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs) # type: ignore[attr-defined] + + def __init__( + self, + packed_weight: torch.Tensor, + scale: torch.Tensor, + zero_point: torch.Tensor, + gemlite_kwargs: Dict, + _layout: Layout, + ): + self.packed_weight = packed_weight + self.scale = scale + self.zero_point = zero_point + self.gemlite_kwargs = gemlite_kwargs + self._layout = _layout + + def __tensor_flatten__(self): + return ["packed_weight", "scale", "zero_point"], [ + self._layout, + self.gemlite_kwargs, + ] + + @classmethod + def __tensor_unflatten__( + cls, tensor_data_dict, tensor_attributes, outer_size, outer_stride + ): + packed_weight, scale, zero_point = ( + tensor_data_dict["packed_weight"], + tensor_data_dict["scale"], + tensor_data_dict["zero_point"], + ) + _layout, gemlite_kwargs = tensor_attributes + return cls(packed_weight, scale, zero_point, gemlite_kwargs, _layout) + + @classmethod + def from_plain( + cls, + int_data: torch.Tensor, + scale: torch.Tensor, + zero_point: Optional[torch.Tensor], + _layout: Layout, + ): + assert isinstance(_layout, GemlitePackedLayout), ( + f"GemliteAQTTensorImpl only works with GemliteLinearTriton but got {_layout}" + ) + device = int_data.device + if device.type != "cuda": + int_data = ( + int_data.cuda() + ) # We need int_data on cuda device because of Triton packing + + group_size, bit_width = _layout.group_size, _layout.bit_width + out_features, in_features = int_data.shape + packing_bitwidth = _layout.packing_bitwidth + mode = _layout.mode + + if bit_width == 8 and group_size == in_features: + processor = ( + gemlite.helper.A8W8_int8_dynamic + if mode == "dynamic" + else gemlite.helper.A16W8 + ) + gemlite_linear = 
processor(device=int_data.device).from_weights( + int_data, scales=scale, bias=None + ) + else: + processor = ( + gemlite.helper.A8Wn_dynamic + if mode == "dynamic" + else gemlite.helper.A16Wn + ) + gemlite_linear = processor( + device=int_data.device, packing_bitwidth=packing_bitwidth + ).from_weights( + int_data, scale, zero_point, bit_width, group_size, bias=None + ) + + meta_args = gemlite_linear.get_meta_args() + gemlite_kwargs = { + "in_features": in_features, + "out_features": out_features, + "packing_bitwidth": packing_bitwidth, + "data_contiguous": gemlite_linear.data_contiguous, + "elements_per_sample": gemlite_linear.elements_per_sample, + "W_group_mode": gemlite_linear.W_group_mode, + "meta_args": meta_args, + } + + packed_weight, scale, zero_point = gemlite_linear.get_tensor_args() + packed_weight = packed_weight.to(device) + if zero_point is None: + zero_point = torch.tensor( + [[]], device=packed_weight.device, dtype=torch.int32 + ) + + return cls(packed_weight, scale, zero_point, gemlite_kwargs, _layout) + + def to(self, *args, **kwargs): + kwargs = self._get_to_kwargs(*args, **kwargs) + device = kwargs["device"] + return self.__class__( + self.packed_weight.to(device), + self.scale.to(device), + self.zero_point.to(device), + self.gemlite_kwargs, + self._layout, + ) + + def _apply_fn_to_data(self, fn): + return self.__class__( + fn(self.packed_weight), + fn(self.scale), + fn(self.zero_point), + self.gemlite_kwargs, + self._layout, + ) + + def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + device = self.packed_weight.device + int_data = ( + ( + gemlite.bitpack.unpack_over_rows( + self.packed_weight.cuda(), + W_nbits=self._layout.bit_width, + num_output_rows=self.gemlite_kwargs["in_features"], + dtype=torch.uint8, + ) + ) + .to(device) + .t() + ) + + # Preserve col-row major layout + if self.gemlite_kwargs["data_contiguous"]: + int_data = int_data.contiguous() + + # Handle FMA mode: W_q * s + z -> (W_q - z) * s + if self.gemlite_kwargs["W_group_mode"] == 4: + scale_min_val = 1e-8 + scale = self.scale.clone().float() + scale[torch.logical_and(scale >= 0, scale.abs() <= scale_min_val)] = ( + scale_min_val + ) + scale[ + torch.logical_and(scale < 0, scale.abs() <= scale_min_val) + ] = -scale_min_val + zero_point = (-self.zero_point.float() / scale).clamp_(-100, 100) + zero_point = zero_point.to(self.scale.dtype) + else: + zero_point = self.zero_point + + scale = self.scale.t().contiguous() + zero_point = zero_point.t().contiguous() + + return int_data, scale, zero_point + + @classmethod + def __torch_dispatch__(cls, func, types, args, kwargs): + kwargs = {} if kwargs is None else kwargs + + # we don't handle transpose operations and just ignore them. In practice the only + # reason a transpsoe should occur is because the functional linear + # op can decompose into e.g. 
transpose + addmm so since we want + # to use the gemlite matmul kernel, which expects teh weight to be passed in as is, + # we ignore the transpose + if func is aten.detach.default or func is aten.t.default: + return return_and_correct_aliasing( + func, args, kwargs, args[0]._apply_fn_to_data(torch.detach) + ) + + if func is aten.clone.default: + return return_and_correct_aliasing( + func, args, kwargs, args[0]._apply_fn_to_data(torch.clone) + ) + + if func is aten.slice.Tensor: + self, dim, start, end, step = fill_defaults(args, 5, [0, None, None, 1]) + assert step == 1, "Only step == 1 is supported in slicing right now" + + if dim in [0, 1]: + # data in self is transposed, meaning forward() performs x @ W_deq not x @ W_deq.T + dim = 1 - dim + packed_weight = self.packed_weight + scale = self.scale + zero_point = self.zero_point + + gemlite_kwargs = self.gemlite_kwargs.copy() + orig_shape = [ + gemlite_kwargs["in_features"], + gemlite_kwargs["out_features"], + ] + elements_per_sample = gemlite_kwargs["elements_per_sample"] + data_len = orig_shape[dim] + scale_len = scale.shape[dim] + ratio = data_len / scale_len + start_scale = int(start / ratio) + end_scale = int(end / ratio) + + # For packing only the K dimension. This should be flipped for N-dim packing. + div = elements_per_sample if dim == 0 else 1 + packed_weight = aten.slice.Tensor( + packed_weight, dim, start // div, end // div, step + ) + + # Update in_features/out_features + gemlite_kwargs["in_features"] = ( + packed_weight.shape[0] * elements_per_sample + ) + gemlite_kwargs["out_features"] = packed_weight.shape[1] + + scale = aten.slice.Tensor(scale, dim, start_scale, end_scale, step) + if zero_point is not None and zero_point.numel() > 0: + zero_point = aten.slice.Tensor( + zero_point, dim, start_scale, end_scale, step + ) + else: + zero_point = None + + sliced = GemliteAQTTensorImpl( + packed_weight, scale, zero_point, gemlite_kwargs, self._layout + ) + return return_and_correct_aliasing(func, args, kwargs, sliced) + + else: + raise NotImplementedError( + f"GemliteAQTTensorImpl dispatch: attempting to run {func}, with dim={dim}, that is not supported" + ) + + elif func is aten.copy_.default: + self = args[0] + src = args[1] + + # Handle zero_point = None with symmetric quant + if self.zero_point is None: + self.zero_point = torch.tensor( + [[]], device=self.packed_weight.device, dtype=torch.int32 + ) + + if src.zero_point is None: + src.zero_point = torch.tensor( + [[]], device=src.packed_weight.device, dtype=torch.int32 + ) + + if _same_metadata(self, src): + self_tensors = self.__tensor_flatten__()[0] + for tensor_name in self_tensors: + getattr(self, tensor_name).copy_(getattr(src, tensor_name)) + for key in self.gemlite_kwargs: + self.gemlite_kwargs[key] = src.gemlite_kwargs[key] + return + raise ValueError( + f"Not supported args for copy_ due to metadata mismatch: {args[0], args[1]}" + ) + + raise NotImplementedError( + f"GemliteAQTTensorImpl dispatch: attempting to run {func}, this is not supported" + ) + + __torch_function__ = torch._C._disabled_torch_function_impl + + def get_layout(self) -> Layout: + return self._layout + + @property + def block_size(self): + return (1, self._layout.group_size) + + +def _linear_fp_act_int4_weight_gemlite_impl(input_tensor, weight_tensor, bias=None): + if hasattr(weight_tensor, "tensor_impl"): + weight_impl = weight_tensor.tensor_impl + else: + weight_impl = weight_tensor + + return gemlite.core.forward_functional( + x=input_tensor, + bias=bias, + tensor_args=( + 
weight_impl.packed_weight, + weight_impl.scale, + weight_impl.zero_point, + ), + meta_args=weight_impl.gemlite_kwargs["meta_args"], + ) + + +def _linear_fp_act_int4_weight_gemlite_check(input_tensor, weight_tensor, bias): + return ( + # input is native fp16 tensor + not is_traceable_wrapper_subclass(input_tensor) + # and input_tensor.dtype in [torch.float16, torch.bfloat16] + # weight is gemlite layout + and isinstance(weight_tensor, AffineQuantizedTensor) + and isinstance(weight_tensor._layout, GemlitePackedLayout) + ) diff --git a/torchao/quantization/autoquant.py b/torchao/quantization/autoquant.py index c72e18a923..884c96559a 100644 --- a/torchao/quantization/autoquant.py +++ b/torchao/quantization/autoquant.py @@ -724,7 +724,7 @@ class AQGemliteInt4G32WeightOnlyQuantizedLinearWeight( @classmethod def from_float(cls, weight): from torchao.dtypes import to_affine_quantized_intx - from torchao.dtypes.uintx.gemlite_layout import get_gemlite_aqt_kwargs + from torchao.prototype.dtypes.uintx.gemlite_layout import get_gemlite_aqt_kwargs if weight.dtype != torch.float16: weight = weight.to(torch.float16) diff --git a/torchao/quantization/quant_api.py b/torchao/quantization/quant_api.py index 09c2edcd9f..c29382b658 100644 --- a/torchao/quantization/quant_api.py +++ b/torchao/quantization/quant_api.py @@ -1055,7 +1055,7 @@ def _gemlite_uintx_weight_only_transform( weight = module.weight - from torchao.dtypes.uintx.gemlite_layout import get_gemlite_aqt_kwargs + from torchao.prototype.dtypes.uintx.gemlite_layout import get_gemlite_aqt_kwargs use_hqq = True if bit_width == 4 else False new_weight = to_affine_quantized_intx( From 7213f817592ba6187fa446a66708b74211656f07 Mon Sep 17 00:00:00 2001 From: Apurva Jain Date: Tue, 11 Nov 2025 21:30:23 -0800 Subject: [PATCH 15/22] Move uintx_layout to prototype/dtypes (#3316) --- docs/source/api_ref_dtypes.rst | 2 +- test/dtypes/test_uintx.py | 3 +- torchao/dtypes/__init__.py | 2 +- torchao/dtypes/uintx/uintx_layout.py | 260 ++---------------- torchao/prototype/autoround/core.py | 2 +- torchao/prototype/autoround/eval_autoround.py | 4 +- torchao/prototype/dtypes/__init__.py | 8 + torchao/prototype/dtypes/uintx/__init__.py | 10 + .../prototype/dtypes/uintx/uintx_layout.py | 251 +++++++++++++++++ .../codebook/codebook_quantized_tensor.py | 2 +- 10 files changed, 295 insertions(+), 249 deletions(-) create mode 100644 torchao/prototype/dtypes/uintx/uintx_layout.py diff --git a/docs/source/api_ref_dtypes.rst b/docs/source/api_ref_dtypes.rst index 58ad4ee8a4..3997b444b3 100644 --- a/docs/source/api_ref_dtypes.rst +++ b/docs/source/api_ref_dtypes.rst @@ -22,7 +22,6 @@ Layouts and Tensor Subclasses FloatxTensor FloatxTensorCoreLayout MarlinSparseLayout - UintxLayout Int4CPULayout CutlassSemiSparseLayout @@ -53,6 +52,7 @@ Prototype Int8DynamicActInt4WeightCPULayout MarlinQQQTensor MarlinQQQLayout + UintxLayout .. 
_NF4Tensor - add after fixing torchao/dtypes/nf4tensor.py:docstring diff --git a/test/dtypes/test_uintx.py b/test/dtypes/test_uintx.py index 0878dfed4d..3172381a3a 100644 --- a/test/dtypes/test_uintx.py +++ b/test/dtypes/test_uintx.py @@ -9,7 +9,7 @@ import pytest import torch -from torchao.dtypes.uintx.uintx_layout import to_uintx +from torchao.prototype.dtypes.uintx.uintx_layout import to_uintx from torchao.quantization.quant_api import UIntXWeightOnlyConfig, quantize_ from torchao.quantization.quant_primitives import ( MappingType, @@ -183,6 +183,7 @@ def test_uintx_api_deprecation(): ("CutlassInt4PackedLayout", "torchao.dtypes.uintx.cutlass_int4_packed_layout"), ("BlockSparseLayout", "torchao.dtypes.uintx.block_sparse_layout"), ("MarlinQQQLayout", "torchao.dtypes.uintx.marlin_qqq_tensor"), + ("UintxLayout", "torchao.dtypes.uintx.uintx_layout"), ] for api_name, module_path in deprecated_apis: diff --git a/torchao/dtypes/__init__.py b/torchao/dtypes/__init__.py index 4c83de7ddd..43c140908a 100644 --- a/torchao/dtypes/__init__.py +++ b/torchao/dtypes/__init__.py @@ -21,7 +21,6 @@ QDQLayout, SemiSparseLayout, TensorCoreTiledLayout, - UintxLayout, ) from .uintx.block_sparse_layout import BlockSparseLayout from .uintx.cutlass_int4_packed_layout import CutlassInt4PackedLayout @@ -31,6 +30,7 @@ MarlinQQQTensor, to_marlinqqq_quantized_intx, ) +from .uintx.uintx_layout import UintxLayout from .utils import ( Layout, PlainLayout, diff --git a/torchao/dtypes/uintx/uintx_layout.py b/torchao/dtypes/uintx/uintx_layout.py index 3180e9f2c9..dfd93249d6 100644 --- a/torchao/dtypes/uintx/uintx_layout.py +++ b/torchao/dtypes/uintx/uintx_layout.py @@ -3,250 +3,24 @@ # # This source code is licensed under the BSD 3-Clause license found in the # LICENSE file in the root directory of this source tree. -from dataclasses import dataclass -from typing import List, Tuple -import torch -from torch.utils._python_dispatch import return_and_correct_aliasing +# Backward compatibility stub - imports from the new location +import warnings -from torchao.dtypes.affine_quantized_tensor import register_layout -from torchao.dtypes.uintx.plain_layout import PlainAQTTensorImpl -from torchao.dtypes.utils import ( - Layout, +warnings.warn( + "Importing from torchao.dtypes.uintx.uintx_layout is deprecated. " + "Please use 'from torchao.prototype.dtypes import UintxLayout, UintxTensor' instead. " + "This import path will be removed in a future release of torchao. 
" + "See https://github.com/pytorch/ao/issues/2752 for more details.", + DeprecationWarning, + stacklevel=2, ) -from torchao.utils import TorchAOBaseTensor -from .bitpacking import pack, unpack - -aten = torch.ops.aten - -# Note: Uintx does not work for torch 2.3 and below -_DTYPE_TO_BIT_WIDTH = {} -_BIT_WIDTH_TO_DTYPE = {} - -_DTYPE_TO_BIT_WIDTH = { - torch.uint1: 1, - torch.uint2: 2, - torch.uint3: 3, - torch.uint4: 4, - torch.uint5: 5, - torch.uint6: 6, - torch.uint7: 7, -} - -_BIT_WIDTH_TO_DTYPE = {v: k for k, v in _DTYPE_TO_BIT_WIDTH.items()} - - -class UintxTensor(TorchAOBaseTensor): - """ - Splits int data into packed shards based on bit size - fields: - int4_shard (torch.Tensor): 4 bit packed shard - int2_shard (torch.Tensor): 2 bit packed shard - int1_shard (torch.Tensor): 1 bit packed shard - bit_width (int): number of bits for each element - pack_dim: (int) dimension to pack along - """ - - bits_to_shard = { - 1: ["int1_shard"], - 2: ["int2_shard"], - 3: ["int2_shard", "int1_shard"], - 4: ["int4_shard"], - 5: ["int4_shard", "int1_shard"], - 6: ["int4_shard", "int2_shard"], - 7: ["int4_shard", "int2_shard", "int1_shard"], - } - - def __new__( - cls, - shards: List[torch.Tensor], - packed_shape: List[int], - bit_width: int, - pack_dim: int = -1, - ): - kwargs = {"device": shards[0].device} - kwargs["device"] = shards[0].device - kwargs["layout"] = shards[0].layout - kwargs["requires_grad"] = False - kwargs["dtype"] = torch.uint8 - return torch.Tensor._make_wrapper_subclass(cls, packed_shape, **kwargs) - - def __init__( - self, - shards: List[torch.Tensor], - packed_shape: List[int], - bit_width: int, - pack_dim: int = -1, - ): - for i, attrib in enumerate(self.bits_to_shard[bit_width]): - setattr(self, attrib, shards[i]) - - self.packed_shape = packed_shape - self.bit_width = bit_width - self.pack_dim = pack_dim - - def get_shards(self): - return [getattr(self, i) for i in self.__class__.bits_to_shard[self.bit_width]] - - def __repr__(self): - return f"Int{self.bit_width}Tensor(shape = {self.packed_shape}, data = {unpack(self.get_shards(), self.bit_width, dim=self.pack_dim)})" - - def __tensor_flatten__(self): - return self.__class__.bits_to_shard[self.bit_width], [ - self.packed_shape, - self.bit_width, - self.pack_dim, - ] - - @classmethod - def __tensor_unflatten__( - cls, tensor_data_dict, tensor_attributes, outer_size, outer_stride - ): - shards = list(tensor_data_dict.values()) - packed_shape, bit_width, pack_dim = tensor_attributes - return cls(shards, packed_shape, bit_width, pack_dim) - - def get_plain(self): - return unpack(self.get_shards(), self.bit_width, dim=self.pack_dim) - - # temporary until kernels on packed tensors are created - def apply_transformation(self, fn): - og = self.get_plain() - new = fn(og) - dtype = _BIT_WIDTH_TO_DTYPE[self.bit_width] - return self.from_uint8(new, dtype, self.pack_dim) - - # temporary until kernels on packed tensors are created - def apply_fn_to_shards(self, fn): - new_shards = [fn(shard) for shard in self.get_shards()] - return self.__class__( - new_shards, self.packed_shape, self.bit_width, self.pack_dim - ) - - @classmethod - def from_uint8(cls, int_data: torch.Tensor, dtype: torch.dtype, pack_dim: int = -1): - assert dtype in _DTYPE_TO_BIT_WIDTH.keys(), ( - "Expected dtype to be one of {_DTYPE_TO_BIT_WIDTH.keys()}" - ) - bit_width = _DTYPE_TO_BIT_WIDTH[dtype] - shards = pack(int_data, bit_width, dim=pack_dim) - shape = list(int_data.shape) - shape[pack_dim] = shape[pack_dim] * bit_width // 8 - return cls(shards, int_data.shape, 
bit_width, pack_dim) - - def _get_to_kwargs(self, *args, **kwargs): - device, dtype, _, memory_format = torch._C._nn._parse_to(*args, **kwargs) - device = self.device if device is None else device - dtype = self.dtype if dtype is None else dtype - memory_format = ( - memory_format if memory_format is not None else torch.preserve_format - ) - kwargs = { - "device": device, - "dtype": dtype, - "memory_format": memory_format, - } - return kwargs - - def to(self, *args, **kwargs): - if "copy" in kwargs: - return super().to(*args, **kwargs) - kwargs = self._get_to_kwargs(*args, **kwargs) - if "device" in kwargs: - return self.__class__( - list(shard.to(kwargs["device"]) for shard in self.get_shards()), - self.packed_shape, - self.bit_width, - self.pack_dim, - ) - return super().to(*args, **kwargs) - - -implements = UintxTensor.implements - - -@implements(aten.detach.default) -def _(func, types, args, kwargs): - return return_and_correct_aliasing( - func, args, kwargs, args[0].apply_fn_to_shards(torch.detach) - ) - - -@implements(aten.view.default) -def _(func, types, args, kwargs): - return return_and_correct_aliasing( - func, args, kwargs, args[0].apply_transformation(lambda x: x.view(*args[1:])) - ) - - -@implements(aten._to_copy.default) -def _(func, types, args, kwargs): - return return_and_correct_aliasing(func, args, kwargs, args[0]) - - -@implements(aten.sub.Tensor) -def _(func, types, args, kwargs): - return return_and_correct_aliasing( - func, - args, - kwargs, - args[0].apply_transformation(lambda x: (x - args[1]).to(torch.uint8)), - ) - - -@implements(aten.mul.Tensor) -def _(func, types, args, kwargs): - return return_and_correct_aliasing( - func, - args, - kwargs, - args[0].apply_transformation(lambda x: (x * args[1]).to(torch.uint8)), - ) - - -# quantization api integrations -to_uintx = UintxTensor.from_uint8 - - -@dataclass(frozen=True) -class UintxLayout(Layout): - """A layout class for Uintx tensors, which are tensors with elements packed into - smaller bit-widths than the standard 8-bit byte. This layout is used to define - how the data is stored and processed in UintxTensor objects. - - Attributes: - dtype (torch.dtype): The data type of the tensor elements, which determines - the bit-width used for packing. - pack_dim (int): The dimension along which the data is packed. Default is -1, - which indicates the last dimension. 
- """ - - dtype: torch.dtype - pack_dim: int = -1 - - def post_process( - self, - input: torch.Tensor, - scale: torch.Tensor, - zero_point: torch.Tensor, - block_size: Tuple[int, ...], - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - return to_uintx(input, self.dtype, self.pack_dim), scale, zero_point - - -@register_layout(UintxLayout) -class UintxAQTTensorImpl(PlainAQTTensorImpl): - def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - return self.int_data.get_plain(), self.scale, self.zero_point - - @classmethod - def from_plain( - cls, - int_data: torch.Tensor, - scale: torch.Tensor, - zero_point: torch.Tensor, - _layout: Layout, - ): - assert isinstance(_layout, UintxLayout) - return cls(int_data, scale, zero_point, _layout) +from torchao.prototype.dtypes.uintx.uintx_layout import ( # noqa: F401 + _BIT_WIDTH_TO_DTYPE, # noqa: F401 + _DTYPE_TO_BIT_WIDTH, # noqa: F401 + UintxAQTTensorImpl, # noqa: F401 + UintxLayout, # noqa: F401 + UintxTensor, # noqa: F401 + to_uintx, # noqa: F401 +) diff --git a/torchao/prototype/autoround/core.py b/torchao/prototype/autoround/core.py index 859e1cfe02..159fcb3c3d 100644 --- a/torchao/prototype/autoround/core.py +++ b/torchao/prototype/autoround/core.py @@ -189,7 +189,7 @@ def to_uintx_weight(input_float): quant_min = 0 quant_max = _auto_round_config.bits**2 - 1 block_size = (1, observed_linear.group_size) - from torchao.dtypes.uintx.uintx import ( + from torchao.prototype.dtypes.uintx.uintx_layout import ( _BIT_WIDTH_TO_DTYPE, UintxLayout, ) diff --git a/torchao/prototype/autoround/eval_autoround.py b/torchao/prototype/autoround/eval_autoround.py index 4846f919cc..4f6850be88 100644 --- a/torchao/prototype/autoround/eval_autoround.py +++ b/torchao/prototype/autoround/eval_autoround.py @@ -111,7 +111,9 @@ def main(args): ) elif args.uintx: msg += f" (uintx {args.bits} bits)" - from torchao.dtypes.uintx.uintx import _BIT_WIDTH_TO_DTYPE + from torchao.prototype.dtypes.uintx.uintx_layout import ( + _BIT_WIDTH_TO_DTYPE, + ) from torchao.quantization.quant_api import ( UIntXWeightOnlyConfig, quantize_, diff --git a/torchao/prototype/dtypes/__init__.py b/torchao/prototype/dtypes/__init__.py index 7ad78dbed6..88fe73ab76 100644 --- a/torchao/prototype/dtypes/__init__.py +++ b/torchao/prototype/dtypes/__init__.py @@ -11,7 +11,11 @@ Int8DynamicActInt4WeightCPULayout, MarlinQQQLayout, MarlinQQQTensor, + UintxAQTTensorImpl, + UintxLayout, + UintxTensor, to_marlinqqq_quantized_intx, + to_uintx, ) __all__ = [ @@ -22,4 +26,8 @@ "MarlinQQQTensor", "to_marlinqqq_quantized_intx", "GemlitePackedLayout", + "UintxLayout", + "UintxTensor", + "UintxAQTTensorImpl", + "to_uintx", ] diff --git a/torchao/prototype/dtypes/uintx/__init__.py b/torchao/prototype/dtypes/uintx/__init__.py index 56b1eed50a..2b6372d748 100644 --- a/torchao/prototype/dtypes/uintx/__init__.py +++ b/torchao/prototype/dtypes/uintx/__init__.py @@ -13,6 +13,12 @@ MarlinQQQTensor, to_marlinqqq_quantized_intx, ) +from .uintx_layout import ( + UintxAQTTensorImpl, + UintxLayout, + UintxTensor, + to_uintx, +) __all__ = [ "BlockSparseLayout", @@ -22,4 +28,8 @@ "MarlinQQQTensor", "to_marlinqqq_quantized_intx", "GemlitePackedLayout", + "UintxLayout", + "UintxTensor", + "UintxAQTTensorImpl", + "to_uintx", ] diff --git a/torchao/prototype/dtypes/uintx/uintx_layout.py b/torchao/prototype/dtypes/uintx/uintx_layout.py new file mode 100644 index 0000000000..ce9ce836e7 --- /dev/null +++ b/torchao/prototype/dtypes/uintx/uintx_layout.py @@ -0,0 +1,251 @@ +# Copyright (c) Meta Platforms, Inc. 
and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. +from dataclasses import dataclass +from typing import List, Tuple + +import torch +from torch.utils._python_dispatch import return_and_correct_aliasing + +from torchao.dtypes.affine_quantized_tensor import register_layout +from torchao.dtypes.uintx.bitpacking import pack, unpack +from torchao.dtypes.uintx.plain_layout import PlainAQTTensorImpl +from torchao.dtypes.utils import ( + Layout, +) +from torchao.utils import TorchAOBaseTensor + +aten = torch.ops.aten + +# Note: Uintx does not work for torch 2.3 and below +_DTYPE_TO_BIT_WIDTH = {} +_BIT_WIDTH_TO_DTYPE = {} + +_DTYPE_TO_BIT_WIDTH = { + torch.uint1: 1, + torch.uint2: 2, + torch.uint3: 3, + torch.uint4: 4, + torch.uint5: 5, + torch.uint6: 6, + torch.uint7: 7, +} + +_BIT_WIDTH_TO_DTYPE = {v: k for k, v in _DTYPE_TO_BIT_WIDTH.items()} + + +class UintxTensor(TorchAOBaseTensor): + """ + Splits int data into packed shards based on bit size + fields: + int4_shard (torch.Tensor): 4 bit packed shard + int2_shard (torch.Tensor): 2 bit packed shard + int1_shard (torch.Tensor): 1 bit packed shard + bit_width (int): number of bits for each element + pack_dim: (int) dimension to pack along + """ + + bits_to_shard = { + 1: ["int1_shard"], + 2: ["int2_shard"], + 3: ["int2_shard", "int1_shard"], + 4: ["int4_shard"], + 5: ["int4_shard", "int1_shard"], + 6: ["int4_shard", "int2_shard"], + 7: ["int4_shard", "int2_shard", "int1_shard"], + } + + def __new__( + cls, + shards: List[torch.Tensor], + packed_shape: List[int], + bit_width: int, + pack_dim: int = -1, + ): + kwargs = {"device": shards[0].device} + kwargs["device"] = shards[0].device + kwargs["layout"] = shards[0].layout + kwargs["requires_grad"] = False + kwargs["dtype"] = torch.uint8 + return torch.Tensor._make_wrapper_subclass(cls, packed_shape, **kwargs) + + def __init__( + self, + shards: List[torch.Tensor], + packed_shape: List[int], + bit_width: int, + pack_dim: int = -1, + ): + for i, attrib in enumerate(self.bits_to_shard[bit_width]): + setattr(self, attrib, shards[i]) + + self.packed_shape = packed_shape + self.bit_width = bit_width + self.pack_dim = pack_dim + + def get_shards(self): + return [getattr(self, i) for i in self.__class__.bits_to_shard[self.bit_width]] + + def __repr__(self): + return f"Int{self.bit_width}Tensor(shape = {self.packed_shape}, data = {unpack(self.get_shards(), self.bit_width, dim=self.pack_dim)})" + + def __tensor_flatten__(self): + return self.__class__.bits_to_shard[self.bit_width], [ + self.packed_shape, + self.bit_width, + self.pack_dim, + ] + + @classmethod + def __tensor_unflatten__( + cls, tensor_data_dict, tensor_attributes, outer_size, outer_stride + ): + shards = list(tensor_data_dict.values()) + packed_shape, bit_width, pack_dim = tensor_attributes + return cls(shards, packed_shape, bit_width, pack_dim) + + def get_plain(self): + return unpack(self.get_shards(), self.bit_width, dim=self.pack_dim) + + # temporary until kernels on packed tensors are created + def apply_transformation(self, fn): + og = self.get_plain() + new = fn(og) + dtype = _BIT_WIDTH_TO_DTYPE[self.bit_width] + return self.from_uint8(new, dtype, self.pack_dim) + + # temporary until kernels on packed tensors are created + def apply_fn_to_shards(self, fn): + new_shards = [fn(shard) for shard in self.get_shards()] + return self.__class__( + new_shards, self.packed_shape, self.bit_width, self.pack_dim + ) + + 
@classmethod + def from_uint8(cls, int_data: torch.Tensor, dtype: torch.dtype, pack_dim: int = -1): + assert dtype in _DTYPE_TO_BIT_WIDTH.keys(), ( + "Expected dtype to be one of {_DTYPE_TO_BIT_WIDTH.keys()}" + ) + bit_width = _DTYPE_TO_BIT_WIDTH[dtype] + shards = pack(int_data, bit_width, dim=pack_dim) + shape = list(int_data.shape) + shape[pack_dim] = shape[pack_dim] * bit_width // 8 + return cls(shards, int_data.shape, bit_width, pack_dim) + + def _get_to_kwargs(self, *args, **kwargs): + device, dtype, _, memory_format = torch._C._nn._parse_to(*args, **kwargs) + device = self.device if device is None else device + dtype = self.dtype if dtype is None else dtype + memory_format = ( + memory_format if memory_format is not None else torch.preserve_format + ) + kwargs = { + "device": device, + "dtype": dtype, + "memory_format": memory_format, + } + return kwargs + + def to(self, *args, **kwargs): + if "copy" in kwargs: + return super().to(*args, **kwargs) + kwargs = self._get_to_kwargs(*args, **kwargs) + if "device" in kwargs: + return self.__class__( + list(shard.to(kwargs["device"]) for shard in self.get_shards()), + self.packed_shape, + self.bit_width, + self.pack_dim, + ) + return super().to(*args, **kwargs) + + +implements = UintxTensor.implements + + +@implements(aten.detach.default) +def _(func, types, args, kwargs): + return return_and_correct_aliasing( + func, args, kwargs, args[0].apply_fn_to_shards(torch.detach) + ) + + +@implements(aten.view.default) +def _(func, types, args, kwargs): + return return_and_correct_aliasing( + func, args, kwargs, args[0].apply_transformation(lambda x: x.view(*args[1:])) + ) + + +@implements(aten._to_copy.default) +def _(func, types, args, kwargs): + return return_and_correct_aliasing(func, args, kwargs, args[0]) + + +@implements(aten.sub.Tensor) +def _(func, types, args, kwargs): + return return_and_correct_aliasing( + func, + args, + kwargs, + args[0].apply_transformation(lambda x: (x - args[1]).to(torch.uint8)), + ) + + +@implements(aten.mul.Tensor) +def _(func, types, args, kwargs): + return return_and_correct_aliasing( + func, + args, + kwargs, + args[0].apply_transformation(lambda x: (x * args[1]).to(torch.uint8)), + ) + + +# quantization api integrations +to_uintx = UintxTensor.from_uint8 + + +@dataclass(frozen=True) +class UintxLayout(Layout): + """A layout class for Uintx tensors, which are tensors with elements packed into + smaller bit-widths than the standard 8-bit byte. This layout is used to define + how the data is stored and processed in UintxTensor objects. + + Attributes: + dtype (torch.dtype): The data type of the tensor elements, which determines + the bit-width used for packing. + pack_dim (int): The dimension along which the data is packed. Default is -1, + which indicates the last dimension. 
+ """ + + dtype: torch.dtype + pack_dim: int = -1 + + def post_process( + self, + input: torch.Tensor, + scale: torch.Tensor, + zero_point: torch.Tensor, + block_size: Tuple[int, ...], + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + return to_uintx(input, self.dtype, self.pack_dim), scale, zero_point + + +@register_layout(UintxLayout) +class UintxAQTTensorImpl(PlainAQTTensorImpl): + def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + return self.int_data.get_plain(), self.scale, self.zero_point + + @classmethod + def from_plain( + cls, + int_data: torch.Tensor, + scale: torch.Tensor, + zero_point: torch.Tensor, + _layout: Layout, + ): + assert isinstance(_layout, UintxLayout) + return cls(int_data, scale, zero_point, _layout) diff --git a/torchao/prototype/quantization/codebook/codebook_quantized_tensor.py b/torchao/prototype/quantization/codebook/codebook_quantized_tensor.py index e16a339e82..9c3ef1e9b0 100644 --- a/torchao/prototype/quantization/codebook/codebook_quantized_tensor.py +++ b/torchao/prototype/quantization/codebook/codebook_quantized_tensor.py @@ -9,7 +9,7 @@ import torch from torchao.core.config import AOBaseConfig -from torchao.dtypes.uintx.uintx_layout import _DTYPE_TO_BIT_WIDTH, UintxTensor +from torchao.prototype.dtypes.uintx.uintx_layout import _DTYPE_TO_BIT_WIDTH, UintxTensor from torchao.prototype.quantization.codebook.codebook_ops import ( choose_qparams_codebook, dequantize_codebook, From 726607d5f116b07182de90d49e81e5eae3656e81 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Wed, 12 Nov 2025 12:34:11 -0500 Subject: [PATCH 16/22] Add __str__ to FqnToConfig to make printing more readable (#3323) * Adds __str__ to FqnToConfig to make printing more readable Summary: att, adds `__str__` method to `FqnToConfig` so that printing is more legible. For some config: ```python config = FqnToConfig({ "model.layers.fig.1.1": Float8DynamicActivationFloat8WeightConfig( granularity=PerRow(), ), "model.layers.fig.1.3": Float8DynamicActivationFloat8WeightConfig( granularity=PerRow(), ), "model.layers.fig.8.3": Float8DynamicActivationFloat8WeightConfig( granularity=PerRow(), ), }) ``` the output will be: ``` FqnToConfig({ 'model.layers.fig.1.1': Float8DynamicActivationFloat8WeightConfig(activation_dtype=torch.float8_e4m3fn, weight_dtype=torch.float8_e4m3fn, granularity=[PerRow(dim=-1), PerRow(dim=-1)], mm_config=Float8MMConfig(emulate=False, use_fast_accum=True, pad_inner_dim=False), activation_value_lb=None, activation_value_ub=None, kernel_preference=, set_inductor_config=True, version=2), 'model.layers.fig.1.3': Float8DynamicActivationFloat8WeightConfig(activation_dtype=torch.float8_e4m3fn, weight_dtype=torch.float8_e4m3fn, granularity=[PerRow(dim=-1), PerRow(dim=-1)], mm_config=Float8MMConfig(emulate=False, use_fast_accum=True, pad_inner_dim=False), activation_value_lb=None, activation_value_ub=None, kernel_preference=, set_inductor_config=True, version=2), 'model.layers.fig.8.3': Float8DynamicActivationFloat8WeightConfig(activation_dtype=torch.float8_e4m3fn, weight_dtype=torch.float8_e4m3fn, granularity=[PerRow(dim=-1), PerRow(dim=-1)], mm_config=Float8MMConfig(emulate=False, use_fast_accum=True, pad_inner_dim=False), activation_value_lb=None, activation_value_ub=None, kernel_preference=, set_inductor_config=True, version=2), }) ``` also adds in a test so that you cannot specify both fqn_to_config and module_fqn_to_config unless they are both equal. 
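For illustration, here is a minimal sketch of how the new printing and validation are expected to behave. The import path and the example fqn are assumptions for the sketch, not something taken from this diff:

```python
# Sketch only: assumes FqnToConfig and Float8WeightOnlyConfig are importable
# from torchao.quantization.quant_api, as used in the test added below.
from torchao.quantization.quant_api import Float8WeightOnlyConfig, FqnToConfig

# Printing now goes through the new __str__: one "'fqn': config," entry per line.
config = FqnToConfig({"model.layers.fig.1.1": Float8WeightOnlyConfig()})
print(config)

# Specifying both dicts with different contents raises ValueError in __post_init__.
try:
    FqnToConfig(
        fqn_to_config={"test": Float8WeightOnlyConfig()},
        module_fqn_to_config={"test2": Float8WeightOnlyConfig()},
    )
except ValueError as e:
    print(e)
```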
Test Plan: ``` pytest test/quantization/test_quant_api.py -k test_fqn_config_module_config_and_fqn_config_both_specified ``` Reviewers: Subscribers: Tasks: Tags: * fix ruff check --- test/quantization/test_quant_api.py | 7 +++++++ torchao/quantization/quant_api.py | 21 +++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py index 506cec9dea..e530babdb9 100644 --- a/test/quantization/test_quant_api.py +++ b/test/quantization/test_quant_api.py @@ -1178,6 +1178,13 @@ def __init__(self): assert isinstance(m.nested.linear.weight, AffineQuantizedTensor) assert isinstance(m.linear1.weight, AffineQuantizedTensor) + def test_fqn_config_module_config_and_fqn_config_both_specified(self): + with self.assertRaises(ValueError): + FqnToConfig( + fqn_to_config={"test": Float8WeightOnlyConfig()}, + module_fqn_to_config={"test2": Float8WeightOnlyConfig()}, + ) + if __name__ == "__main__": unittest.main() diff --git a/torchao/quantization/quant_api.py b/torchao/quantization/quant_api.py index c29382b658..f8602fa66c 100644 --- a/torchao/quantization/quant_api.py +++ b/torchao/quantization/quant_api.py @@ -2466,6 +2466,15 @@ class FqnToConfig(AOBaseConfig): def __post_init__(self): torch._C._log_api_usage_once("torchao.quantization.FqnToConfig") + if ( + len(self.fqn_to_config) > 0 + and len(self.module_fqn_to_config) > 0 + and self.fqn_to_config != self.module_fqn_to_config + ): + raise ValueError( + "`fqn_to_config` and `module_fqn_to_config` are both specified and are not equal!" + ) + # This code handles BC compatibility with `ModuleFqnToConfig`. It ensures that `self.module_fqn_to_config` and `self.fqn_to_config` share the same object. if len(self.module_fqn_to_config) > 0 and len(self.fqn_to_config) == 0: self.fqn_to_config = self.module_fqn_to_config @@ -2479,6 +2488,18 @@ def __post_init__(self): "Config Deprecation: _default is deprecated and will no longer be supported in a future release. Please see https://github.com/pytorch/ao/issues/3229 for more details." ) + def __str__(self): + return "\n".join( + [ + "FqnToConfig({", + *( + f" '{key}':\n {value}," + for key, value in self.fqn_to_config.items() + ), + "})", + ] + ) + # maintain BC ModuleFqnToConfig = FqnToConfig From 42fc6bdb48292e24c65a40107a6a7eb81131cd9e Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 12 Nov 2025 10:03:09 -0800 Subject: [PATCH 17/22] Add support for e2e benchmark for conv2d/conv3d (#3329) Summary: att, we added this to float8_inference_roofline to reuse code but we haven't enabled the roofline feature. For now we just need the e2e speedup time for single conv2d/conv3d against bf16 to understand the speedup expecatation Also added B200 hardware spec. 
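For reference, the new conv3d path in the benchmark boils down to roughly the sketch below. The batch size, channel counts, spatial sizes, and kernel size are made up for illustration, and actually running it needs a CUDA device plus the fbgemm kernels noted in the test plan:

```python
# Illustrative sketch of the conv3d branch added to float8_inference_roofline.py.
import copy

import torch
import torch.nn as nn

from torchao.quantization.quant_api import (
    Float8DynamicActivationFloat8WeightConfig,
    PerTensor,
    quantize_,
)

K_val, N_val, kernel_size = 64, 128, 3  # in/out channels and kernel size (made up)
m_orig = nn.Sequential(nn.Conv3d(K_val, N_val, kernel_size, bias=False)).to(
    memory_format=torch.channels_last_3d
)
m_orig = m_orig.cuda().bfloat16()
x = torch.randn(4, K_val, 8, 32, 32, dtype=torch.bfloat16, device="cuda").to(
    memory_format=torch.channels_last_3d
)

# Quantize only the Conv3d modules, mirroring the benchmark's filter_fn.
m_fp8_dyn = copy.deepcopy(m_orig)
quantize_(
    m_fp8_dyn,
    Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor()),
    filter_fn=lambda mod, fqn: isinstance(mod, torch.nn.Conv3d),
)
m_fp8_dyn = torch.compile(m_fp8_dyn)
out = m_fp8_dyn(x)  # e2e time of this call is compared against the bf16 baseline
```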
Test Plan: python $SCRIPT_PATH $OUTPUT_FILE \ --recipe_name $RECIPE_NAME \ --shape_gen_name $SHAPE_GEN_NAME \ --M $M --K $K --N $N \ --D $D --H $H --W $W \ --kernel_size $kernel_size \ --op_name conv3d This doesn't run yet because OSS fbgemm can't be installed in the B200 machine Reviewers: Subscribers: Tasks: Tags: Co-authored-by: jerryzh --- .../float8/float8_inference_roofline.py | 226 +++++++++++++----- torchao/testing/training/roofline_utils.py | 16 ++ 2 files changed, 177 insertions(+), 65 deletions(-) diff --git a/benchmarks/float8/float8_inference_roofline.py b/benchmarks/float8/float8_inference_roofline.py index ea28d3236e..f5fa75cfb9 100644 --- a/benchmarks/float8/float8_inference_roofline.py +++ b/benchmarks/float8/float8_inference_roofline.py @@ -50,6 +50,7 @@ from torchao.quantization.quant_api import ( Float8DynamicActivationFloat8WeightConfig, PerRow, + PerTensor, quantize_, ) from torchao.quantization.quantize_.common import KernelPreference @@ -179,6 +180,11 @@ def run( n_limit: Optional[int] = None, save_profile_traces: bool = False, enable_fusion_modeling: bool = False, + op_name: str = "linear", + D: Optional[int] = None, + H: Optional[int] = None, + W: Optional[int] = None, + kernel_size: Optional[int] = None, ): """ Args: @@ -189,7 +195,29 @@ def run( * `n_limit (optional)`: if specified, only runs `n_limit` iterations # `save_profile_traces (optional)`: if True, saves profiling traces # `enable_fusion_modeling`: if True, models activation -> gemm instead of just gemm + # `op_name`: linear, conv2d or conv3d, decides which op to benchmark + # `D`, `H`, `W`: spatial dimensiosn for conv3d / conv2d + # `kernel_size`: kernel_size for conv3d / conv2d """ + _SUPPORTED_OPS = ["linear", "conv2d", "conv3d"] + assert op_name in _SUPPORTED_OPS, ( + f"Unsupported op: {op_name}, supported are: {_SUPPORTED_OPS}" + ) + if op_name == "conv2d": + assert H is not None and W is not None, ( + "Expected D, H, W to be specified for conv2d" + ) + assert kernel_size is not None, ( + "Expected kernel_size to be specified for conv2d" + ) + elif op_name == "conv3d": + assert D is not None and H is not None and W is not None, ( + "Expected D, H, W to be specified for conv3d" + ) + assert kernel_size is not None, ( + "Expected kernel_size to be specified for conv3d" + ) + config_table = [ ["GPU", torch.cuda.get_device_name(0)], ["torch version", torch.__version__], @@ -198,7 +226,10 @@ def run( ["do_benchmarks", do_benchmarks], ["shape_gen_name", shape_gen_name], ["enable_fusion_modeling", enable_fusion_modeling], + ["op_name", op_name], ["MKN", f"{M} {K} {N}"], + ["DHW", f"{D} {H} {W}"], + ["kernel_size", kernel_size], ] print(tabulate(config_table, headers=["Parameter", "Value"], tablefmt="simple")) @@ -207,33 +238,45 @@ def run( M, K, N = sympy.symbols("M K N") - fp8_ovhd_time_sympy = get_inference_float8_mem_sympy( - M, - K, - N, - recipe_name, - # TODO(future): also enable fusion modeling here - ) - bf16_gemm_time_sympy = get_inference_gemm_time_sympy(M, K, N, torch.bfloat16, None) - - if recipe_name and recipe_name.startswith(("nvfp4", "mxfp4")): - fp8_gemm_time_sympy = get_inference_gemm_time_sympy( - M, K, N, torch.float4_e2m1fn_x2, recipe_name + if op_name == "linear": + fp8_ovhd_time_sympy = get_inference_float8_mem_sympy( + M, + K, + N, + recipe_name, + # TODO(future): also enable fusion modeling here ) - else: - gemm_recipe_name = "mxfp8" if recipe_name.startswith("mxfp8") else None - fp8_gemm_time_sympy = get_inference_gemm_time_sympy( - M, K, N, torch.float8_e4m3fn, gemm_recipe_name 
+ bf16_gemm_time_sympy = get_inference_gemm_time_sympy( + M, K, N, torch.bfloat16, None ) - print("bf16_gemm_time_sympy", bf16_gemm_time_sympy) - print("fp8_gemm_time_sympy", fp8_gemm_time_sympy) - print("fp8_ovhd_time_sympy", fp8_ovhd_time_sympy) - print() + if recipe_name and recipe_name.startswith(("nvfp4", "mxfp4")): + fp8_gemm_time_sympy = get_inference_gemm_time_sympy( + M, K, N, torch.float4_e2m1fn_x2, recipe_name + ) + else: + gemm_recipe_name = "mxfp8" if recipe_name.startswith("mxfp8") else None + fp8_gemm_time_sympy = get_inference_gemm_time_sympy( + M, K, N, torch.float8_e4m3fn, gemm_recipe_name + ) + print("bf16_gemm_time_sympy", bf16_gemm_time_sympy) + print("fp8_gemm_time_sympy", fp8_gemm_time_sympy) + print("fp8_ovhd_time_sympy", fp8_ovhd_time_sympy) + print() + else: + # TODO: enable roofline analysis for conv + pass + + # Note: roofline for conv2d/conv3d is not added yet, so most of the + # things for conv2d/conv3d we'll left out for now headers = [ "fwd_M", "fwd_K", "fwd_N", + "D", + "H", + "W", + "kernel_size", # roofline - gemm time (fwd + bwd, 3 gemms) "r_bf16_gemm_s", "r_fp8_gemm_s", @@ -258,6 +301,7 @@ def run( "rb_bf16_gemm_ratio", "rb_fp8_gemm_ratio", ] + results = [] name_to_shapes = get_name_to_shapes_iter(shape_gen_name, user_M, user_K, user_N) @@ -266,54 +310,93 @@ def run( if n_limit is not None and idx >= n_limit: break - # use roofline model to estimate gemm time - # note: cast from sympy.core.numbers.Float to float to make pandas formatting work - r_bf16_gemm_time_s = float( - bf16_gemm_time_sympy.subs(M, M_val).subs(K, K_val).subs(N, N_val) - ) - r_fp8_gemm_time_s = float( - fp8_gemm_time_sympy.subs(M, M_val).subs(K, K_val).subs(N, N_val) - ) - - # if enabled, also measured observed gemm time - b_bf16_gemm_time_s, b_fp8_gemm_time_s = 0, 0 - rb_bf16_gemm_ratio = -1 - rb_fp8_gemm_ratio = -1 + if op_name == "linear": + # use roofline model to estimate gemm time + # note: cast from sympy.core.numbers.Float to float to make pandas formatting work + r_bf16_gemm_time_s = float( + bf16_gemm_time_sympy.subs(M, M_val).subs(K, K_val).subs(N, N_val) + ) + r_fp8_gemm_time_s = float( + fp8_gemm_time_sympy.subs(M, M_val).subs(K, K_val).subs(N, N_val) + ) - if do_benchmarks: - # TODO(future): make the bf16 gemm times exactly match the e2e - # benchmarks, there is a slight deviation, probably related to gemm - # operand memory formats/transpositions below not exactly matching - # what PyTorch core is doing for `torch.mm` - # input @ weight_t = output - bf16_g1, f8_g1 = get_gemm_times( - M_val, - K_val, - N_val, - True, - recipe_name, + # note: cast from sympy.core.numbers.Float to float to make pandas formatting work + r_fp8_ovhd_time_s = float( + fp8_ovhd_time_sympy.subs(M, M_val).subs(K, K_val).subs(N, N_val) ) - b_bf16_gemm_time_s = bf16_g1 - b_fp8_gemm_time_s = f8_g1 - rb_bf16_gemm_ratio = r_bf16_gemm_time_s / b_bf16_gemm_time_s - rb_fp8_gemm_ratio = r_fp8_gemm_time_s / b_fp8_gemm_time_s - - # note: cast from sympy.core.numbers.Float to float to make pandas formatting work - r_fp8_ovhd_time_s = float( - fp8_ovhd_time_sympy.subs(M, M_val).subs(K, K_val).subs(N, N_val) - ) + r_fp8_gemm_and_ovhd_s = r_fp8_gemm_time_s + r_fp8_ovhd_time_s + r_speedup = r_bf16_gemm_time_s / (r_fp8_gemm_time_s + r_fp8_ovhd_time_s) + + # if enabled, also measured observed gemm time + b_bf16_gemm_time_s, b_fp8_gemm_time_s = 0, 0 + rb_bf16_gemm_ratio = -1 + rb_fp8_gemm_ratio = -1 + + if do_benchmarks: + # TODO(future): make the bf16 gemm times exactly match the e2e + # benchmarks, there is a 
slight deviation, probably related to gemm + # operand memory formats/transpositions below not exactly matching + # what PyTorch core is doing for `torch.mm` + # input @ weight_t = output + bf16_g1, f8_g1 = get_gemm_times( + M_val, + K_val, + N_val, + True, + recipe_name, + ) + b_bf16_gemm_time_s = bf16_g1 + b_fp8_gemm_time_s = f8_g1 + rb_bf16_gemm_ratio = r_bf16_gemm_time_s / b_bf16_gemm_time_s + rb_fp8_gemm_ratio = r_fp8_gemm_time_s / b_fp8_gemm_time_s + + else: + # roofline analysis for conv2d/conv3d are not added yet + r_bf16_gemm_time_s = None + r_fp8_gemm_time_s = None + + r_fp8_ovhd_time_s = None + r_fp8_gemm_and_ovhd_s = None + r_speedup = None + + # real gemm benchmark time, also not added yet + # if enabled, also measured observed gemm time + b_bf16_gemm_time_s, b_fp8_gemm_time_s = 0, 0 + # gemm roofline ratio achieved in real benchmark + rb_bf16_gemm_ratio = -1 + rb_fp8_gemm_ratio = -1 b_bf16_e2e_time_s, b_fp8_e2e_time_s = 0, 0 if do_benchmarks: # create the model - if not enable_fusion_modeling: - m_orig = nn.Sequential(nn.Linear(K_val, N_val, bias=False)) + if op_name == "conv2d": + m_orig = nn.Sequential( + nn.Conv2d(K_val, N_val, kernel_size, bias=False) + ).to(memory_format=torch.channels_last) + elif op_name == "conv3d": + m_orig = nn.Sequential( + nn.Conv3d(K_val, N_val, kernel_size, bias=False) + ).to(memory_format=torch.channels_last_3d) else: - m_orig = nn.Sequential(nn.ReLU(), nn.Linear(K_val, N_val, bias=False)) + if not enable_fusion_modeling: + m_orig = nn.Sequential(nn.Linear(K_val, N_val, bias=False)) + else: + m_orig = nn.Sequential( + nn.ReLU(), nn.Linear(K_val, N_val, bias=False) + ) m_orig = m_orig.cuda().bfloat16() - x = torch.randn( - M_val, K_val, dtype=torch.bfloat16, device="cuda" - ).requires_grad_() + if op_name == "conv2d": + x = torch.randn( + M_val, K_val, H, W, dtype=torch.bfloat16, device="cuda" + ).to(memory_format=torch.channels_last) + elif op_name == "conv3d": + x = torch.randn( + M_val, K_val, D, H, W, dtype=torch.bfloat16, device="cuda" + ).to(memory_format=torch.channels_last_3d) + else: + x = torch.randn( + M_val, K_val, dtype=torch.bfloat16, device="cuda" + ).requires_grad_() # get the bf16 gpu kernel time torch._dynamo.reset() @@ -327,7 +410,11 @@ def run( # get the float8 dynamic scaling gpu kernel time torch._dynamo.reset() - if recipe_name == "rowwise": + if recipe_name == "tensorwise": + config = Float8DynamicActivationFloat8WeightConfig( + granularity=PerTensor(), + ) + elif recipe_name == "rowwise": config = Float8DynamicActivationFloat8WeightConfig( granularity=PerRow(), # for now, use TORCH. 
In the future might be interesting @@ -355,7 +442,14 @@ def run( assert False, "unsupported" m_fp8_dyn = copy.deepcopy(m_orig) - quantize_(m_fp8_dyn, config) + if op_name == "linear": + quantize_(m_fp8_dyn, config) + elif op_name == "conv2d": + _is_conv2d = lambda m, fqn: isinstance(m, torch.nn.Conv2d) + quantize_(m_fp8_dyn, config, filter_fn=_is_conv2d) + else: + _is_conv3d = lambda m, fqn: isinstance(m, torch.nn.Conv3d) + quantize_(m_fp8_dyn, config, filter_fn=_is_conv3d) m_fp8_dyn = torch.compile(m_fp8_dyn) @@ -364,20 +458,22 @@ def run( fp8_trace_filename = f"{outfile}_{M_val}_{K_val}_{N_val}_fp8.json" b_fp8_e2e_time_s = get_gpu_kernel_time(m_fp8_dyn, x, fp8_trace_filename) - r_speedup = r_bf16_gemm_time_s / (r_fp8_gemm_time_s + r_fp8_ovhd_time_s) - results.append( [ M_val, K_val, N_val, + D, + H, + W, + kernel_size, # roofline - gemm r_bf16_gemm_time_s, r_fp8_gemm_time_s, # roofline - fp8 overhead r_fp8_ovhd_time_s, # roofline - gemm + overhead, and speedup - r_fp8_gemm_time_s + r_fp8_ovhd_time_s, + r_fp8_gemm_and_ovhd_s, r_speedup, # benchmarks - gemm b_bf16_gemm_time_s, diff --git a/torchao/testing/training/roofline_utils.py b/torchao/testing/training/roofline_utils.py index e391a4d44b..bf234b3717 100644 --- a/torchao/testing/training/roofline_utils.py +++ b/torchao/testing/training/roofline_utils.py @@ -43,6 +43,22 @@ # TODO(future): measure once we have the hardware "pct_achievable_mem_bw": 0.92, }, + "NVIDIA GB200": { + # https://resources.nvidia.com/en-us-blackwell-architecture, page 19, + # divide by 2 because no sparsity + "bf16_peak_tops": 2.25e15, + "fp8_peak_tops": 4.5e15, + "fp4_peak_tops": 9.0e15, + # https://resources.nvidia.com/en-us-blackwell-architecture, page 20 + # 8.0 TB per second + "peak_mem_bw_bytes_sec": 8.0e12, + # for now, copy over from H100 + # TODO(future): measure once we have the hardware + "pct_achievable_gemm_tops": 0.78, + # for now, copy over from H100 + # TODO(future): measure once we have the hardware + "pct_achievable_mem_bw": 0.92, + }, "AMD Instinct MI300X": { # https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/data-sheets/amd-instinct-mi300x-data-sheet.pdf, page 1, "bf16_peak_tops": 1307e12, From 8c375689e5236238d84242bc5c241b3772c59251 Mon Sep 17 00:00:00 2001 From: Apurva Jain Date: Wed, 12 Nov 2025 10:14:39 -0800 Subject: [PATCH 18/22] Move floatx_tensor_core_layout to prototype/dtypes (#3317) --- benchmarks/benchmark_fp6.py | 2 +- docs/source/api_ref_dtypes.rst | 2 +- test/dtypes/test_floatx.py | 12 +- torchao/dtypes/affine_quantized_tensor.py | 5 +- torchao/dtypes/affine_quantized_tensor_ops.py | 8 +- .../floatx/floatx_tensor_core_layout.py | 679 +----------------- torchao/prototype/dtypes/__init__.py | 2 + torchao/prototype/dtypes/floatx/__init__.py | 17 + .../floatx/floatx_tensor_core_layout.py | 666 +++++++++++++++++ torchao/quantization/quant_api.py | 2 +- 10 files changed, 723 insertions(+), 672 deletions(-) create mode 100644 torchao/prototype/dtypes/floatx/__init__.py create mode 100644 torchao/prototype/dtypes/floatx/floatx_tensor_core_layout.py diff --git a/benchmarks/benchmark_fp6.py b/benchmarks/benchmark_fp6.py index c22eba9e1a..4aac4b952f 100644 --- a/benchmarks/benchmark_fp6.py +++ b/benchmarks/benchmark_fp6.py @@ -9,7 +9,7 @@ from tqdm import tqdm from torchao.dtypes import to_affine_quantized_fpx -from torchao.dtypes.floatx import FloatxTensorCoreLayout +from torchao.prototype.dtypes.floatx import FloatxTensorCoreLayout from torchao.utils import benchmark_torch_function_in_microseconds diff --git 
a/docs/source/api_ref_dtypes.rst b/docs/source/api_ref_dtypes.rst index 3997b444b3..826c16fe19 100644 --- a/docs/source/api_ref_dtypes.rst +++ b/docs/source/api_ref_dtypes.rst @@ -20,7 +20,6 @@ Layouts and Tensor Subclasses TensorCoreTiledLayout Float8Layout FloatxTensor - FloatxTensorCoreLayout MarlinSparseLayout Int4CPULayout CutlassSemiSparseLayout @@ -52,6 +51,7 @@ Prototype Int8DynamicActInt4WeightCPULayout MarlinQQQTensor MarlinQQQLayout + FloatxTensorCoreLayout UintxLayout .. diff --git a/test/dtypes/test_floatx.py b/test/dtypes/test_floatx.py index ab4a13d24c..a3dd4d19e3 100644 --- a/test/dtypes/test_floatx.py +++ b/test/dtypes/test_floatx.py @@ -14,20 +14,20 @@ run_tests, ) -from torchao.dtypes.floatx import ( +from torchao.prototype.custom_fp_utils import ( + _f32_to_floatx_unpacked, + _floatx_unpacked_to_f32, +) +from torchao.prototype.dtypes.floatx import ( FloatxTensorCoreLayout, from_scaled_tc_floatx, to_scaled_tc_floatx, ) -from torchao.dtypes.floatx.floatx_tensor_core_layout import ( +from torchao.prototype.dtypes.floatx.floatx_tensor_core_layout import ( FloatxTensorCoreAQTTensorImpl, _pack_tc_floatx, _pack_tc_fp6, ) -from torchao.prototype.custom_fp_utils import ( - _f32_to_floatx_unpacked, - _floatx_unpacked_to_f32, -) from torchao.quantization import ( FPXWeightOnlyConfig, quantize_, diff --git a/torchao/dtypes/affine_quantized_tensor.py b/torchao/dtypes/affine_quantized_tensor.py index 0d7ed8d9e2..3303bd5267 100644 --- a/torchao/dtypes/affine_quantized_tensor.py +++ b/torchao/dtypes/affine_quantized_tensor.py @@ -136,7 +136,8 @@ def dequantize(self, output_dtype: Optional[torch.dtype] = None) -> torch.Tensor if output_dtype is None: output_dtype = self.dtype - from torchao.dtypes.floatx import Float8Layout, FloatxTensorCoreLayout + from torchao.dtypes.floatx import Float8Layout + from torchao.prototype.dtypes.floatx import FloatxTensorCoreLayout if isinstance(self._layout, FloatxTensorCoreLayout): int_data, scale = self.tensor_impl.get_plain() @@ -539,7 +540,7 @@ def from_hp_to_fpx( _layout: Layout, ): """Create a floatx AffineQuantizedTensor from a high precision tensor. 
Floatx is represented as ebits and mbits, and supports the representation of float1-float7.""" - from torchao.dtypes.floatx import FloatxTensorCoreLayout + from torchao.prototype.dtypes.floatx import FloatxTensorCoreLayout assert isinstance(_layout, FloatxTensorCoreLayout), ( f"Only FloatxTensorCoreLayout is supported for floatx, got {_layout}" diff --git a/torchao/dtypes/affine_quantized_tensor_ops.py b/torchao/dtypes/affine_quantized_tensor_ops.py index 6c7216ab12..730d33d2c6 100644 --- a/torchao/dtypes/affine_quantized_tensor_ops.py +++ b/torchao/dtypes/affine_quantized_tensor_ops.py @@ -21,10 +21,6 @@ _linear_fp_act_fp8_weight_check, _linear_fp_act_fp8_weight_impl, ) -from torchao.dtypes.floatx.floatx_tensor_core_layout import ( - _linear_f16_bf16_act_floatx_weight_check, - _linear_f16_bf16_act_floatx_weight_impl, -) from torchao.dtypes.uintx.int4_cpu_layout import ( _linear_fp_act_uint4_weight_cpu_check, _linear_fp_act_uint4_weight_cpu_impl, @@ -72,6 +68,10 @@ _linear_bf16_act_uint4_weight_check, _linear_bf16_act_uint4_weight_impl, ) +from torchao.prototype.dtypes.floatx.floatx_tensor_core_layout import ( + _linear_f16_bf16_act_floatx_weight_check, + _linear_f16_bf16_act_floatx_weight_impl, +) from torchao.prototype.dtypes.uintx.block_sparse_layout import ( _linear_int8_act_int8_weight_block_sparse_check, _linear_int8_act_int8_weight_block_sparse_impl, diff --git a/torchao/dtypes/floatx/floatx_tensor_core_layout.py b/torchao/dtypes/floatx/floatx_tensor_core_layout.py index c7fb1e1a7c..7f96564458 100644 --- a/torchao/dtypes/floatx/floatx_tensor_core_layout.py +++ b/torchao/dtypes/floatx/floatx_tensor_core_layout.py @@ -3,664 +3,29 @@ # # This source code is licensed under the BSD 3-Clause license found in the # LICENSE file in the root directory of this source tree. -from dataclasses import dataclass -from functools import reduce -from typing import Optional, Tuple -import torch -from torch import Tensor -from torch.utils._python_dispatch import ( - is_traceable_wrapper_subclass, - return_and_correct_aliasing, -) +# Backward compatibility stub - imports from the new location +import warnings -from torchao.dtypes.affine_quantized_tensor import ( - AffineQuantizedTensor, - register_layout, -) -from torchao.dtypes.utils import ( - AQTTensorImpl, - Layout, -) -from torchao.prototype.custom_fp_utils import ( - _f32_to_floatx_unpacked, - _floatx_unpacked_to_f32, - _n_ones, +warnings.warn( + "Importing from torchao.dtypes.floatx.floatx_tensor_core_layout is deprecated. " + "Please use 'from torchao.prototype.dtypes.floatx.floatx_tensor_core_layout import ...' instead. " + "This import path will be removed in a future torchao release. " + "Please check issue: https://github.com/pytorch/ao/issues/2752 for more details. 
", + DeprecationWarning, + stacklevel=2, ) -aten = torch.ops.aten -_ONES_TABLE = [_n_ones(i) for i in range(8)] - - -def _pack(x: Tensor, n_bits: int) -> Tensor: - return reduce( - torch.bitwise_or, - [ - x[..., i :: (8 // n_bits)] << (8 - (i + 1) * n_bits) - for i in range(8 // n_bits) - ], - ) - - -def _unpack(x: Tensor, n_bits: int) -> Tensor: - return torch.stack( - [ - (x >> (8 - (i + 1) * n_bits)) & ((1 << n_bits) - 1) - for i in range(8 // n_bits) - ], - dim=-1, - ).flatten(-2) - - -# https://github.com/usyd-fsalab/fp6_llm/blob/5df6737cca32f604e957e3f63f03ccc2e4d1df0d/fp6_llm/csrc/utils/weight_prepacking.h#L87-L116 -def _bit_interleave(x: Tensor, n_bits: int, undo: bool = False) -> Tensor: - # the original code unpacks/packs the values from/to uint32 while we unpack/pack the values from/to uint8 - # thus, we need to reverse byte order within a uint32 word. - x = x.reshape(-1, 4).flip(1) - - x = _unpack(x, n_bits) - x = x.view(-1, 4 * (8 // n_bits)) - - if not undo: - bit_order = { - 1: [ - 1, - 5, - 9, - 13, - 17, - 21, - 25, - 29, - 3, - 7, - 11, - 15, - 19, - 23, - 27, - 31, - 0, - 4, - 8, - 12, - 16, - 20, - 24, - 28, - 2, - 6, - 10, - 14, - 18, - 22, - 26, - 30, - ], - 2: [1, 5, 9, 13, 3, 7, 11, 15, 0, 4, 8, 12, 2, 6, 10, 14], - 4: [1, 5, 3, 7, 0, 4, 2, 6], - }[n_bits] - - else: - # this is inverse of the above, obtained by running - # [v.index(i) for i in range(len(v))] - bit_order = { - 1: [ - 16, - 0, - 24, - 8, - 17, - 1, - 25, - 9, - 18, - 2, - 26, - 10, - 19, - 3, - 27, - 11, - 20, - 4, - 28, - 12, - 21, - 5, - 29, - 13, - 22, - 6, - 30, - 14, - 23, - 7, - 31, - 15, - ], - 2: [8, 0, 12, 4, 9, 1, 13, 5, 10, 2, 14, 6, 11, 3, 15, 7], - 4: [4, 0, 6, 2, 5, 1, 7, 3], - }[n_bits] - - x = x[:, bit_order] - x = _pack(x, n_bits) - - # reverse byte order within a uint32 word again. 
- x = x.reshape(-1, 4).flip(1) - return x.flatten() - - -# this is a literal adaptation of FP6-LLM ahead-of-time bit-level pre-packing -# https://github.com/usyd-fsalab/fp6_llm/blob/5df6737cca32f604e957e3f63f03ccc2e4d1df0d/fp6_llm/csrc/utils/weight_prepacking.h -def _pack_tc_floatx(tensor: Tensor, nbits: int) -> Tensor: - assert tensor.ndim == 2, tensor.dtype == torch.uint8 - M, N = tensor.shape - assert (M % 64 == 0) and (N % 64 == 0) - - # Pass 1 from original code - tensor = tensor.view(M // 64, 4, 2, 8, N // 16, 2, 8) - tensor = tensor.permute(0, 4, 1, 5, 2, 3, 6) - tensor = tensor.reshape(-1, 32, 2) - tensor = tensor.permute(1, 0, 2) - tensor = tensor.flatten() - - used_bits = 0 - fragments = [] - - for y in [1, 2, 4]: - if nbits & y: - mask = (1 << y) - 1 - tensor_ybit = (tensor >> (nbits - used_bits - y)) & mask - tensor_ybit = _pack(tensor_ybit, y) - - tensor_ybit = ( - tensor_ybit.view(32, -1, 4).permute(1, 0, 2).flip(2) - ) # Pass 2 from original code - tensor_ybit = _bit_interleave( - tensor_ybit.flatten(), y - ) # Pass 3 from original code - fragments.append(tensor_ybit) - used_bits += y - - return torch.cat(fragments, dim=0).view(M, -1) - - -# more optimized version of _pack_tc_floatx() for FP6 by merging ops -def _pack_tc_fp6(tensor: Tensor) -> Tensor: - assert tensor.ndim == 2, tensor.dtype == torch.uint8 - M, N = tensor.shape - assert (M % 64 == 0) and (N % 64 == 0) - - tensor = tensor.view(M // 64, 2, 2, 2, 8, N // 16, 2, 8) - tensor = tensor.flip(3) - - tensor_2bit = (tensor >> 4) & 0b11 - tensor_2bit = tensor_2bit.permute(0, 5, 1, 4, 7, 3, 2, 6) - tensor_2bit = _pack(tensor_2bit.flatten(), 2) - - tensor_4bit = tensor & 0b1111 - tensor_4bit = tensor_4bit.permute(0, 5, 1, 2, 4, 7, 3, 6) - tensor_4bit = _pack(tensor_4bit.flatten(), 4) - - return torch.cat([tensor_2bit, tensor_4bit], dim=0).view(M, -1) - - -# currently only optimize for TC-FP6 packing -def pack_tc_floatx(tensor: Tensor, nbits: int) -> Tensor: - if nbits == 6: - return _pack_tc_fp6(tensor) - return _pack_tc_floatx(tensor, nbits) - - -def to_scaled_tc_floatx( - tensor: Tensor, ebits: int, mbits: int -) -> Tuple[Tensor, Tensor]: - # _n_ones() is not compatible with torch.compile() due to << operator - # https://github.com/pytorch/pytorch/issues/119152 - # exp_bias = _n_ones(ebits - 1) - # max_normal = 2 ** (_n_ones(ebits) - exp_bias) * (_n_ones(mbits + 1) / (2 ** mbits)) - - # workaround: global lookup table - exp_bias = _ONES_TABLE[ebits - 1] - max_normal = 2 ** (_ONES_TABLE[ebits] - exp_bias) * ( - _ONES_TABLE[mbits + 1] / (2**mbits) - ) - - dtype = tensor.dtype - tensor = tensor.float() - scale = tensor.abs().amax(1).clamp(min=1e-12) / max_normal - tensor_floatx = _f32_to_floatx_unpacked(tensor / scale.view(-1, 1), ebits, mbits) - tensor_tc_floatx = pack_tc_floatx(tensor_floatx, 1 + ebits + mbits) - return tensor_tc_floatx, scale.to(dtype) - - -# inverse of _pack_tc_floatx() -def _unpack_tc_floatx(tensor: Tensor, nbits: int) -> Tensor: - assert tensor.ndim == 2 and tensor.dtype == torch.uint8 - M = tensor.shape[0] - size = tensor.numel() - tensor = tensor.flatten() - offset = 0 - used_bits = 0 - - tensor_floatx = None - - for y in [1, 2, 4]: - if nbits & y: - size_ybit = size // nbits * y - tensor_ybit = tensor[offset : offset + size_ybit] - offset += size_ybit - - tensor_ybit = _bit_interleave(tensor_ybit, y, undo=True) # undo Pass 3 - tensor_ybit = ( - tensor_ybit.view(-1, 32, 4).flip(2).permute(1, 0, 2) - ) # undo Pass 2 - - tensor_ybit = _unpack(tensor_ybit.flatten(), y) - tensor_ybit = tensor_ybit << 
(nbits - used_bits - y) - used_bits += y - - if tensor_floatx is None: - tensor_floatx = tensor_ybit - else: - tensor_floatx |= tensor_ybit - - # undo Pass 1 - tensor_floatx = tensor_floatx.view(32, -1, 2).permute(1, 0, 2) - tensor_floatx = tensor_floatx.reshape(M // 64, -1, 4, 2, 2, 8, 8) - tensor_floatx = tensor_floatx.permute(0, 2, 4, 5, 1, 3, 6) - tensor_floatx = tensor_floatx.reshape(M, -1) - return tensor_floatx - - -# more optimized version of _unpack_tc_floatx() for FP6 by merging ops -# inverse of _unpack_tc_fp6() -def _unpack_tc_fp6(tensor: Tensor) -> Tensor: - assert tensor.ndim == 2 and tensor.dtype == torch.uint8 - M = tensor.shape[0] - N = tensor.shape[1] // 3 * 4 - assert (M % 64 == 0) and (N % 64 == 0) - size_2bit = M * N // 4 - size_4bit = M * N // 2 - tensor = tensor.view(-1) - assert tensor.numel() == size_2bit + size_4bit - - tensor_2bit, tensor_4bit = tensor.split([size_2bit, size_4bit]) - - tensor_2bit = _unpack(tensor_2bit, 2) - tensor_2bit = tensor_2bit.view(M // 64, N // 16, 2, 8, 8, 2, 2, 2) - tensor_2bit = tensor_2bit.permute(0, 2, 6, 5, 3, 1, 7, 4) - - tensor_4bit = _unpack(tensor_4bit, 4) - tensor_4bit = tensor_4bit.view(M // 64, N // 16, 2, 2, 8, 8, 2, 2) - tensor_4bit = tensor_4bit.permute(0, 2, 3, 6, 4, 1, 7, 5) - - tensor_fp6 = (tensor_2bit << 4) | tensor_4bit - tensor_fp6 = tensor_fp6.flip(3).reshape(M, N) - return tensor_fp6 - - -def unpack_tc_floatx(tensor: Tensor, nbits: int) -> Tensor: - if nbits == 6: - return _unpack_tc_fp6(tensor) - return _unpack_tc_floatx(tensor, nbits) - - -def from_scaled_tc_floatx(tensor: Tensor, ebits: int, mbits: int, scale=None) -> Tensor: - floatx_unpacked = unpack_tc_floatx(tensor, 1 + ebits + mbits) - tensor = _floatx_unpacked_to_f32(floatx_unpacked, ebits, mbits) - if scale is not None: - tensor = tensor * scale.float().view(-1, 1) - return tensor - - -# https://github.com/microsoft/DeepSpeed/blob/3a3a6db3332e339cc9fd94efd4982f6d60635a3d/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear.py -_SPLIT_K_MAP = [ - { # tokens: [1, 64] - 3072: 18, - 4096: 13, - 5120: 10, - 6144: 9, - 8192: 6, - 10240: 5, - 14336: 7, - 28672: 7, - 57344: 7, - }, - { # tokens: [65:128] - 3072: 9, - 4096: 6, - 5120: 5, - 6144: 9, - 8192: 3, - 10240: 5, - 14336: 7, - 28672: 7, - 57344: 6, - }, - { # tokens: [129:192] - 3072: 6, - 4096: 4, - 5120: 7, - 6144: 3, - 8192: 2, - 10240: 5, - 14336: 5, - 28672: 5, - 57344: 4, - }, - { # tokens: [193:256] - 3072: 9, - 4096: 3, - 5120: 5, - 6144: 2, - 8192: 5, - 10240: 4, - 14336: 8, - 28672: 6, - 57344: 4, - }, - { # tokens: [257:320] - 3072: 7, - 4096: 5, - 5120: 2, - 6144: 5, - 8192: 4, - 10240: 1, - 14336: 3, - 28672: 3, - 57344: 4, - }, - { # tokens: [321:384] - 3072: 3, - 4096: 2, - 5120: 5, - 6144: 3, - 8192: 1, - 10240: 8, - 14336: 3, - 28672: 4, - 57344: 3, - }, - { # tokens: [385:448] - 3072: 5, - 4096: 7, - 5120: 3, - 6144: 5, - 8192: 7, - 10240: 3, - 14336: 1, - 28672: 1, - 57344: 3, - }, - { # tokens: [449:512] - 3072: 2, - 4096: 5, - 5120: 4, - 6144: 1, - 8192: 5, - 10240: 2, - 14336: 6, - 28672: 4, - 57344: 1, - }, - { # tokens: [513:576] - 3072: 2, - 4096: 3, - 5120: 1, - 6144: 1, - 8192: 3, - 10240: 3, - 14336: 3, - 28672: 1, - 57344: 1, - }, - { # tokens: [577:640] - 3072: 5, - 4096: 4, - 5120: 1, - 6144: 4, - 8192: 2, - 10240: 1, - 14336: 1, - 28672: 1, - 57344: 1, - }, - { # tokens: [641:704] - 3072: 3, - 4096: 1, - 5120: 2, - 6144: 2, - 8192: 1, - 10240: 2, - 14336: 1, - 28672: 1, - 57344: 1, - }, - { # tokens: [705:768] - 3072: 3, - 4096: 1, - 5120: 3, - 6144: 2, - 8192: 
1, - 10240: 1, - 14336: 1, - 28672: 1, - 57344: 1, - }, -] - - -# quantization api integrations -@dataclass(frozen=True) -class FloatxTensorCoreLayout(Layout): - """FloatxTensorCoreLayout is a data class that defines the layout for a tensor with a specific number of exponent bits (ebits) and mantissa bits (mbits). - This layout is used in the context of quantization and packing of tensors optimized for TensorCore operations. - """ - - ebits: int - mbits: int - - -@register_layout(FloatxTensorCoreLayout) -class FloatxTensorCoreAQTTensorImpl(AQTTensorImpl): - """FloatxTensorCoreAQTTensorImpl represents a Tensor with dtype floatx(ebits=a, mbits=b), - it has a internal tensor field of "packed_floatx_data", which is packed from the - uint8 unpacked data (the output of `_quantize_affine_floatx` operator) - - The packing is optimized for TensorCore, from the fp6-llm paper: https://arxiv.org/abs/2401.14112 - github repo: https://github.com/usyd-fsalab/fp6_llm, now renamed to quant-llm - - At a high level packing is done by grouping bits into 1 bit fragments (shards), 2 bit fragments and - 4 bit fragments each fragments are packed separately and concatenated together. - For example for 6 bit dtype, we can extract the first 4 bits for all elements and pack them together - in a fragment, and extract the last 2 bits for all elements and pack them into fragment, in the end - we concatenate the fragments together. - - If original Tensor shape is (M, N), and the data is in nbit, the shape of the packed data will be - (M, N // 8 * nbit) - - FloatxTensorCoreAQTTensorImpl.from_plain takes an unpacked uint8 floatx Tensor of shape (M, N), with format of - (zero padding bits + sign bit + exponent bits + mantissa bits), e.g. 00SEEEMM for fp6_e3_m2 - it will then pack the weight and instantiate the FloatxTensorCoreAQTTensorImpl tensor - FloatxTensorCoreAQTTensorImpl.__init__() takes a packed floatx Tensor of shape (M, N // 8 * nbit) - """ - - def __new__( - cls, - packed_floatx_data: torch.Tensor, - scale: torch.Tensor, - _layout: Layout, - ): - assert packed_floatx_data.ndim == 2 - assert packed_floatx_data.dtype == torch.uint8 - shape = ( - packed_floatx_data.shape[0], - packed_floatx_data.shape[1] // (1 + _layout.ebits + _layout.mbits) * 8, - ) - kwargs = {} - kwargs["device"] = packed_floatx_data.device - kwargs["layout"] = ( - kwargs.get("layout") - if kwargs.get("layout", False) - else packed_floatx_data.layout - ) - kwargs["dtype"] = packed_floatx_data.dtype - kwargs["requires_grad"] = False - return torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs) # type: ignore[attr-defined] - - def __init__( - self, - packed_floatx_data: torch.Tensor, - scale: torch.Tensor, - _layout: Layout, - ): - self.packed_floatx_data = packed_floatx_data - self.scale = scale - self._layout = _layout - - def __tensor_flatten__(self): - return ["packed_floatx_data", "scale"], [self._layout] - - @classmethod - def __tensor_unflatten__( - cls, tensor_data_dict, tensor_attributes, outer_size, outer_stride - ): - packed_floatx_data, scale = ( - tensor_data_dict["packed_floatx_data"], - tensor_data_dict["scale"], - ) - (_layout,) = tensor_attributes - return cls(packed_floatx_data, scale, _layout) - - def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor]: - unpacked_floatx_data = unpack_tc_floatx( - self.packed_floatx_data, 1 + self._layout.ebits + self._layout.mbits - ) - return unpacked_floatx_data, self.scale - - @classmethod - def from_plain( - cls, - unpacked_floatx_data: torch.Tensor, - scale: torch.Tensor, - 
zero_point: Optional[torch.Tensor], - _layout: Layout, - ): - """ - Format for `unpacked_floatx_data` will be: - zero padding bits | sign bit | exponent bits | mantissa bits - - For example for fp6_e3_m2, the format will be: `00SEEEMM`, where S is sign bit, E is exponent - bit, M is mantissa bit - """ - assert isinstance(_layout, FloatxTensorCoreLayout) - packed_floatx_data = pack_tc_floatx( - unpacked_floatx_data, 1 + _layout.ebits + _layout.mbits - ) - return cls(packed_floatx_data, scale, _layout) - - def __repr__(self): - unpacked_floatx_data, scale = self.get_plain() - _layout = self.get_layout() - return f"{self.__class__.__name__}(unpacked_floatx_data={unpacked_floatx_data}, scale={scale}, _layout={_layout})" - - def _apply_fn_to_data(self, fn): - return self.__class__( - fn(self.packed_floatx_data), - fn(self.scale), - self._layout, - ) - - def to(self, *args, **kwargs): - kwargs = self._get_to_kwargs(*args, **kwargs) - device = kwargs.pop("device") - return self.__class__( - self.packed_floatx_data.to(device), - self.scale.to(device), - self._layout, - ) - - @classmethod - def __torch_dispatch__(cls, func, types, args, kwargs): - kwargs = {} if kwargs is None else kwargs - - if func is aten.detach.default: - return return_and_correct_aliasing( - func, args, kwargs, args[0]._apply_fn_to_data(torch.detach) - ) - elif func is aten.clone.default: - return return_and_correct_aliasing( - func, args, kwargs, args[0]._apply_fn_to_data(torch.clone) - ) - elif func is aten._to_copy.default: - return return_and_correct_aliasing( - func, - args, - kwargs, - args[0]._apply_fn_to_data( - lambda x: x.to(device=kwargs.pop("device", None)) - ), - ) - - raise NotImplementedError( - f"FloatxTensorCoreAQTTensorImpl dispatch: attempting to run {func}, this is not supported" - ) - - __torch_function__ = torch._C._disabled_torch_function_impl - - def get_layout(self) -> Layout: - return self._layout - - -def _linear_f16_bf16_act_floatx_weight_check(input_tensor, weight_tensor, bias): - from torchao.dtypes.floatx import FloatxTensorCoreLayout - - return ( - # input is native float32 tensor - not is_traceable_wrapper_subclass(input_tensor) - and input_tensor.is_floating_point() - and input_tensor.dtype in (torch.float16, torch.bfloat16) - and - # weight is floatx Tensor - isinstance(weight_tensor, AffineQuantizedTensor) - and isinstance(weight_tensor._layout, FloatxTensorCoreLayout) - and ( - # weight is using fp6 quantization - (weight_tensor._layout.ebits == 3 and weight_tensor._layout.mbits == 2) - or (weight_tensor._layout.ebits == 2 and weight_tensor._layout.mbits == 3) - or - # weight is using fp5 quantization - (weight_tensor._layout.ebits == 2 and weight_tensor._layout.mbits == 2) - or (weight_tensor._layout.ebits == 3 and weight_tensor._layout.mbits == 1) - ) - ) - - -def _linear_f16_bf16_act_floatx_weight_impl(input_tensor, weight_tensor, bias): - from torchao.ops import quant_llm_linear - - act = input_tensor - weight = weight_tensor - - out_dim, in_dim = weight.shape - act_reshaped = act.view(-1, in_dim) - - # https://github.com/microsoft/DeepSpeed/blob/3a3a6db3332e339cc9fd94efd4982f6d60635a3d/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear.py - bsize = act_reshaped.shape[0] - splitK = _SPLIT_K_MAP[(bsize - 1) // 64].get(out_dim, 1) if bsize <= 768 else 1 - - out = quant_llm_linear( - weight._layout.ebits, - weight._layout.mbits, - act_reshaped, - weight.tensor_impl.packed_floatx_data, - weight.tensor_impl.scale, - splitK=splitK, - ) - - if bias is not None: - out += bias - - 
return out.view(*act.shape[:-1], out_dim).to(act.dtype) +# Re-export all public symbols from the new location for backward compatibility +from torchao.prototype.dtypes.floatx.floatx_tensor_core_layout import ( # noqa: F401 + FloatxTensorCoreAQTTensorImpl, # noqa: F401 + FloatxTensorCoreLayout, # noqa: F401 + _linear_f16_bf16_act_floatx_weight_check, # noqa: F401 + _linear_f16_bf16_act_floatx_weight_impl, # noqa: F401 + _pack_tc_floatx, # noqa: F401 + _pack_tc_fp6, # noqa: F401 + from_scaled_tc_floatx, # noqa: F401 + pack_tc_floatx, # noqa: F401 + to_scaled_tc_floatx, # noqa: F401 + unpack_tc_floatx, # noqa: F401 +) diff --git a/torchao/prototype/dtypes/__init__.py b/torchao/prototype/dtypes/__init__.py index 88fe73ab76..bfb82fdd60 100644 --- a/torchao/prototype/dtypes/__init__.py +++ b/torchao/prototype/dtypes/__init__.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD 3-Clause license found in the # LICENSE file in the root directory of this source tree. +from .floatx import FloatxTensorCoreLayout from .uintx import ( BlockSparseLayout, CutlassInt4PackedLayout, @@ -26,6 +27,7 @@ "MarlinQQQTensor", "to_marlinqqq_quantized_intx", "GemlitePackedLayout", + "FloatxTensorCoreLayout", "UintxLayout", "UintxTensor", "UintxAQTTensorImpl", diff --git a/torchao/prototype/dtypes/floatx/__init__.py b/torchao/prototype/dtypes/floatx/__init__.py new file mode 100644 index 0000000000..edd045f8a9 --- /dev/null +++ b/torchao/prototype/dtypes/floatx/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. + +from .floatx_tensor_core_layout import ( + FloatxTensorCoreLayout, + from_scaled_tc_floatx, + to_scaled_tc_floatx, +) + +__all__ = [ + "FloatxTensorCoreLayout", + "to_scaled_tc_floatx", + "from_scaled_tc_floatx", +] diff --git a/torchao/prototype/dtypes/floatx/floatx_tensor_core_layout.py b/torchao/prototype/dtypes/floatx/floatx_tensor_core_layout.py new file mode 100644 index 0000000000..c7fb1e1a7c --- /dev/null +++ b/torchao/prototype/dtypes/floatx/floatx_tensor_core_layout.py @@ -0,0 +1,666 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. 
+from dataclasses import dataclass +from functools import reduce +from typing import Optional, Tuple + +import torch +from torch import Tensor +from torch.utils._python_dispatch import ( + is_traceable_wrapper_subclass, + return_and_correct_aliasing, +) + +from torchao.dtypes.affine_quantized_tensor import ( + AffineQuantizedTensor, + register_layout, +) +from torchao.dtypes.utils import ( + AQTTensorImpl, + Layout, +) +from torchao.prototype.custom_fp_utils import ( + _f32_to_floatx_unpacked, + _floatx_unpacked_to_f32, + _n_ones, +) + +aten = torch.ops.aten +_ONES_TABLE = [_n_ones(i) for i in range(8)] + + +def _pack(x: Tensor, n_bits: int) -> Tensor: + return reduce( + torch.bitwise_or, + [ + x[..., i :: (8 // n_bits)] << (8 - (i + 1) * n_bits) + for i in range(8 // n_bits) + ], + ) + + +def _unpack(x: Tensor, n_bits: int) -> Tensor: + return torch.stack( + [ + (x >> (8 - (i + 1) * n_bits)) & ((1 << n_bits) - 1) + for i in range(8 // n_bits) + ], + dim=-1, + ).flatten(-2) + + +# https://github.com/usyd-fsalab/fp6_llm/blob/5df6737cca32f604e957e3f63f03ccc2e4d1df0d/fp6_llm/csrc/utils/weight_prepacking.h#L87-L116 +def _bit_interleave(x: Tensor, n_bits: int, undo: bool = False) -> Tensor: + # the original code unpacks/packs the values from/to uint32 while we unpack/pack the values from/to uint8 + # thus, we need to reverse byte order within a uint32 word. + x = x.reshape(-1, 4).flip(1) + + x = _unpack(x, n_bits) + x = x.view(-1, 4 * (8 // n_bits)) + + if not undo: + bit_order = { + 1: [ + 1, + 5, + 9, + 13, + 17, + 21, + 25, + 29, + 3, + 7, + 11, + 15, + 19, + 23, + 27, + 31, + 0, + 4, + 8, + 12, + 16, + 20, + 24, + 28, + 2, + 6, + 10, + 14, + 18, + 22, + 26, + 30, + ], + 2: [1, 5, 9, 13, 3, 7, 11, 15, 0, 4, 8, 12, 2, 6, 10, 14], + 4: [1, 5, 3, 7, 0, 4, 2, 6], + }[n_bits] + + else: + # this is inverse of the above, obtained by running + # [v.index(i) for i in range(len(v))] + bit_order = { + 1: [ + 16, + 0, + 24, + 8, + 17, + 1, + 25, + 9, + 18, + 2, + 26, + 10, + 19, + 3, + 27, + 11, + 20, + 4, + 28, + 12, + 21, + 5, + 29, + 13, + 22, + 6, + 30, + 14, + 23, + 7, + 31, + 15, + ], + 2: [8, 0, 12, 4, 9, 1, 13, 5, 10, 2, 14, 6, 11, 3, 15, 7], + 4: [4, 0, 6, 2, 5, 1, 7, 3], + }[n_bits] + + x = x[:, bit_order] + x = _pack(x, n_bits) + + # reverse byte order within a uint32 word again. 
+ x = x.reshape(-1, 4).flip(1) + return x.flatten() + + +# this is a literal adaptation of FP6-LLM ahead-of-time bit-level pre-packing +# https://github.com/usyd-fsalab/fp6_llm/blob/5df6737cca32f604e957e3f63f03ccc2e4d1df0d/fp6_llm/csrc/utils/weight_prepacking.h +def _pack_tc_floatx(tensor: Tensor, nbits: int) -> Tensor: + assert tensor.ndim == 2, tensor.dtype == torch.uint8 + M, N = tensor.shape + assert (M % 64 == 0) and (N % 64 == 0) + + # Pass 1 from original code + tensor = tensor.view(M // 64, 4, 2, 8, N // 16, 2, 8) + tensor = tensor.permute(0, 4, 1, 5, 2, 3, 6) + tensor = tensor.reshape(-1, 32, 2) + tensor = tensor.permute(1, 0, 2) + tensor = tensor.flatten() + + used_bits = 0 + fragments = [] + + for y in [1, 2, 4]: + if nbits & y: + mask = (1 << y) - 1 + tensor_ybit = (tensor >> (nbits - used_bits - y)) & mask + tensor_ybit = _pack(tensor_ybit, y) + + tensor_ybit = ( + tensor_ybit.view(32, -1, 4).permute(1, 0, 2).flip(2) + ) # Pass 2 from original code + tensor_ybit = _bit_interleave( + tensor_ybit.flatten(), y + ) # Pass 3 from original code + fragments.append(tensor_ybit) + used_bits += y + + return torch.cat(fragments, dim=0).view(M, -1) + + +# more optimized version of _pack_tc_floatx() for FP6 by merging ops +def _pack_tc_fp6(tensor: Tensor) -> Tensor: + assert tensor.ndim == 2, tensor.dtype == torch.uint8 + M, N = tensor.shape + assert (M % 64 == 0) and (N % 64 == 0) + + tensor = tensor.view(M // 64, 2, 2, 2, 8, N // 16, 2, 8) + tensor = tensor.flip(3) + + tensor_2bit = (tensor >> 4) & 0b11 + tensor_2bit = tensor_2bit.permute(0, 5, 1, 4, 7, 3, 2, 6) + tensor_2bit = _pack(tensor_2bit.flatten(), 2) + + tensor_4bit = tensor & 0b1111 + tensor_4bit = tensor_4bit.permute(0, 5, 1, 2, 4, 7, 3, 6) + tensor_4bit = _pack(tensor_4bit.flatten(), 4) + + return torch.cat([tensor_2bit, tensor_4bit], dim=0).view(M, -1) + + +# currently only optimize for TC-FP6 packing +def pack_tc_floatx(tensor: Tensor, nbits: int) -> Tensor: + if nbits == 6: + return _pack_tc_fp6(tensor) + return _pack_tc_floatx(tensor, nbits) + + +def to_scaled_tc_floatx( + tensor: Tensor, ebits: int, mbits: int +) -> Tuple[Tensor, Tensor]: + # _n_ones() is not compatible with torch.compile() due to << operator + # https://github.com/pytorch/pytorch/issues/119152 + # exp_bias = _n_ones(ebits - 1) + # max_normal = 2 ** (_n_ones(ebits) - exp_bias) * (_n_ones(mbits + 1) / (2 ** mbits)) + + # workaround: global lookup table + exp_bias = _ONES_TABLE[ebits - 1] + max_normal = 2 ** (_ONES_TABLE[ebits] - exp_bias) * ( + _ONES_TABLE[mbits + 1] / (2**mbits) + ) + + dtype = tensor.dtype + tensor = tensor.float() + scale = tensor.abs().amax(1).clamp(min=1e-12) / max_normal + tensor_floatx = _f32_to_floatx_unpacked(tensor / scale.view(-1, 1), ebits, mbits) + tensor_tc_floatx = pack_tc_floatx(tensor_floatx, 1 + ebits + mbits) + return tensor_tc_floatx, scale.to(dtype) + + +# inverse of _pack_tc_floatx() +def _unpack_tc_floatx(tensor: Tensor, nbits: int) -> Tensor: + assert tensor.ndim == 2 and tensor.dtype == torch.uint8 + M = tensor.shape[0] + size = tensor.numel() + tensor = tensor.flatten() + offset = 0 + used_bits = 0 + + tensor_floatx = None + + for y in [1, 2, 4]: + if nbits & y: + size_ybit = size // nbits * y + tensor_ybit = tensor[offset : offset + size_ybit] + offset += size_ybit + + tensor_ybit = _bit_interleave(tensor_ybit, y, undo=True) # undo Pass 3 + tensor_ybit = ( + tensor_ybit.view(-1, 32, 4).flip(2).permute(1, 0, 2) + ) # undo Pass 2 + + tensor_ybit = _unpack(tensor_ybit.flatten(), y) + tensor_ybit = tensor_ybit << 
(nbits - used_bits - y) + used_bits += y + + if tensor_floatx is None: + tensor_floatx = tensor_ybit + else: + tensor_floatx |= tensor_ybit + + # undo Pass 1 + tensor_floatx = tensor_floatx.view(32, -1, 2).permute(1, 0, 2) + tensor_floatx = tensor_floatx.reshape(M // 64, -1, 4, 2, 2, 8, 8) + tensor_floatx = tensor_floatx.permute(0, 2, 4, 5, 1, 3, 6) + tensor_floatx = tensor_floatx.reshape(M, -1) + return tensor_floatx + + +# more optimized version of _unpack_tc_floatx() for FP6 by merging ops +# inverse of _unpack_tc_fp6() +def _unpack_tc_fp6(tensor: Tensor) -> Tensor: + assert tensor.ndim == 2 and tensor.dtype == torch.uint8 + M = tensor.shape[0] + N = tensor.shape[1] // 3 * 4 + assert (M % 64 == 0) and (N % 64 == 0) + size_2bit = M * N // 4 + size_4bit = M * N // 2 + tensor = tensor.view(-1) + assert tensor.numel() == size_2bit + size_4bit + + tensor_2bit, tensor_4bit = tensor.split([size_2bit, size_4bit]) + + tensor_2bit = _unpack(tensor_2bit, 2) + tensor_2bit = tensor_2bit.view(M // 64, N // 16, 2, 8, 8, 2, 2, 2) + tensor_2bit = tensor_2bit.permute(0, 2, 6, 5, 3, 1, 7, 4) + + tensor_4bit = _unpack(tensor_4bit, 4) + tensor_4bit = tensor_4bit.view(M // 64, N // 16, 2, 2, 8, 8, 2, 2) + tensor_4bit = tensor_4bit.permute(0, 2, 3, 6, 4, 1, 7, 5) + + tensor_fp6 = (tensor_2bit << 4) | tensor_4bit + tensor_fp6 = tensor_fp6.flip(3).reshape(M, N) + return tensor_fp6 + + +def unpack_tc_floatx(tensor: Tensor, nbits: int) -> Tensor: + if nbits == 6: + return _unpack_tc_fp6(tensor) + return _unpack_tc_floatx(tensor, nbits) + + +def from_scaled_tc_floatx(tensor: Tensor, ebits: int, mbits: int, scale=None) -> Tensor: + floatx_unpacked = unpack_tc_floatx(tensor, 1 + ebits + mbits) + tensor = _floatx_unpacked_to_f32(floatx_unpacked, ebits, mbits) + if scale is not None: + tensor = tensor * scale.float().view(-1, 1) + return tensor + + +# https://github.com/microsoft/DeepSpeed/blob/3a3a6db3332e339cc9fd94efd4982f6d60635a3d/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear.py +_SPLIT_K_MAP = [ + { # tokens: [1, 64] + 3072: 18, + 4096: 13, + 5120: 10, + 6144: 9, + 8192: 6, + 10240: 5, + 14336: 7, + 28672: 7, + 57344: 7, + }, + { # tokens: [65:128] + 3072: 9, + 4096: 6, + 5120: 5, + 6144: 9, + 8192: 3, + 10240: 5, + 14336: 7, + 28672: 7, + 57344: 6, + }, + { # tokens: [129:192] + 3072: 6, + 4096: 4, + 5120: 7, + 6144: 3, + 8192: 2, + 10240: 5, + 14336: 5, + 28672: 5, + 57344: 4, + }, + { # tokens: [193:256] + 3072: 9, + 4096: 3, + 5120: 5, + 6144: 2, + 8192: 5, + 10240: 4, + 14336: 8, + 28672: 6, + 57344: 4, + }, + { # tokens: [257:320] + 3072: 7, + 4096: 5, + 5120: 2, + 6144: 5, + 8192: 4, + 10240: 1, + 14336: 3, + 28672: 3, + 57344: 4, + }, + { # tokens: [321:384] + 3072: 3, + 4096: 2, + 5120: 5, + 6144: 3, + 8192: 1, + 10240: 8, + 14336: 3, + 28672: 4, + 57344: 3, + }, + { # tokens: [385:448] + 3072: 5, + 4096: 7, + 5120: 3, + 6144: 5, + 8192: 7, + 10240: 3, + 14336: 1, + 28672: 1, + 57344: 3, + }, + { # tokens: [449:512] + 3072: 2, + 4096: 5, + 5120: 4, + 6144: 1, + 8192: 5, + 10240: 2, + 14336: 6, + 28672: 4, + 57344: 1, + }, + { # tokens: [513:576] + 3072: 2, + 4096: 3, + 5120: 1, + 6144: 1, + 8192: 3, + 10240: 3, + 14336: 3, + 28672: 1, + 57344: 1, + }, + { # tokens: [577:640] + 3072: 5, + 4096: 4, + 5120: 1, + 6144: 4, + 8192: 2, + 10240: 1, + 14336: 1, + 28672: 1, + 57344: 1, + }, + { # tokens: [641:704] + 3072: 3, + 4096: 1, + 5120: 2, + 6144: 2, + 8192: 1, + 10240: 2, + 14336: 1, + 28672: 1, + 57344: 1, + }, + { # tokens: [705:768] + 3072: 3, + 4096: 1, + 5120: 3, + 6144: 2, + 8192: 
1, + 10240: 1, + 14336: 1, + 28672: 1, + 57344: 1, + }, +] + + +# quantization api integrations +@dataclass(frozen=True) +class FloatxTensorCoreLayout(Layout): + """FloatxTensorCoreLayout is a data class that defines the layout for a tensor with a specific number of exponent bits (ebits) and mantissa bits (mbits). + This layout is used in the context of quantization and packing of tensors optimized for TensorCore operations. + """ + + ebits: int + mbits: int + + +@register_layout(FloatxTensorCoreLayout) +class FloatxTensorCoreAQTTensorImpl(AQTTensorImpl): + """FloatxTensorCoreAQTTensorImpl represents a Tensor with dtype floatx(ebits=a, mbits=b), + it has a internal tensor field of "packed_floatx_data", which is packed from the + uint8 unpacked data (the output of `_quantize_affine_floatx` operator) + + The packing is optimized for TensorCore, from the fp6-llm paper: https://arxiv.org/abs/2401.14112 + github repo: https://github.com/usyd-fsalab/fp6_llm, now renamed to quant-llm + + At a high level packing is done by grouping bits into 1 bit fragments (shards), 2 bit fragments and + 4 bit fragments each fragments are packed separately and concatenated together. + For example for 6 bit dtype, we can extract the first 4 bits for all elements and pack them together + in a fragment, and extract the last 2 bits for all elements and pack them into fragment, in the end + we concatenate the fragments together. + + If original Tensor shape is (M, N), and the data is in nbit, the shape of the packed data will be + (M, N // 8 * nbit) + + FloatxTensorCoreAQTTensorImpl.from_plain takes an unpacked uint8 floatx Tensor of shape (M, N), with format of + (zero padding bits + sign bit + exponent bits + mantissa bits), e.g. 00SEEEMM for fp6_e3_m2 + it will then pack the weight and instantiate the FloatxTensorCoreAQTTensorImpl tensor + FloatxTensorCoreAQTTensorImpl.__init__() takes a packed floatx Tensor of shape (M, N // 8 * nbit) + """ + + def __new__( + cls, + packed_floatx_data: torch.Tensor, + scale: torch.Tensor, + _layout: Layout, + ): + assert packed_floatx_data.ndim == 2 + assert packed_floatx_data.dtype == torch.uint8 + shape = ( + packed_floatx_data.shape[0], + packed_floatx_data.shape[1] // (1 + _layout.ebits + _layout.mbits) * 8, + ) + kwargs = {} + kwargs["device"] = packed_floatx_data.device + kwargs["layout"] = ( + kwargs.get("layout") + if kwargs.get("layout", False) + else packed_floatx_data.layout + ) + kwargs["dtype"] = packed_floatx_data.dtype + kwargs["requires_grad"] = False + return torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs) # type: ignore[attr-defined] + + def __init__( + self, + packed_floatx_data: torch.Tensor, + scale: torch.Tensor, + _layout: Layout, + ): + self.packed_floatx_data = packed_floatx_data + self.scale = scale + self._layout = _layout + + def __tensor_flatten__(self): + return ["packed_floatx_data", "scale"], [self._layout] + + @classmethod + def __tensor_unflatten__( + cls, tensor_data_dict, tensor_attributes, outer_size, outer_stride + ): + packed_floatx_data, scale = ( + tensor_data_dict["packed_floatx_data"], + tensor_data_dict["scale"], + ) + (_layout,) = tensor_attributes + return cls(packed_floatx_data, scale, _layout) + + def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor]: + unpacked_floatx_data = unpack_tc_floatx( + self.packed_floatx_data, 1 + self._layout.ebits + self._layout.mbits + ) + return unpacked_floatx_data, self.scale + + @classmethod + def from_plain( + cls, + unpacked_floatx_data: torch.Tensor, + scale: torch.Tensor, + 
zero_point: Optional[torch.Tensor], + _layout: Layout, + ): + """ + Format for `unpacked_floatx_data` will be: + zero padding bits | sign bit | exponent bits | mantissa bits + + For example for fp6_e3_m2, the format will be: `00SEEEMM`, where S is sign bit, E is exponent + bit, M is mantissa bit + """ + assert isinstance(_layout, FloatxTensorCoreLayout) + packed_floatx_data = pack_tc_floatx( + unpacked_floatx_data, 1 + _layout.ebits + _layout.mbits + ) + return cls(packed_floatx_data, scale, _layout) + + def __repr__(self): + unpacked_floatx_data, scale = self.get_plain() + _layout = self.get_layout() + return f"{self.__class__.__name__}(unpacked_floatx_data={unpacked_floatx_data}, scale={scale}, _layout={_layout})" + + def _apply_fn_to_data(self, fn): + return self.__class__( + fn(self.packed_floatx_data), + fn(self.scale), + self._layout, + ) + + def to(self, *args, **kwargs): + kwargs = self._get_to_kwargs(*args, **kwargs) + device = kwargs.pop("device") + return self.__class__( + self.packed_floatx_data.to(device), + self.scale.to(device), + self._layout, + ) + + @classmethod + def __torch_dispatch__(cls, func, types, args, kwargs): + kwargs = {} if kwargs is None else kwargs + + if func is aten.detach.default: + return return_and_correct_aliasing( + func, args, kwargs, args[0]._apply_fn_to_data(torch.detach) + ) + elif func is aten.clone.default: + return return_and_correct_aliasing( + func, args, kwargs, args[0]._apply_fn_to_data(torch.clone) + ) + elif func is aten._to_copy.default: + return return_and_correct_aliasing( + func, + args, + kwargs, + args[0]._apply_fn_to_data( + lambda x: x.to(device=kwargs.pop("device", None)) + ), + ) + + raise NotImplementedError( + f"FloatxTensorCoreAQTTensorImpl dispatch: attempting to run {func}, this is not supported" + ) + + __torch_function__ = torch._C._disabled_torch_function_impl + + def get_layout(self) -> Layout: + return self._layout + + +def _linear_f16_bf16_act_floatx_weight_check(input_tensor, weight_tensor, bias): + from torchao.dtypes.floatx import FloatxTensorCoreLayout + + return ( + # input is native float32 tensor + not is_traceable_wrapper_subclass(input_tensor) + and input_tensor.is_floating_point() + and input_tensor.dtype in (torch.float16, torch.bfloat16) + and + # weight is floatx Tensor + isinstance(weight_tensor, AffineQuantizedTensor) + and isinstance(weight_tensor._layout, FloatxTensorCoreLayout) + and ( + # weight is using fp6 quantization + (weight_tensor._layout.ebits == 3 and weight_tensor._layout.mbits == 2) + or (weight_tensor._layout.ebits == 2 and weight_tensor._layout.mbits == 3) + or + # weight is using fp5 quantization + (weight_tensor._layout.ebits == 2 and weight_tensor._layout.mbits == 2) + or (weight_tensor._layout.ebits == 3 and weight_tensor._layout.mbits == 1) + ) + ) + + +def _linear_f16_bf16_act_floatx_weight_impl(input_tensor, weight_tensor, bias): + from torchao.ops import quant_llm_linear + + act = input_tensor + weight = weight_tensor + + out_dim, in_dim = weight.shape + act_reshaped = act.view(-1, in_dim) + + # https://github.com/microsoft/DeepSpeed/blob/3a3a6db3332e339cc9fd94efd4982f6d60635a3d/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear.py + bsize = act_reshaped.shape[0] + splitK = _SPLIT_K_MAP[(bsize - 1) // 64].get(out_dim, 1) if bsize <= 768 else 1 + + out = quant_llm_linear( + weight._layout.ebits, + weight._layout.mbits, + act_reshaped, + weight.tensor_impl.packed_floatx_data, + weight.tensor_impl.scale, + splitK=splitK, + ) + + if bias is not None: + out += bias + + 
return out.view(*act.shape[:-1], out_dim).to(act.dtype) diff --git a/torchao/quantization/quant_api.py b/torchao/quantization/quant_api.py index f8602fa66c..83af9068ae 100644 --- a/torchao/quantization/quant_api.py +++ b/torchao/quantization/quant_api.py @@ -2401,7 +2401,7 @@ def _fpx_weight_only_transform( module = _unwrap_float8_linear(module) from torchao.dtypes import to_affine_quantized_fpx - from torchao.dtypes.floatx import FloatxTensorCoreLayout + from torchao.prototype.dtypes.floatx import FloatxTensorCoreLayout assert weight.dim() == 2, f"floatx only works for 2-d Tensor, got: {weight.dim()}" out_dim, in_dim = weight.shape From d7b537b0293798daec58bb98a10d988f3bebc2d5 Mon Sep 17 00:00:00 2001 From: Apurva Jain Date: Wed, 12 Nov 2025 17:53:34 -0800 Subject: [PATCH 19/22] Use conda libgcc-ng 11.2 for nightly tests (#3326) --- .github/workflows/regression_test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/regression_test.yml b/.github/workflows/regression_test.yml index 149a7b07da..575aca6df0 100644 --- a/.github/workflows/regression_test.yml +++ b/.github/workflows/regression_test.yml @@ -45,7 +45,7 @@ jobs: gpu-arch-version: ${{ matrix.gpu-arch-version }} submodules: recursive script: | - conda create -n venv python=3.10 -y + conda create -n venv python=3.10 libgcc-ng=11.2.0 libstdcxx-ng=11.2.0 -y conda activate venv python -m pip install --upgrade pip pip install ${{ matrix.torch-spec }} @@ -117,7 +117,7 @@ jobs: gpu-arch-version: ${{ matrix.gpu-arch-version }} submodules: recursive script: | - conda create -n venv python=3.10 libgcc-ng=11.2.0 libstdcxx-ng=11.2.0 -y + conda create -n venv python=3.10 libgcc-ng=11.2.0 libstdcxx-ng=11.2.0 -y conda activate venv python -m pip install --upgrade pip pip install ${{ matrix.torch-spec }} From 9ba0a3f487f230bba4b56fd978a1a29fca7e70a2 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Thu, 13 Nov 2025 18:13:42 +0000 Subject: [PATCH 20/22] Fix tests --- torchao/quantization/pt2e/utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/torchao/quantization/pt2e/utils.py b/torchao/quantization/pt2e/utils.py index 7ff1dbc619..df92d485b9 100644 --- a/torchao/quantization/pt2e/utils.py +++ b/torchao/quantization/pt2e/utils.py @@ -859,6 +859,15 @@ def _get_aten_graph_module_for_pattern( ): aten_pattern.graph.erase_node(node) # type: ignore[operator, union-attr] + # PyTorch 2.9+ adds _guards_fn nodes to exported graphs. + # These should not be part of pattern matching, so remove them. 
+ for node in list(aten_pattern.graph.nodes): # type: ignore[union-attr] + if node.op == "call_module" and node.name == "_guards_fn": + aten_pattern.graph.erase_node(node) # type: ignore[operator, union-attr] + # Also remove the _guards_fn module from the graph module if it exists + if hasattr(aten_pattern, "_guards_fn"): + delattr(aten_pattern, "_guards_fn") + aten_pattern.graph.eliminate_dead_code() # type: ignore[operator, union-attr] aten_pattern.recompile() # type: ignore[operator] From 38848060580baa1bce0d3632e31d18417f55b677 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Fri, 14 Nov 2025 22:15:01 +0000 Subject: [PATCH 21/22] Add a condition to run only if torch 2.9 --- torchao/quantization/pt2e/utils.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/torchao/quantization/pt2e/utils.py b/torchao/quantization/pt2e/utils.py index df92d485b9..333e8ffc00 100644 --- a/torchao/quantization/pt2e/utils.py +++ b/torchao/quantization/pt2e/utils.py @@ -859,14 +859,15 @@ def _get_aten_graph_module_for_pattern( ): aten_pattern.graph.erase_node(node) # type: ignore[operator, union-attr] - # PyTorch 2.9+ adds _guards_fn nodes to exported graphs. - # These should not be part of pattern matching, so remove them. - for node in list(aten_pattern.graph.nodes): # type: ignore[union-attr] - if node.op == "call_module" and node.name == "_guards_fn": - aten_pattern.graph.erase_node(node) # type: ignore[operator, union-attr] - # Also remove the _guards_fn module from the graph module if it exists - if hasattr(aten_pattern, "_guards_fn"): - delattr(aten_pattern, "_guards_fn") + if torch.__version__.startswith("2.9"): + # PyTorch 2.9 adds _guards_fn nodes to exported graphs. + # These have errors only on torch 2.9 and 2.9.0 + for node in list(aten_pattern.graph.nodes): # type: ignore[union-attr] + if node.op == "call_module" and node.name == "_guards_fn": + aten_pattern.graph.erase_node(node) # type: ignore[operator, union-attr] + # Also remove the _guards_fn module from the graph module if it exists + if hasattr(aten_pattern, "_guards_fn"): + delattr(aten_pattern, "_guards_fn") aten_pattern.graph.eliminate_dead_code() # type: ignore[operator, union-attr] aten_pattern.recompile() # type: ignore[operator] From a543b2a7132d2c9cc18d95454845a67190c31da9 Mon Sep 17 00:00:00 2001 From: Apurva Jain Date: Fri, 14 Nov 2025 14:17:10 -0800 Subject: [PATCH 22/22] Update utils.py --- torchao/quantization/pt2e/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchao/quantization/pt2e/utils.py b/torchao/quantization/pt2e/utils.py index 333e8ffc00..f3cbffa430 100644 --- a/torchao/quantization/pt2e/utils.py +++ b/torchao/quantization/pt2e/utils.py @@ -861,7 +861,7 @@ def _get_aten_graph_module_for_pattern( if torch.__version__.startswith("2.9"): # PyTorch 2.9 adds _guards_fn nodes to exported graphs. - # These have errors only on torch 2.9 and 2.9.0 + # These have errors only on torch 2.9.0 and 2.9.1 for node in list(aten_pattern.graph.nodes): # type: ignore[union-attr] if node.op == "call_module" and node.name == "_guards_fn": aten_pattern.graph.erase_node(node) # type: ignore[operator, union-attr]
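
The `_guards_fn` handling that patches 20 through 22 converge on reads, taken together, like a single cleanup step. Below is a minimal sketch of that logic, assuming it operates on a torch.fx GraphModule produced by export; the helper name `_strip_guards_fn` is hypothetical, while the body mirrors the patched code.

# Sketch only: the version-gated _guards_fn cleanup from the patches above,
# refactored into a hypothetical standalone helper.
import torch
from torch.fx import GraphModule


def _strip_guards_fn(gm: GraphModule) -> GraphModule:
    # torch 2.9.x export inserts a `_guards_fn` call_module node that should
    # not take part in pattern matching, so drop it before matching.
    if torch.__version__.startswith("2.9"):
        for node in list(gm.graph.nodes):
            if node.op == "call_module" and node.name == "_guards_fn":
                gm.graph.erase_node(node)
        # Also drop the backing submodule attribute, if present.
        if hasattr(gm, "_guards_fn"):
            delattr(gm, "_guards_fn")
        gm.graph.eliminate_dead_code()
        gm.recompile()
    return gm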
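
For the floatx relocation earlier in this series, a minimal round-trip sketch of the helpers at their new prototype path follows; the weight `w` and the fp6 choice (ebits=3, mbits=2) are illustrative, and per the packing asserts both dimensions must be multiples of 64.

# Sketch only: pack a weight with the relocated tensor-core floatx helpers,
# then reconstruct an fp32 approximation to gauge the quantization error.
import torch

from torchao.prototype.dtypes.floatx import (
    from_scaled_tc_floatx,
    to_scaled_tc_floatx,
)

w = torch.randn(256, 256, dtype=torch.bfloat16)  # both dims multiples of 64

# Returns the TensorCore-ordered uint8 payload plus a per-output-row scale.
packed, scale = to_scaled_tc_floatx(w, ebits=3, mbits=2)

# Dequantize back to fp32 and compare against the original weight.
w_approx = from_scaled_tc_floatx(packed, 3, 2, scale=scale)
print((w.float() - w_approx).abs().max())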