first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/tests/quantization/torchao_integration/test_torchao.py
+++ b/tests/quantization/torchao_integration/test_torchao.py
@@ -0,0 +1,662 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import tempfile
+import unittest
+
+from parameterized import parameterized
+
+from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
+from transformers.testing_utils import (
+    Expectations,
+    backend_empty_cache,
+    require_cuda_capability_at_least,
+    require_torch_accelerator,
+    require_torch_multi_accelerator,
+    require_torchao,
+    slow,
+    torch_device,
+)
+from transformers.utils import is_torch_available, is_torchao_available
+
+
+if is_torch_available():
+    import torch
+
+if is_torchao_available():
+    from torchao.dtypes import (
+        AffineQuantizedTensor,
+    )
+    from torchao.prototype.mx_formats import NVFP4DynamicActivationNVFP4WeightConfig
+    from torchao.quantization import (
+        Float8DynamicActivationFloat8WeightConfig,
+        Float8Tensor,
+        Float8WeightOnlyConfig,
+        FqnToConfig,
+        Int4WeightOnlyConfig,
+        Int8DynamicActivationInt8WeightConfig,
+        Int8DynamicActivationIntxWeightConfig,
+        Int8WeightOnlyConfig,
+        IntxWeightOnlyConfig,
+        MappingType,
+        PerAxis,
+    )
+
+
+@require_torchao
+class TorchAoConfigTest(unittest.TestCase):
+    """Checks on `TorchAoConfig` itself (dict export, repr, JSON export); no model is loaded here."""
+
+    def test_to_dict(self):
+        """
+        Makes sure the config format is properly set
+        """
+        quantization_config = TorchAoConfig(Int4WeightOnlyConfig(group_size=32))
+        config_dict = quantization_config.to_dict()
+
+        self.assertIn("quant_type", config_dict)
+        self.assertIn("quant_method", config_dict)
+        self.assertEqual(config_dict["quant_method"], "torchao")
+
+    def test_repr(self):
+        """
+        Check that there is no error in the repr
+        """
+        quantization_config = TorchAoConfig(Int4WeightOnlyConfig(group_size=8), modules_to_not_convert=["conv"])
+        # The returned text is not inspected: the test only fails if `repr` raises.
+        repr(quantization_config)
+
+    def test_json_serializable(self):
+        """
+        Check that the config dict can be JSON serialized.
+        """
+        quantization_config = TorchAoConfig(Int4WeightOnlyConfig(group_size=32))
+        config_dict = quantization_config.to_dict()
+        # assertIn shows the container on failure, which assertTrue(... in ...) does not
+        self.assertIn("group_size", config_dict["quant_type"]["default"]["_data"])
+        quantization_config.to_json_string(use_diff=False)
+
+
+@require_torchao
+@slow
+class TorchAoTestBase:
+    """Base mixin with all torchao test methods. Not a TestCase — subclass with unittest.TestCase to run."""
+
+    input_text = "What are we having for dinner?"
+    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+    device = None  # must be set by subclass
+
+    def tearDown(self):
+        """Clean up after each test: collect garbage, call `backend_empty_cache(torch_device)`, collect again."""
+        gc.collect()
+        backend_empty_cache(torch_device)
+        gc.collect()
+
+    def test_int4wo_quant(self):
+        """
+        Int4 weight-only quantization on a small LLM.
+
+        Checks that the first layer's `v_proj` weight has an "Int4" tensor type and that the generated
+        text matches the expectation registered for the current device.
+        """
+        # "plain_int32" is used on XPU, "tile_packed_to_4d" on every other device
+        if self.device == "xpu":
+            packing_format = "plain_int32"
+        else:
+            packing_format = "tile_packed_to_4d"
+        quant_config = TorchAoConfig(Int4WeightOnlyConfig(int4_packing_format=packing_format))
+
+        quantized_model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            dtype=torch.bfloat16,
+            device_map=self.device,
+            quantization_config=quant_config,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+        v_proj_weight = quantized_model.model.layers[0].self_attn.v_proj.weight
+        self.assertIn("Int4", type(v_proj_weight).__name__)
+
+        model_inputs = tokenizer(self.input_text, return_tensors="pt").to(self.device)
+        generated = quantized_model.generate(**model_inputs, max_new_tokens=10)
+        decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
+
+        # fmt: off
+        EXPECTED_OUTPUT = Expectations(
+            {
+                ("cuda", None): "What are we having for dinner?\nRed, white, and green beans,",
+                ("xpu", None): "What are we having for dinner?\n\nJessica: (smiling)",
+                ("xpu", 5): "What are we having for dinner?\n\n[Scene 2]\n\n[",
+            }
+        )
+        # fmt: on
+        self.assertEqual(decoded, EXPECTED_OUTPUT.get_expectation())
+
+    def test_int8_dynamic_activation_int8_weight_quant(self):
+        """
+        Simple LLM model testing int8_dynamic_activation_int8_weight
+        """
+        quant_config = TorchAoConfig(Int8DynamicActivationInt8WeightConfig())
+
+        # `dtype` is the current keyword; `torch_dtype` is its deprecated spelling
+        quantized_model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            dtype=torch.bfloat16,
+            device_map=self.device,
+            quantization_config=quant_config,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
+        output = quantized_model.generate(**input_ids, max_new_tokens=10)
+
+        EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
+        self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT)
+
+    def test_include_input_output_embeddings(self):
+        """
+        Quantize only `model.embed_tokens` and `lm_head` (the `_default` entry is None) and check that
+        both weights are no longer plain `Parameter` objects and that generation still matches.
+        """
+        embedding_config = IntxWeightOnlyConfig(
+            weight_dtype=torch.int8,
+            granularity=PerAxis(0),
+            mapping_type=MappingType.ASYMMETRIC,
+        )
+        config = FqnToConfig({"_default": None, "model.embed_tokens": embedding_config, "lm_head": embedding_config})
+        # need set `include_input_output_embeddings` to True
+        quant_config = TorchAoConfig(quant_type=config, include_input_output_embeddings=True)
+        # `dtype` is the current keyword; `torch_dtype` is its deprecated spelling
+        quantized_model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            dtype=torch.bfloat16,
+            device_map=self.device,
+            quantization_config=quant_config,
+        )
+        # making sure embedding is quantized
+        self.assertNotEqual(type(quantized_model.model.embed_tokens.weight).__name__, "Parameter")
+        self.assertNotEqual(type(quantized_model.lm_head.weight).__name__, "Parameter")
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
+
+        output = quantized_model.generate(**input_ids, max_new_tokens=10)
+        EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
+        self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT)
+
+    def test_per_module_config_skip(self):
+        """
+        Int8 weight-only as `_default`, with `model.layers.0.self_attn.q_proj` mapped to None:
+        that module's weight must stay unquantized and generation must still match.
+        """
+        config = FqnToConfig({"_default": Int8WeightOnlyConfig(), "model.layers.0.self_attn.q_proj": None})
+        quant_config = TorchAoConfig(quant_type=config)
+        # `dtype` is the current keyword; `torch_dtype` is its deprecated spelling
+        quantized_model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            dtype=torch.bfloat16,
+            device_map=self.device,
+            quantization_config=quant_config,
+        )
+        # making sure `model.layers.0.self_attn.q_proj` is skipped
+        self.assertNotIsInstance(quantized_model.model.layers[0].self_attn.q_proj.weight, AffineQuantizedTensor)
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
+
+        output = quantized_model.generate(**input_ids, max_new_tokens=10)
+        EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
+        self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT)
+
+    def test_fqn_to_config_regex_basic(self):
+        """
+        Int8 weight-only as `_default`, with a `re:` key mapping every `self_attn.q_proj` module to None:
+        the first layer's `q_proj` weight must stay unquantized and generation must still match.
+        """
+        config = FqnToConfig(
+            {"_default": Int8WeightOnlyConfig(), r"re:model\.layers\..+\.self_attn\.q_proj": None}
+        )
+        quant_config = TorchAoConfig(quant_type=config)
+        # `dtype` is the current keyword; `torch_dtype` is its deprecated spelling
+        quantized_model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            dtype=torch.bfloat16,
+            device_map=self.device,
+            quantization_config=quant_config,
+        )
+        # making sure `model.layers.0.self_attn.q_proj` is skipped
+        self.assertNotIsInstance(quantized_model.model.layers[0].self_attn.q_proj.weight, AffineQuantizedTensor)
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
+
+        output = quantized_model.generate(**input_ids, max_new_tokens=10)
+        EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
+        self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT)
+
+    def test_fqn_to_config_regex_fullmatch(self):
+        """Testing that we will only match the fqns that fully
+        matches the regex
+        """
+        linear1_config = Int8WeightOnlyConfig()
+        linear2_config = Float8WeightOnlyConfig()
+        # intentionally dropping the trailing `j` of `q_proj` so the regex is only a prefix match, not a
+        # full match. The layer index must be matched with `\..+\.`: with `\.+\.` (one or more literal
+        # dots) the pattern could not match any layer name at all and the test would pass vacuously.
+        config = FqnToConfig(
+            {
+                r"re:model\.layers\..+\.self_attn\.q_pro": linear1_config,
+                "model.layers.3.self_attn.q_proj": linear2_config,
+            }
+        )
+        quant_config = TorchAoConfig(quant_type=config)
+        # `dtype` is the current keyword; `torch_dtype` is its deprecated spelling
+        quantized_model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            dtype=torch.bfloat16,
+            device_map=self.device,
+            quantization_config=quant_config,
+        )
+        # highest precedence is fully specified module fqn
+        self.assertIsInstance(quantized_model.model.layers[3].self_attn.q_proj.weight, Float8Tensor)
+        # because regex `model\.layers\..+\.self_attn\.q_pro` didn't fully match `model.layers.1.self_attn.q_proj`
+        # (missing last `j`), this layer is not expected to be quantized to int8
+        self.assertNotIsInstance(quantized_model.model.layers[1].self_attn.q_proj.weight, AffineQuantizedTensor)
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
+
+        output = quantized_model.generate(**input_ids, max_new_tokens=10)
+        EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
+        self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT)
+
+    def test_fqn_to_config_module_regex_precedence(self):
+        """
+        Precedence between an exact module key, a `re:` module key and `_default`,
+        checked through the tensor type of the resulting weights.
+        """
+        linear1_config = Int8WeightOnlyConfig()
+        linear2_config = Float8WeightOnlyConfig()
+        config = FqnToConfig(
+            {
+                r"re:model\.layers\..+\.self_attn\.q_proj": None,
+                "model.layers.3.self_attn.q_proj": linear2_config,
+                "_default": linear1_config,
+            }
+        )
+        quant_config = TorchAoConfig(quant_type=config)
+        # `dtype` is the current keyword; `torch_dtype` is its deprecated spelling
+        quantized_model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            dtype=torch.bfloat16,
+            device_map=self.device,
+            quantization_config=quant_config,
+        )
+        layers = quantized_model.model.layers
+        # highest precedence is fully specified module fqn
+        self.assertIsInstance(layers[3].self_attn.q_proj.weight, Float8Tensor)
+        # second precedence: regex
+        self.assertNotIsInstance(layers[1].self_attn.q_proj.weight, AffineQuantizedTensor)
+        # last precedence: _default
+        self.assertIsInstance(layers[1].self_attn.k_proj.weight, AffineQuantizedTensor)
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
+
+        output = quantized_model.generate(**input_ids, max_new_tokens=10)
+        EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
+        self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT)
+
+    def test_fqn_to_config_regex_precedence(self):
+        """
+        Precedence check with parameter-level keys (`...q_proj.weight`): an exact key, a `re:` key
+        and `_default` are each expected to decide the tensor type of a different weight.
+        """
+        linear1_config = Int8WeightOnlyConfig()
+        linear2_config = Float8WeightOnlyConfig()
+        config = FqnToConfig(
+            {
+                r"re:model\.layers\..+\.self_attn\.q_proj.weight": None,
+                "model.layers.3.self_attn.q_proj.weight": linear2_config,
+                "_default": linear1_config,
+            }
+        )
+        quant_config = TorchAoConfig(quant_type=config)
+        # `dtype` is the current keyword; `torch_dtype` is its deprecated spelling
+        quantized_model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            dtype=torch.bfloat16,
+            device_map=self.device,
+            quantization_config=quant_config,
+        )
+        layers = quantized_model.model.layers
+        # exact parameter key -> Float8
+        self.assertIsInstance(layers[3].self_attn.q_proj.weight, Float8Tensor)
+        # regex parameter key mapped to None -> left unquantized
+        self.assertNotIsInstance(layers[1].self_attn.q_proj.weight, AffineQuantizedTensor)
+        # everything else falls back to `_default`
+        self.assertIsInstance(layers[1].self_attn.k_proj.weight, AffineQuantizedTensor)
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
+
+        output = quantized_model.generate(**input_ids, max_new_tokens=10)
+        EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
+        self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT)
+
+    def test_fqn_to_config_param_over_module_regex_precedence(self):
+        """
+        A `re:` key on the parameter (`...q_proj.weight` -> None) competes with a `re:` key on the
+        module (`...q_proj` -> Float8). The parameter key is expected to win, so the weight must be
+        left unquantized.
+        """
+        linear1_config = Int8WeightOnlyConfig()
+        linear2_config = Float8WeightOnlyConfig()
+        config = FqnToConfig(
+            {
+                r"re:model\.layers\..+\.self_attn\.q_proj.weight": None,
+                r"re:model\.layers\..+\.self_attn\.q_proj": linear2_config,
+                "_default": linear1_config,
+            }
+        )
+        quant_config = TorchAoConfig(quant_type=config)
+        # `dtype` is the current keyword; `torch_dtype` is its deprecated spelling
+        quantized_model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            dtype=torch.bfloat16,
+            device_map=self.device,
+            quantization_config=quant_config,
+        )
+        layers = quantized_model.model.layers
+        # Float8Tensor is excluded too: the competing module-level config is Float8, so checking only
+        # AffineQuantizedTensor would also pass if the module key had (wrongly) taken precedence.
+        self.assertNotIsInstance(layers[1].self_attn.q_proj.weight, (AffineQuantizedTensor, Float8Tensor))
+        self.assertIsInstance(layers[1].self_attn.k_proj.weight, AffineQuantizedTensor)
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
+
+        output = quantized_model.generate(**input_ids, max_new_tokens=10)
+        EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
+        self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT)
+
+    def test_fqn_to_config_param_over_module_precedence(self):
+        """
+        An exact parameter key (`...q_proj.weight` -> None) competes with an exact module key
+        (`...q_proj` -> Float8). The parameter key is expected to win, so the weight must be left
+        unquantized.
+        """
+        linear1_config = Int8WeightOnlyConfig()
+        linear2_config = Float8WeightOnlyConfig()
+        config = FqnToConfig(
+            {
+                "model.layers.3.self_attn.q_proj.weight": None,
+                "model.layers.3.self_attn.q_proj": linear2_config,
+                "_default": linear1_config,
+            }
+        )
+        quant_config = TorchAoConfig(quant_type=config)
+        # `dtype` is the current keyword; `torch_dtype` is its deprecated spelling
+        quantized_model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            dtype=torch.bfloat16,
+            device_map=self.device,
+            quantization_config=quant_config,
+        )
+        attention = quantized_model.model.layers[3].self_attn
+        # Float8Tensor is excluded too: the competing module-level config is Float8, so checking only
+        # AffineQuantizedTensor would also pass if the module key had (wrongly) taken precedence.
+        self.assertNotIsInstance(attention.q_proj.weight, (AffineQuantizedTensor, Float8Tensor))
+        self.assertIsInstance(attention.k_proj.weight, AffineQuantizedTensor)
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
+
+        output = quantized_model.generate(**input_ids, max_new_tokens=10)
+        EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
+        self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT)
+
+    def test_fqn_to_config_exact_over_regex_precedence(self):
+        """
+        Exact keys (parameter-level for layer 3, module-level for layer 1) compete with a `re:`
+        parameter key that maps every `q_proj.weight` to Float8. The exact keys are expected to win;
+        layer 2, which has no exact key, is expected to get the regex config.
+        """
+        linear1_config = Int8WeightOnlyConfig()
+        linear2_config = Float8WeightOnlyConfig()
+        config = FqnToConfig(
+            {
+                "model.layers.3.self_attn.q_proj.weight": None,
+                "model.layers.1.self_attn.q_proj": linear1_config,
+                r"re:model\.layers\..+\.self_attn\.q_proj.weight": linear2_config,
+            }
+        )
+        quant_config = TorchAoConfig(quant_type=config)
+        # `dtype` is the current keyword; `torch_dtype` is its deprecated spelling
+        quantized_model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            dtype=torch.bfloat16,
+            device_map=self.device,
+            quantization_config=quant_config,
+        )
+        layers = quantized_model.model.layers
+        # Float8Tensor is excluded too: the competing regex config is Float8, so checking only
+        # AffineQuantizedTensor would also pass if the regex had (wrongly) taken precedence.
+        self.assertNotIsInstance(layers[3].self_attn.q_proj.weight, (AffineQuantizedTensor, Float8Tensor))
+        self.assertIsInstance(layers[1].self_attn.q_proj.weight, AffineQuantizedTensor)
+        self.assertIsInstance(layers[2].self_attn.q_proj.weight, Float8Tensor)
+
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
+
+        output = quantized_model.generate(**input_ids, max_new_tokens=10)
+        EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
+        self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT)
+
+    @require_cuda_capability_at_least(8, 9)
+    def test_fqn_to_config_non_weight_param(self):
+        """
+        FQN matching on a parameter that is not named `weight` (`experts.gate_up_proj`): a `re:` key
+        assigns Float8 to every `gate_up_proj`, an exact key maps layer 0's to None, and `_default`
+        is Int8 weight-only. Only tensor types are checked; no text is generated.
+        """
+        linear1_config = Int8WeightOnlyConfig()
+        linear2_config = Float8WeightOnlyConfig()
+        config = FqnToConfig(
+            {
+                r"re:.*gate_up_proj": linear2_config,
+                "model.layers.0.feed_forward.experts.gate_up_proj": None,
+                "_default": linear1_config,
+            }
+        )
+        quant_config = TorchAoConfig(quant_type=config)
+        quantized_model = AutoModelForCausalLM.from_pretrained(
+            "jcaip/Llama-4-Scout-17B-two-layers-only-testing",
+            device_map=self.device,
+            dtype=torch.bfloat16,
+            quantization_config=quant_config,
+        )
+        layers = quantized_model.model.layers
+
+        # assertIsInstance / assertNotIsInstance report the actual type when they fail
+        self.assertIsInstance(layers[1].feed_forward.experts.gate_up_proj, Float8Tensor)
+        self.assertNotIsInstance(layers[0].feed_forward.experts.gate_up_proj, Float8Tensor)
+        self.assertIsInstance(layers[1].self_attn.q_proj.weight, AffineQuantizedTensor)
+
+    def test_compute_module_sizes(self):
+        r"""
+        Test if we compute the right module sizes needed to generate the device map.
+        Also test if we get the right values for `total_byte_count` in `caching_allocator_warmup`.
+        """
+        from transformers import AutoConfig
+        from transformers.integrations.accelerate import compute_module_sizes
+        from transformers.modeling_utils import expand_device_map, get_total_byte_count
+        from transformers.quantizers import AutoHfQuantizer
+
+        def single_device_byte_count(model, *quantizer_args):
+            # Map every parameter and buffer to `torch_device` and return the byte count of the
+            # first (only) entry reported by `get_total_byte_count`.
+            expected_keys = [name for name, _ in model.named_parameters()] + [
+                name for name, _ in model.named_buffers()
+            ]
+            expanded_device_map = expand_device_map({"": torch_device}, expected_keys)
+            return next(iter(get_total_byte_count(model, expanded_device_map, *quantizer_args).values()))
+
+        # we need to preprocess the model like that because device_map calculation happens before we load the weights inside the model.
+        # For normal weights, it's fine but for quantized weights, the tensors dtype might change during loading.
+        with torch.device("meta"):
+            config = AutoConfig.from_pretrained(self.model_name)
+            model = AutoModelForCausalLM.from_config(config, dtype=torch.bfloat16)
+            model_size, _ = compute_module_sizes(model, only_modules=False)
+            total_byte_count = single_device_byte_count(model)
+
+            # testing prequantized = False should be enough, the shape should be the same whether it is pre-quantized or not
+            hf_quantizer = AutoHfQuantizer.from_config(
+                TorchAoConfig(quant_type=Int4WeightOnlyConfig()), pre_quantized=False
+            )
+            hf_quantizer.preprocess_model(model=model, config=model.config)
+            quantized_model_size, _ = compute_module_sizes(model, hf_quantizer, only_modules=False)
+            quantized_total_byte_count = single_device_byte_count(model, hf_quantizer)
+
+        for name, module in model.named_modules():
+            # modules are not replaced when using torchao
+            if isinstance(module, torch.nn.Linear) and "lm_head" not in name:
+                # from 16 bits to 4 bits
+                self.assertEqual(
+                    int(model_size[f"{name}.weight"] // 4),
+                    int(quantized_model_size[f"{name}.weight"]),
+                    msg=f"unexpected quantized size for {name}.weight",
+                )
+
+        # check that we get the same value, as we use `compute_module_sizes` in `get_total_byte_count`
+        self.assertEqual(total_byte_count, model_size[""])
+        self.assertEqual(quantized_total_byte_count, quantized_model_size[""])
+
+        # we should have more than 2 times memory reduction in total
+        self.assertGreater(model_size[""], quantized_model_size[""] * 2)
+
+
+class TorchAoCPUTest(TorchAoTestBase, unittest.TestCase):
+    """Runs the tests inherited from `TorchAoTestBase` with `device` set to "cpu"."""
+
+    device = "cpu"
+
+    @unittest.skip("Int4 does not support CPU")
+    def test_int4wo_quant(self):
+        """Overrides the inherited Int4 test so that it is reported as skipped."""
+
+
+@require_torch_accelerator
+class TorchAoAcceleratorTest(TorchAoTestBase, unittest.TestCase):
+    device = torch_device
+
+    def test_int4wo_offload(self):
+        """
+        Test Int4 weight-only quantization with CPU offload.
+        """
+        # Decoder layers 0-18 and all non-layer modules go to accelerator 0; layers 19-21 go to the CPU.
+        device_map_offload = {
+            "model.embed_tokens": 0,
+            **{f"model.layers.{layer_idx}": 0 for layer_idx in range(19)},
+            **{f"model.layers.{layer_idx}": "cpu" for layer_idx in range(19, 22)},
+            "model.norm": 0,
+            "model.rotary_emb": 0,
+            "lm_head": 0,
+        }
+
+        int4_packing_format = "plain_int32" if self.device == "xpu" else "tile_packed_to_4d"
+        quant_config = TorchAoConfig(Int4WeightOnlyConfig(int4_packing_format=int4_packing_format))
+
+        # `dtype` is the current keyword; `torch_dtype` is its deprecated spelling
+        quantized_model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            dtype=torch.bfloat16,
+            device_map=device_map_offload,
+            quantization_config=quant_config,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
+
+        output = quantized_model.generate(**input_ids, max_new_tokens=10)
+        # fmt: off
+        EXPECTED_OUTPUT = Expectations(
+            {
+                ("cuda", None): "What are we having for dinner?\nRed, white, and green beans,",
+                ("xpu", None): "What are we having for dinner?\n\nJessica: (smiling)",
+                ("xpu", 5): "What are we having for dinner?\n\n[Scene 2]\n\n[",
+            }
+        )
+        # fmt: on
+        self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT.get_expectation())
+
+    @require_torch_multi_accelerator
+    def test_int4wo_quant_multi_accelerator(self):
+        """
+        Simple test that checks if the quantized model int4 weight only is working properly with multiple accelerators
+        set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 CUDA GPUs
+        set ZE_AFFINITY_MASK=0,1 if you have more than 2 Intel XPUs
+        """
+
+        int4_packing_format = "plain_int32" if self.device == "xpu" else "tile_packed_to_4d"
+        quant_config = TorchAoConfig(Int4WeightOnlyConfig(int4_packing_format=int4_packing_format))
+        # `dtype` is the current keyword; `torch_dtype` is its deprecated spelling
+        quantized_model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            dtype=torch.bfloat16,
+            device_map="auto",
+            quantization_config=quant_config,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+        # the model must be spread over exactly devices 0 and 1; assertEqual shows both sets on failure
+        self.assertEqual(set(quantized_model.hf_device_map.values()), {0, 1})
+
+        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
+
+        output = quantized_model.generate(**input_ids, max_new_tokens=10)
+        EXPECTED_OUTPUT = Expectations(
+            {
+                ("cuda", None): "What are we having for dinner?\nRed, white, and green beans,",
+                ("xpu", None): "What are we having for dinner?\n\nJessica: (smiling)",
+                ("xpu", 5): "What are we having for dinner?\n\n[Scene 2]\n\n[",
+            }
+        )
+        self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT.get_expectation())
+
+
+@slow
+@require_torchao
+class TorchAoSerializationTest(unittest.TestCase):
+    """Parameterized serialization tests: quantize, save, reload, check output."""
+
+    input_text = "What are we having for dinner?"
+    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+
+    # fmt: off
+    COMMON_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
+    ALL_DEVICES_COMMON = Expectations({("cpu", None): COMMON_OUTPUT, ("cuda", None): COMMON_OUTPUT, ("xpu", None): COMMON_OUTPUT})
+
+    test_params = (
+        [
+            ("Int8WeightOnlyConfig", Int8WeightOnlyConfig(version=2), ALL_DEVICES_COMMON),
+            ("Int8DynamicActivationInt8WeightConfig", Int8DynamicActivationInt8WeightConfig(version=2), ALL_DEVICES_COMMON),
+            ("Float8DynamicActivationFloat8WeightConfig", Float8DynamicActivationFloat8WeightConfig(), Expectations({("cuda", None): COMMON_OUTPUT, ("xpu", None): "What are we having for dinner?\n\nJess: (smiling) I", ("xpu", 5): COMMON_OUTPUT})),
+            ("Float8WeightOnlyConfig", Float8WeightOnlyConfig(), Expectations({("cuda", None): COMMON_OUTPUT, ("xpu", None): COMMON_OUTPUT})),
+            ("Int4WeightOnlyConfig", Int4WeightOnlyConfig(int4_packing_format="plain_int32" if torch_device == "xpu" else "tile_packed_to_4d"), Expectations({("cuda", None): "What are we having for dinner?\nRed, white, and green beans,", ("xpu", None): COMMON_OUTPUT, ("xpu", 5): "What are we having for dinner?\n\n[Scene 2]\n\n["})),
+            ("Int8DynamicActivationIntxWeightConfig", Int8DynamicActivationIntxWeightConfig(), Expectations({("cpu", None): COMMON_OUTPUT, ("cuda", 9): COMMON_OUTPUT, ("cuda", 8): "What are we having for dinner?\n\nJEN: (smiling) I", ("xpu", None): COMMON_OUTPUT})),
+            ("IntxWeightOnlyConfig", IntxWeightOnlyConfig(), ALL_DEVICES_COMMON),
+            ("NVFP4DynamicActivationNVFP4WeightConfig", NVFP4DynamicActivationNVFP4WeightConfig(), Expectations({("cuda", None): "What are we having for dinner?\n\n10. Avoid using \"I"})),
+        ]
+        if is_torchao_available()
+        else []
+    )
+    # fmt: on
+
+    def tearDown(self):
+        """Clean up after each test: collect garbage, call `backend_empty_cache(torch_device)`, collect again."""
+        gc.collect()
+        backend_empty_cache(torch_device)
+        gc.collect()
+
+    def _check_serialization(self, device, config, expected_output):
+        """
+        Quantize `self.model_name` with `config` on `device`, compare the generated text with
+        `expected_output`, then save the model, reload it from disk and compare again.
+
+        The test is skipped when CUDA is available but its capability is below (8, 9) for the
+        Float8 configs, or below (10, 0) for the NVFP4 config.
+        """
+        float8_configs = (Float8DynamicActivationFloat8WeightConfig, Float8WeightOnlyConfig)
+        if isinstance(config, float8_configs) and (
+            torch.cuda.is_available() and torch.cuda.get_device_capability() < (8, 9)
+        ):
+            self.skipTest(f"{type(config).__name__} requires CUDA capability >= (8, 9)")
+        if isinstance(config, NVFP4DynamicActivationNVFP4WeightConfig) and (
+            torch.cuda.is_available() and torch.cuda.get_device_capability() < (10, 0)
+        ):
+            self.skipTest(f"{type(config).__name__} requires CUDA capability >= (10, 0) (SM100)")
+
+        # Int4 and NVFP4 configs are loaded in bfloat16; every other config uses dtype="auto"
+        if isinstance(config, (Int4WeightOnlyConfig, NVFP4DynamicActivationNVFP4WeightConfig)):
+            dtype = torch.bfloat16
+        else:
+            dtype = "auto"
+
+        quantized_model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            dtype=dtype,
+            device_map=device,
+            quantization_config=TorchAoConfig(config),
+        )
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+        def generated_text(model):
+            # Tokenize the prompt, generate 10 new tokens and decode the first sequence.
+            model_inputs = tokenizer(self.input_text, return_tensors="pt").to(device)
+            generated = model.generate(**model_inputs, max_new_tokens=10)
+            return tokenizer.decode(generated[0], skip_special_tokens=True)
+
+        self.assertEqual(generated_text(quantized_model), expected_output)
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            quantized_model.save_pretrained(tmpdirname)
+            # the reload passes no quantization_config: only what was saved in tmpdirname is used
+            loaded_model = AutoModelForCausalLM.from_pretrained(tmpdirname, dtype=dtype, device_map=device)
+            self.assertEqual(generated_text(loaded_model), expected_output)
+
+    @parameterized.expand(test_params, skip_on_empty=True)
+    def test_serialization_cpu(self, _name, config, expected_outputs):
+        """Serialization round trip on CPU; skipped when no CPU expectation is registered for `config`."""
+        try:
+            cpu_expected = expected_outputs.find_expectation(("cpu", None, None))
+        except ValueError:
+            # a ValueError from the lookup is treated as "no CPU expectation for this config"
+            self.skipTest(f"{type(config).__name__} does not support CPU")
+        else:
+            self._check_serialization("cpu", config, cpu_expected)
+
+    @parameterized.expand(test_params, skip_on_empty=True)
+    @require_torch_accelerator
+    def test_serialization_accelerator(self, _name, config, expected_outputs):
+        """Serialization round trip on `torch_device`; skipped when no expectation is registered for it."""
+        try:
+            device_expected = expected_outputs.get_expectation()
+        except ValueError:
+            # a ValueError from the lookup is treated as "no expectation for the current device"
+            self.skipTest(f"{type(config).__name__} does not support {torch_device}")
+        else:
+            self._check_serialization(torch_device, config, device_expected)
+
+
+if __name__ == "__main__":
+    unittest.main()