# transformers/tests/quantization/torchao_integration/test_torchao.py

# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import tempfile
import unittest

from parameterized import parameterized

from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
from transformers.testing_utils import (
    Expectations,
    backend_empty_cache,
    require_cuda_capability_at_least,
    require_torch_accelerator,
    require_torch_multi_accelerator,
    require_torchao,
    slow,
    torch_device,
)
from transformers.utils import is_torch_available, is_torchao_available


if is_torch_available():
    import torch

if is_torchao_available():
    from torchao.dtypes import (
        AffineQuantizedTensor,
    )
    from torchao.prototype.mx_formats import NVFP4DynamicActivationNVFP4WeightConfig
    from torchao.quantization import (
        Float8DynamicActivationFloat8WeightConfig,
        Float8Tensor,
        Float8WeightOnlyConfig,
        FqnToConfig,
        Int4WeightOnlyConfig,
        Int8DynamicActivationInt8WeightConfig,
        Int8DynamicActivationIntxWeightConfig,
        Int8WeightOnlyConfig,
        IntxWeightOnlyConfig,
        MappingType,
        PerAxis,
    )


@require_torchao
class TorchAoConfigTest(unittest.TestCase):
    """Checks on `TorchAoConfig` itself (dict format, repr, JSON export); no model is loaded here."""

    def test_to_dict(self):
        """
        Makes sure the config format is properly set
        """
        config_dict = TorchAoConfig(Int4WeightOnlyConfig(group_size=32)).to_dict()

        for key in ("quant_type", "quant_method"):
            self.assertIn(key, config_dict)
        self.assertEqual(config_dict["quant_method"], "torchao")

    def test_repr(self):
        """
        Check that there is no error in the repr
        """
        quantization_config = TorchAoConfig(Int4WeightOnlyConfig(group_size=8), modules_to_not_convert=["conv"])
        # Only the absence of an exception is checked; the returned string is not inspected.
        repr(quantization_config)

    def test_json_serializable(self):
        """
        Check that the config dict can be JSON serialized.
        """
        quantization_config = TorchAoConfig(Int4WeightOnlyConfig(group_size=32))
        serialized_quant_type = quantization_config.to_dict()["quant_type"]
        # `assertIn` prints the container on failure, which `assertTrue(x in y)` does not.
        self.assertIn("group_size", serialized_quant_type["default"]["_data"])
        # The call is expected to complete without raising.
        quantization_config.to_json_string(use_diff=False)


@require_torchao
@slow
class TorchAoTestBase:
    """Base mixin with all torchao test methods. Not a TestCase — subclass with unittest.TestCase to run.

    Each test quantizes a model with a torchao config while loading it on `self.device`. Most tests then
    check the type of selected weights and compare a 10-token generation against a hard-coded string.
    """

    input_text = "What are we having for dinner?"
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    device = None  # must be set by subclass
    # Generation expected by every generating test of this class except `test_int4wo_quant`.
    COMMON_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"

    def tearDown(self):
        # Collect garbage around emptying the `torch_device` cache so the previous test's model is released.
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def _load_quantized_model(self, quant_type, model_name=None, **config_kwargs):
        """Load `model_name` (default: `self.model_name`) in bfloat16 on `self.device`, quantized with `quant_type`.

        Extra keyword arguments are forwarded to `TorchAoConfig`.
        """
        quant_config = TorchAoConfig(quant_type=quant_type, **config_kwargs)
        return AutoModelForCausalLM.from_pretrained(
            self.model_name if model_name is None else model_name,
            dtype=torch.bfloat16,
            device_map=self.device,
            quantization_config=quant_config,
        )

    def _check_generation(self, quantized_model, expected_output):
        """Generate 10 new tokens from `self.input_text` and compare the decoded text with `expected_output`."""
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
        output = quantized_model.generate(**input_ids, max_new_tokens=10)
        self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), expected_output)

    def test_int4wo_quant(self):
        """
        Simple LLM model testing int4 weight only quantization
        """
        int4_packing_format = "plain_int32" if self.device == "xpu" else "tile_packed_to_4d"
        config = Int4WeightOnlyConfig(int4_packing_format=int4_packing_format)
        quantized_model = self._load_quantized_model(config)

        self.assertIn("Int4", type(quantized_model.model.layers[0].self_attn.v_proj.weight).__name__)

        # fmt: off
        EXPECTED_OUTPUT = Expectations(
            {
                ("cuda", None): "What are we having for dinner?\nRed, white, and green beans,",
                ("xpu", None): "What are we having for dinner?\n\nJessica: (smiling)",
                ("xpu", 5): "What are we having for dinner?\n\n[Scene 2]\n\n[",
            }
        )
        # fmt: on
        self._check_generation(quantized_model, EXPECTED_OUTPUT.get_expectation())

    def test_int8_dynamic_activation_int8_weight_quant(self):
        """
        Simple LLM model testing int8_dynamic_activation_int8_weight
        """
        quantized_model = self._load_quantized_model(Int8DynamicActivationInt8WeightConfig())
        self._check_generation(quantized_model, self.COMMON_OUTPUT)

    def test_include_input_output_embeddings(self):
        """
        Quantizes only `model.embed_tokens` and `lm_head` and checks that both weights were replaced
        """
        embedding_config = IntxWeightOnlyConfig(
            weight_dtype=torch.int8,
            granularity=PerAxis(0),
            mapping_type=MappingType.ASYMMETRIC,
        )
        config = FqnToConfig({"_default": None, "model.embed_tokens": embedding_config, "lm_head": embedding_config})
        # need set `include_input_output_embeddings` to True
        quantized_model = self._load_quantized_model(config, include_input_output_embeddings=True)
        # making sure embedding is quantized
        self.assertNotEqual(type(quantized_model.model.embed_tokens.weight).__name__, "Parameter")
        self.assertNotEqual(type(quantized_model.lm_head.weight).__name__, "Parameter")
        self._check_generation(quantized_model, self.COMMON_OUTPUT)

    def test_per_module_config_skip(self):
        """
        Maps one module fqn to `None` and checks that its weight is not an `AffineQuantizedTensor`
        """
        config = FqnToConfig({"_default": Int8WeightOnlyConfig(), "model.layers.0.self_attn.q_proj": None})
        quantized_model = self._load_quantized_model(config)
        # making sure `model.layers.0.self_attn.q_proj` is skipped
        self.assertNotIsInstance(quantized_model.model.layers[0].self_attn.q_proj.weight, AffineQuantizedTensor)
        self._check_generation(quantized_model, self.COMMON_OUTPUT)

    def test_fqn_to_config_regex_basic(self):
        """
        Same as `test_per_module_config_skip`, but the skipped module is selected with a `re:` regex key
        """
        config = FqnToConfig(
            {"_default": Int8WeightOnlyConfig(), r"re:model\.layers\..+\.self_attn\.q_proj": None}
        )
        quantized_model = self._load_quantized_model(config)
        # making sure `model.layers.0.self_attn.q_proj` is skipped
        self.assertNotIsInstance(quantized_model.model.layers[0].self_attn.q_proj.weight, AffineQuantizedTensor)
        self._check_generation(quantized_model, self.COMMON_OUTPUT)

    def test_fqn_to_config_regex_fullmatch(self):
        """Testing that we will only match the fqns that fully
        matches the regex
        """
        linear1_config = Int8WeightOnlyConfig()
        linear2_config = Float8WeightOnlyConfig()
        # intentionally removing `j` after `q_proj` so it's not a full match.
        # The layer index is matched with `\..+` (a dot, then the index) like in the other regex tests, so
        # the missing `j` is the only reason the pattern fails to match.
        config = FqnToConfig(
            {
                r"re:model\.layers\..+\.self_attn\.q_pro": linear1_config,
                "model.layers.3.self_attn.q_proj": linear2_config,
            }
        )
        quantized_model = self._load_quantized_model(config)
        # highest precedence is fully specified module fqn
        self.assertIsInstance(quantized_model.model.layers[3].self_attn.q_proj.weight, Float8Tensor)
        # because regex `model\.layers\..+\.self_attn\.q_pro` didn't fully match `model.layers.1.self_attn.q_proj`
        # (missing last `j`), this layer is not expected to be quantized to int8
        self.assertNotIsInstance(quantized_model.model.layers[1].self_attn.q_proj.weight, AffineQuantizedTensor)
        self._check_generation(quantized_model, self.COMMON_OUTPUT)

    def test_fqn_to_config_module_regex_precedence(self):
        """
        Combines an exact module fqn, a module regex and `_default`, and checks which one each layer gets
        """
        linear1_config = Int8WeightOnlyConfig()
        linear2_config = Float8WeightOnlyConfig()
        config = FqnToConfig(
            {
                r"re:model\.layers\..+\.self_attn\.q_proj": None,
                "model.layers.3.self_attn.q_proj": linear2_config,
                "_default": linear1_config,
            }
        )
        quantized_model = self._load_quantized_model(config)
        layers = quantized_model.model.layers
        # highest precedence is fully specified module fqn
        self.assertIsInstance(layers[3].self_attn.q_proj.weight, Float8Tensor)
        # second precedence: regex
        self.assertNotIsInstance(layers[1].self_attn.q_proj.weight, AffineQuantizedTensor)
        # last precedence: _default
        self.assertIsInstance(layers[1].self_attn.k_proj.weight, AffineQuantizedTensor)
        self._check_generation(quantized_model, self.COMMON_OUTPUT)

    def test_fqn_to_config_regex_precedence(self):
        """
        Same layout as `test_fqn_to_config_module_regex_precedence`, with keys that name the `weight` parameter
        """
        linear1_config = Int8WeightOnlyConfig()
        linear2_config = Float8WeightOnlyConfig()
        config = FqnToConfig(
            {
                r"re:model\.layers\..+\.self_attn\.q_proj.weight": None,
                "model.layers.3.self_attn.q_proj.weight": linear2_config,
                "_default": linear1_config,
            }
        )
        quantized_model = self._load_quantized_model(config)
        layers = quantized_model.model.layers
        self.assertIsInstance(layers[3].self_attn.q_proj.weight, Float8Tensor)
        self.assertNotIsInstance(layers[1].self_attn.q_proj.weight, AffineQuantizedTensor)
        self.assertIsInstance(layers[1].self_attn.k_proj.weight, AffineQuantizedTensor)
        self._check_generation(quantized_model, self.COMMON_OUTPUT)

    def test_fqn_to_config_param_over_module_regex_precedence(self):
        """
        Gives a parameter regex (`None`) and a module regex (float8) for the same layers and checks the weights
        """
        linear1_config = Int8WeightOnlyConfig()
        linear2_config = Float8WeightOnlyConfig()
        config = FqnToConfig(
            {
                r"re:model\.layers\..+\.self_attn\.q_proj.weight": None,
                r"re:model\.layers\..+\.self_attn\.q_proj": linear2_config,
                "_default": linear1_config,
            }
        )
        quantized_model = self._load_quantized_model(config)
        layers = quantized_model.model.layers
        self.assertNotIsInstance(layers[1].self_attn.q_proj.weight, AffineQuantizedTensor)
        self.assertIsInstance(layers[1].self_attn.k_proj.weight, AffineQuantizedTensor)
        self._check_generation(quantized_model, self.COMMON_OUTPUT)

    def test_fqn_to_config_param_over_module_precedence(self):
        """
        Gives an exact parameter fqn (`None`) and an exact module fqn (float8) for layer 3 and checks the weights
        """
        linear1_config = Int8WeightOnlyConfig()
        linear2_config = Float8WeightOnlyConfig()
        config = FqnToConfig(
            {
                "model.layers.3.self_attn.q_proj.weight": None,
                "model.layers.3.self_attn.q_proj": linear2_config,
                "_default": linear1_config,
            }
        )
        quantized_model = self._load_quantized_model(config)
        layers = quantized_model.model.layers
        self.assertNotIsInstance(layers[3].self_attn.q_proj.weight, AffineQuantizedTensor)
        self.assertIsInstance(layers[3].self_attn.k_proj.weight, AffineQuantizedTensor)
        self._check_generation(quantized_model, self.COMMON_OUTPUT)

    def test_fqn_to_config_exact_over_regex_precedence(self):
        """
        Mixes exact parameter/module fqns with a parameter regex (no `_default`) and checks layers 1, 2 and 3
        """
        linear1_config = Int8WeightOnlyConfig()
        linear2_config = Float8WeightOnlyConfig()
        config = FqnToConfig(
            {
                "model.layers.3.self_attn.q_proj.weight": None,
                "model.layers.1.self_attn.q_proj": linear1_config,
                r"re:model\.layers\..+\.self_attn\.q_proj.weight": linear2_config,
            }
        )
        quantized_model = self._load_quantized_model(config)
        layers = quantized_model.model.layers
        self.assertNotIsInstance(layers[3].self_attn.q_proj.weight, AffineQuantizedTensor)
        self.assertIsInstance(layers[1].self_attn.q_proj.weight, AffineQuantizedTensor)
        self.assertIsInstance(layers[2].self_attn.q_proj.weight, Float8Tensor)
        self._check_generation(quantized_model, self.COMMON_OUTPUT)

    @require_cuda_capability_at_least(8, 9)
    def test_fqn_to_config_non_weight_param(self):
        """
        Targets the `gate_up_proj` parameter of the experts modules, which is read directly (not via `.weight`)
        """
        linear1_config = Int8WeightOnlyConfig()
        linear2_config = Float8WeightOnlyConfig()
        config = FqnToConfig(
            {
                r"re:.*gate_up_proj": linear2_config,
                "model.layers.0.feed_forward.experts.gate_up_proj": None,
                "_default": linear1_config,
            }
        )
        quantized_model = self._load_quantized_model(
            config,
            model_name="jcaip/Llama-4-Scout-17B-two-layers-only-testing",
        )
        layers = quantized_model.model.layers

        self.assertIsInstance(layers[1].feed_forward.experts.gate_up_proj, Float8Tensor)
        self.assertNotIsInstance(layers[0].feed_forward.experts.gate_up_proj, Float8Tensor)
        self.assertIsInstance(layers[1].self_attn.q_proj.weight, AffineQuantizedTensor)

    def test_compute_module_sizes(self):
        r"""
        Test if we compute the right module sizes needed to generate the device map.
        Also test if we get the right values for `total_byte_count` in `caching_allocator_warmup`.
        """
        from transformers import AutoConfig
        from transformers.integrations.accelerate import compute_module_sizes
        from transformers.modeling_utils import expand_device_map, get_total_byte_count
        from transformers.quantizers import AutoHfQuantizer

        # we need to preprocess the model like that because device_map calculation happens before we load the weights inside the model.
        # For normal weights, it's fine but for quantized weights, the tensors dtype might change during loading.
        with torch.device("meta"):
            config = AutoConfig.from_pretrained(self.model_name)
            model = AutoModelForCausalLM.from_config(config, dtype=torch.bfloat16)
            model_size, _ = compute_module_sizes(model, only_modules=False)

            expected_keys = [name for name, _ in model.named_parameters()] + [
                name for name, _ in model.named_buffers()
            ]
            expanded_device_map = expand_device_map({"": torch_device}, expected_keys)
            total_byte_count = list(get_total_byte_count(model, expanded_device_map).values())[0]

            # testing prequantized = False should be enough, the shape should be the same whether it is pre-quantized or not
            hf_quantizer = AutoHfQuantizer.from_config(
                TorchAoConfig(quant_type=Int4WeightOnlyConfig()), pre_quantized=False
            )
            hf_quantizer.preprocess_model(model=model, config=model.config)
            quantized_model_size, _ = compute_module_sizes(model, hf_quantizer, only_modules=False)

            expected_keys = [name for name, _ in model.named_parameters()] + [
                name for name, _ in model.named_buffers()
            ]
            expanded_device_map = expand_device_map({"": torch_device}, expected_keys)
            quantized_total_byte_count = list(get_total_byte_count(model, expanded_device_map, hf_quantizer).values())[
                0
            ]

        # `self.assert*` is used instead of bare `assert` so the checks still run under `python -O`.
        for name, module in model.named_modules():
            # modules are not replaced when using torchao
            if isinstance(module, torch.nn.Linear) and "lm_head" not in name:
                # from 16 bits to 4 bits
                self.assertEqual(
                    int(model_size[f"{name}.weight"] // 4), int(quantized_model_size[f"{name}.weight"])
                )

        # check that we get the same value, as we use `compute_module_sizes` in `get_total_byte_count`
        self.assertEqual(total_byte_count, model_size[""])
        self.assertEqual(quantized_total_byte_count, quantized_model_size[""])

        # we should at least have 1.5 times memory reduction in total
        self.assertGreater(model_size[""], quantized_model_size[""] * 1.5)


class TorchAoCPUTest(TorchAoTestBase, unittest.TestCase):
    """Runs the tests inherited from `TorchAoTestBase` with `device` set to "cpu"."""

    device = "cpu"

    # Overrides the inherited int4 weight-only test so that it is reported as skipped for this class.
    @unittest.skip("Int4 does not support CPU")
    def test_int4wo_quant(self):
        pass


@require_torch_accelerator
class TorchAoAcceleratorTest(TorchAoTestBase, unittest.TestCase):
    """Runs the inherited `TorchAoTestBase` tests on `torch_device`, plus int4 offload and multi-device tests."""

    device = torch_device

    def _int4wo_quant_config(self):
        """Return an int4 weight-only `TorchAoConfig`: "plain_int32" packing on xpu, "tile_packed_to_4d" otherwise."""
        int4_packing_format = "plain_int32" if self.device == "xpu" else "tile_packed_to_4d"
        return TorchAoConfig(Int4WeightOnlyConfig(int4_packing_format=int4_packing_format))

    def _check_int4wo_generation(self, quantized_model):
        """Generate 10 new tokens from `self.input_text` and compare them with the per-device int4 expectation."""
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
        output = quantized_model.generate(**input_ids, max_new_tokens=10)
        # fmt: off
        EXPECTED_OUTPUT = Expectations(
            {
                ("cuda", None): "What are we having for dinner?\nRed, white, and green beans,",
                ("xpu", None): "What are we having for dinner?\n\nJessica: (smiling)",
                ("xpu", 5): "What are we having for dinner?\n\n[Scene 2]\n\n[",
            }
        )
        # fmt: on
        self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT.get_expectation())

    def test_int4wo_offload(self):
        """
        Test Int4 weight-only quantization with CPU offload.
        """
        # Decoder layers 0-18 are placed on device 0 and layers 19-21 on the CPU; everything else is on device 0.
        device_map_offload = {"model.embed_tokens": 0}
        device_map_offload.update(
            {f"model.layers.{layer_idx}": (0 if layer_idx < 19 else "cpu") for layer_idx in range(22)}
        )
        device_map_offload.update({"model.norm": 0, "model.rotary_emb": 0, "lm_head": 0})

        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            dtype=torch.bfloat16,
            device_map=device_map_offload,
            quantization_config=self._int4wo_quant_config(),
        )
        self._check_int4wo_generation(quantized_model)

    @require_torch_multi_accelerator
    def test_int4wo_quant_multi_accelerator(self):
        """
        Simple test that checks if the quantized model int4 weight only is working properly with multiple accelerators
        set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 CUDA GPUs
        set ZE_AFFINITY_MASK=0,1 if you have more than 2 Intel XPUs
        """
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            dtype=torch.bfloat16,
            device_map="auto",
            quantization_config=self._int4wo_quant_config(),
        )

        # The automatic device map must use exactly devices 0 and 1.
        self.assertEqual(set(quantized_model.hf_device_map.values()), {0, 1})

        self._check_int4wo_generation(quantized_model)


@slow
@require_torchao
class TorchAoSerializationTest(unittest.TestCase):
    """Parameterized serialization tests: quantize, save, reload, check output."""

    input_text = "What are we having for dinner?"
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

    # Each `test_params` entry is (test name suffix, torchao config, per-device expected generations).
    # fmt: off
    COMMON_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
    ALL_DEVICES_COMMON = Expectations({("cpu", None): COMMON_OUTPUT, ("cuda", None): COMMON_OUTPUT, ("xpu", None): COMMON_OUTPUT})

    test_params = (
        [
            ("Int8WeightOnlyConfig", Int8WeightOnlyConfig(version=2), ALL_DEVICES_COMMON),
            ("Int8DynamicActivationInt8WeightConfig", Int8DynamicActivationInt8WeightConfig(version=2), ALL_DEVICES_COMMON),
            ("Float8DynamicActivationFloat8WeightConfig", Float8DynamicActivationFloat8WeightConfig(), Expectations({("cuda", None): COMMON_OUTPUT, ("xpu", None): "What are we having for dinner?\n\nJess: (smiling) I", ("xpu", 5): COMMON_OUTPUT})),
            ("Float8WeightOnlyConfig", Float8WeightOnlyConfig(), Expectations({("cuda", None): COMMON_OUTPUT, ("xpu", None): COMMON_OUTPUT})),
            ("Int4WeightOnlyConfig", Int4WeightOnlyConfig(int4_packing_format="plain_int32" if torch_device == "xpu" else "tile_packed_to_4d"), Expectations({("cuda", None): "What are we having for dinner?\nRed, white, and green beans,", ("xpu", None): COMMON_OUTPUT, ("xpu", 5): "What are we having for dinner?\n\n[Scene 2]\n\n["})),
            ("Int8DynamicActivationIntxWeightConfig", Int8DynamicActivationIntxWeightConfig(), Expectations({("cpu", None): COMMON_OUTPUT, ("cuda", 9): COMMON_OUTPUT, ("cuda", 8): "What are we having for dinner?\n\nJEN: (smiling) I", ("xpu", None): COMMON_OUTPUT})),
            ("IntxWeightOnlyConfig", IntxWeightOnlyConfig(), ALL_DEVICES_COMMON),
            ("NVFP4DynamicActivationNVFP4WeightConfig", NVFP4DynamicActivationNVFP4WeightConfig(), Expectations({("cuda", None): "What are we having for dinner?\n\n10. Avoid using \"I"})),
        ]
        if is_torchao_available()
        else []
    )
    # fmt: on

    def tearDown(self):
        # Collect garbage around emptying the `torch_device` cache so the previous test's models are released.
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def _check_serialization(self, device, config, expected_output):
        """Quantize on `device`, check the generation, then save, reload and check the generation again.

        The test is skipped when CUDA is available but its capability is below what `config` requires:
        (8, 9) for the float8 configs, (10, 0) for the NVFP4 config.
        """
        is_float8 = isinstance(config, (Float8DynamicActivationFloat8WeightConfig, Float8WeightOnlyConfig))
        if is_float8 and torch.cuda.is_available() and torch.cuda.get_device_capability() < (8, 9):
            self.skipTest(f"{type(config).__name__} requires CUDA capability >= (8, 9)")

        is_nvfp4 = isinstance(config, NVFP4DynamicActivationNVFP4WeightConfig)
        if is_nvfp4 and torch.cuda.is_available() and torch.cuda.get_device_capability() < (10, 0):
            self.skipTest(f"{type(config).__name__} requires CUDA capability >= (10, 0) (SM100)")

        # The int4 and NVFP4 configs are loaded in bfloat16; every other config uses `dtype="auto"`.
        if isinstance(config, (Int4WeightOnlyConfig, NVFP4DynamicActivationNVFP4WeightConfig)):
            dtype = torch.bfloat16
        else:
            dtype = "auto"

        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            dtype=dtype,
            device_map=device,
            quantization_config=TorchAoConfig(config),
        )
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        def generated_text(model):
            # Tokenize the prompt, generate 10 new tokens and return the decoded sequence.
            input_ids = tokenizer(self.input_text, return_tensors="pt").to(device)
            output = model.generate(**input_ids, max_new_tokens=10)
            return tokenizer.decode(output[0], skip_special_tokens=True)

        self.assertEqual(generated_text(quantized_model), expected_output)

        # Round trip through disk: the reloaded checkpoint must generate the same text.
        with tempfile.TemporaryDirectory() as tmpdirname:
            quantized_model.save_pretrained(tmpdirname)
            loaded_model = AutoModelForCausalLM.from_pretrained(tmpdirname, dtype=dtype, device_map=device)
            self.assertEqual(generated_text(loaded_model), expected_output)

    @parameterized.expand(test_params, skip_on_empty=True)
    def test_serialization_cpu(self, _name, config, expected_outputs):
        # A config without an expectation matching "cpu" is skipped.
        try:
            expected = expected_outputs.find_expectation(("cpu", None, None))
        except ValueError:
            self.skipTest(f"{type(config).__name__} does not support CPU")
        else:
            self._check_serialization("cpu", config, expected)

    @parameterized.expand(test_params, skip_on_empty=True)
    @require_torch_accelerator
    def test_serialization_accelerator(self, _name, config, expected_outputs):
        # A config without an expectation for the current device is skipped.
        try:
            expected = expected_outputs.get_expectation()
        except ValueError:
            self.skipTest(f"{type(config).__name__} does not support {torch_device}")
        else:
            self._check_serialization(torch_device, config, expected)


# Run the tests of this module through unittest when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()