first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/tests/quantization/compressed_tensors_integration/init.py
+++ b/tests/quantization/compressed_tensors_integration/init.py
--- a/tests/quantization/compressed_tensors_integration/test_compressed_models.py
+++ b/tests/quantization/compressed_tensors_integration/test_compressed_models.py
@@ -0,0 +1,176 @@
+import gc
+import unittest
+import warnings
+
+from transformers import AutoModelForCausalLM
+from transformers.testing_utils import backend_empty_cache, require_compressed_tensors, require_torch, torch_device
+from transformers.utils import is_torch_available
+from transformers.utils.quantization_config import CompressedTensorsConfig
+
+
+if is_torch_available():
+    import torch
+
+
+@require_compressed_tensors
+@require_torch
+class StackCompressedModelTest(unittest.TestCase):
+    # Define stubs as class attributes
+    compressed_uncompressed_model_stubs = [
+        (
+            "nm-testing/llama2.c-stories42M-gsm8k-quantized-only-compressed",
+            "nm-testing/llama2.c-stories42M-gsm8k-quantized-only-uncompressed",
+        ),
+    ]
+    # Flatten the list for tests that require a single list of stubs.
+    model_stubs = [stub for pair in compressed_uncompressed_model_stubs for stub in pair]
+
+    prompt = "Paris is the capital of which country?"
+
+    def tearDown(self):
+        gc.collect()
+        backend_empty_cache(torch_device)
+        gc.collect()
+
+    def test_compressed_uncompressed_model_shapes(self):
+        """
+        Verify that the weights of an uncompressed model and its decompressed compressed counterpart match.
+        Note: Weights for sparsely compressed models may differ due to packing.
+        """
+
+        def _has_nested_attr(obj, attr_path):
+            attrs = attr_path.split(".")
+            for attr in attrs:
+                if not hasattr(obj, attr):
+                    return None
+                obj = getattr(obj, attr)
+            return obj
+
+        for compressed_model, uncompressed_model in self.compressed_uncompressed_model_stubs:
+            with self.subTest(compressed_model=compressed_model, uncompressed_model=uncompressed_model):
+                uncompressed = AutoModelForCausalLM.from_pretrained(
+                    uncompressed_model,
+                    device_map="auto",
+                    dtype="auto",
+                    quantization_config=CompressedTensorsConfig(run_compressed=False),
+                )
+                compressed_decompressed = AutoModelForCausalLM.from_pretrained(
+                    compressed_model,
+                    device_map="auto",
+                    dtype="auto",
+                    quantization_config=CompressedTensorsConfig(run_compressed=False),
+                )
+
+                for name, submodule in uncompressed.named_modules():
+                    if list(submodule.children()):
+                        continue
+                    comp_decomp_obj = _has_nested_attr(compressed_decompressed, name)
+                    if comp_decomp_obj is not None and hasattr(submodule, "weight"):
+                        torch.testing.assert_close(
+                            submodule.weight.to(torch_device),
+                            comp_decomp_obj.weight.to(torch_device),
+                            atol=0.2,
+                            rtol=1e-5,
+                            msg=f"Weight mismatch for module '{name}'.",
+                        )
+
+    def test_no_warnings_for_all_models(self):
+        """
+        Confirm that loading any model using compressed tensors does not trigger
+        warnings about missing or unexpected keys.
+        """
+        for model_stub in self.model_stubs:
+            with self.subTest(model_stub=model_stub):
+                with warnings.catch_warnings(record=True) as caught_warnings:
+                    warnings.simplefilter("always")
+                    AutoModelForCausalLM.from_pretrained(
+                        model_stub,
+                        device_map="auto",
+                        dtype="auto",
+                        quantization_config=CompressedTensorsConfig(run_compressed=False),
+                    )
+                    for warning in caught_warnings:
+                        self.assertNotIn(
+                            "missing keys",
+                            str(warning.message).lower(),
+                            f"'missing keys' found in warnings for model {model_stub}",
+                        )
+                        self.assertNotIn(
+                            "unexpected keys",
+                            str(warning.message).lower(),
+                            f"'unexpected keys' found in warnings for model {model_stub}",
+                        )
+
+
+@require_compressed_tensors
+@require_torch
+class RunCompressedTest(unittest.TestCase):
+    tinyllama_w4a16 = "nm-testing/tinyllama-w4a16-compressed"
+    tinyllama_w8a8 = "nm-testing/tinyllama-w8a8-compressed"
+
+    prompt = "Paris is the capital of which country?"
+
+    stubs = [tinyllama_w4a16, tinyllama_w8a8]
+
+    def tearDown(self):
+        gc.collect()
+        backend_empty_cache(torch_device)
+        gc.collect()
+
+    def test_default_run_compressed__True(self):
+        from compressed_tensors import QuantizationStatus
+
+        for stub in self.stubs:
+            model = AutoModelForCausalLM.from_pretrained(
+                stub,
+            )
+            compressed_count = sum(
+                1 for m in model.modules() if getattr(m, "quantization_status", None) == QuantizationStatus.COMPRESSED
+            )
+
+            # some linear modules are not compressed - ex. lm_head
+            assert compressed_count > 0
+
+    def test_default_run_compressed__False(self):
+        from compressed_tensors import QuantizationStatus
+
+        from transformers.utils.quantization_config import CompressedTensorsConfig
+
+        quantization_config = CompressedTensorsConfig(run_compressed=False)
+
+        for stub in self.stubs:
+            model = AutoModelForCausalLM.from_pretrained(
+                stub,
+                quantization_config=quantization_config,
+            )
+            compressed_count = sum(
+                1 for m in model.modules() if getattr(m, "quantization_status", None) == QuantizationStatus.COMPRESSED
+            )
+
+            # No modules should be in COMPRESSED state
+            assert compressed_count == 0
+
+    def test_run_compressed_outputs_match(self):
+        """Check that run_compressed=True/False output are the same"""
+
+        from transformers import AutoTokenizer
+        from transformers.utils.quantization_config import CompressedTensorsConfig
+
+        quantization_config = CompressedTensorsConfig(run_compressed=False)
+
+        for stub in self.stubs:
+            tokenizer = AutoTokenizer.from_pretrained(stub)
+            input_ids = tokenizer(self.prompt, return_tensors="pt").input_ids
+
+            model_run_compressed__True = AutoModelForCausalLM.from_pretrained(
+                stub,
+            )
+            output_rc_true = model_run_compressed__True.generate(input_ids, max_new_tokens=100)
+
+            model_run_compressed__False = AutoModelForCausalLM.from_pretrained(
+                stub,
+                quantization_config=quantization_config,
+            )
+            output_rc_false = model_run_compressed__False.generate(input_ids, max_new_tokens=100)
+
+            assert tokenizer.decode(output_rc_true[0]) == tokenizer.decode(output_rc_false[0])
--- a/tests/quantization/compressed_tensors_integration/test_compressed_tensors.py
+++ b/tests/quantization/compressed_tensors_integration/test_compressed_tensors.py
@@ -0,0 +1,88 @@
+import gc
+import unittest
+
+from transformers import AutoModelForCausalLM, AutoTokenizer, CompressedTensorsConfig
+from transformers.testing_utils import backend_empty_cache, require_compressed_tensors, require_torch, torch_device
+from transformers.utils import is_torch_available
+
+
+if is_torch_available():
+    import torch
+
+
+@require_compressed_tensors
+@require_torch
+class CompressedTensorsTest(unittest.TestCase):
+    tinyllama_w4a16 = "nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-compressed"
+    tinyllama_int8 = "nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-compressed"
+    tinyllama_fp8 = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-compressed"
+    tinyllama_w8a16 = "nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-compressed"
+
+    prompt = "The capital of France is Paris, the capital of Germany is Berlin"
+
+    def tearDown(self):
+        gc.collect()
+        backend_empty_cache(torch_device)
+        gc.collect()
+
+    def test_config_args(self):
+        with self.assertRaises(ValueError):
+            # passing quant scheme directly is not allowed
+            CompressedTensorsConfig(config_groups={"weights": {"num_bits": 8}})
+        CompressedTensorsConfig(
+            config_groups={"FP8": ["Linear"]},
+            ignore=["lm_head"],
+            quantization_status="frozen",
+        )
+
+    def test_config_to_from_dict(self):
+        config = CompressedTensorsConfig(config_groups={"FP8": ["Linear"]})
+        config_dict = config.to_dict()
+        config_from_dict = CompressedTensorsConfig.from_dict(config_dict)
+
+        from compressed_tensors import QuantizationConfig
+
+        self.assertIsInstance(config_from_dict.quantization_config, QuantizationConfig)
+
+    def test_tinyllama_w4a16(self):
+        self._test_quantized_model(self.tinyllama_w4a16, 20.0)
+
+    def test_tinyllama_int8(self):
+        self._test_quantized_model(self.tinyllama_int8, 30.0)
+
+    def test_tinyllama_fp8(self):
+        self._test_quantized_model(self.tinyllama_fp8, 20.0)
+
+    def test_tinyllama_w8a16(self):
+        self._test_quantized_model(self.tinyllama_w8a16, 20.0)
+
+    def _test_quantized_model(self, model_name: str, expected_perplexity: float):
+        # load model
+        quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        device = quantized_model.device
+
+        # check config
+        self.assertIsNotNone(
+            quantized_model.config.quantization_config,
+            "quantization_config should not be None",
+        )
+        # check scales
+        self.assertTrue(
+            any(
+                key
+                for key, tensor in quantized_model.state_dict().items()
+                if "scale" in key and not torch.all(tensor == 1.0)
+            ),
+            "quantized model should load a non-trivial scale into the state dict",
+        )
+
+        # compute outputs with loss
+        inputs = tokenizer(self.prompt, return_tensors="pt").to(device)
+        labels = inputs["input_ids"]
+        with torch.no_grad():
+            outputs = quantized_model(**inputs, labels=labels)
+
+        # check perplexity
+        perplexity = torch.exp(outputs.loss)
+        self.assertLessEqual(perplexity, expected_perplexity)