first commit
Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled
Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled
This commit is contained in:
@@ -0,0 +1,176 @@
|
||||
import gc
|
||||
import unittest
|
||||
import warnings
|
||||
|
||||
from transformers import AutoModelForCausalLM
|
||||
from transformers.testing_utils import backend_empty_cache, require_compressed_tensors, require_torch, torch_device
|
||||
from transformers.utils import is_torch_available
|
||||
from transformers.utils.quantization_config import CompressedTensorsConfig
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
|
||||
@require_compressed_tensors
@require_torch
class StackCompressedModelTest(unittest.TestCase):
    """Load-time checks for checkpoints listed as (compressed, uncompressed) stub pairs."""

    # Each entry is a (compressed stub, uncompressed stub) tuple; the shape test unpacks it in that order.
    compressed_uncompressed_model_stubs = [
        (
            "nm-testing/llama2.c-stories42M-gsm8k-quantized-only-compressed",
            "nm-testing/llama2.c-stories42M-gsm8k-quantized-only-uncompressed",
        ),
    ]
    # All stubs from the pairs above as one flat list, for tests that load models one at a time.
    model_stubs = [name for stub_pair in compressed_uncompressed_model_stubs for name in stub_pair]

    prompt = "Paris is the capital of which country?"

    def tearDown(self):
        """Collect garbage and empty the backend cache after every test."""
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_compressed_uncompressed_model_shapes(self):
        """
        Compare the weights of an uncompressed model with those of its compressed counterpart
        loaded with `run_compressed=False`.

        Note: Weights for sparsely compressed models may differ due to packing.
        """
        not_found = object()

        def _resolve(root, dotted_name):
            # Follow the dotted path one attribute at a time; None means the path does not exist.
            current = root
            for part in dotted_name.split("."):
                current = getattr(current, part, not_found)
                if current is not_found:
                    return None
            return current

        def _load(stub):
            # Both members of a pair are loaded with exactly the same arguments.
            return AutoModelForCausalLM.from_pretrained(
                stub,
                device_map="auto",
                dtype="auto",
                quantization_config=CompressedTensorsConfig(run_compressed=False),
            )

        for compressed_model, uncompressed_model in self.compressed_uncompressed_model_stubs:
            with self.subTest(compressed_model=compressed_model, uncompressed_model=uncompressed_model):
                uncompressed = _load(uncompressed_model)
                compressed_decompressed = _load(compressed_model)

                for name, submodule in uncompressed.named_modules():
                    # Only leaf modules (no children) are compared.
                    if list(submodule.children()):
                        continue

                    counterpart = _resolve(compressed_decompressed, name)
                    # Skip modules with no counterpart in the other model or without a weight.
                    if counterpart is None or not hasattr(submodule, "weight"):
                        continue

                    reference_weight = submodule.weight.to(torch_device)
                    candidate_weight = counterpart.weight.to(torch_device)
                    torch.testing.assert_close(
                        reference_weight,
                        candidate_weight,
                        atol=0.2,
                        rtol=1e-5,
                        msg=f"Weight mismatch for module '{name}'.",
                    )

    def test_no_warnings_for_all_models(self):
        """
        Confirm that loading any model using compressed tensors does not trigger
        warnings about missing or unexpected keys.
        """
        for model_stub in self.model_stubs:
            with self.subTest(model_stub=model_stub):
                with warnings.catch_warnings(record=True) as caught_warnings:
                    # Record every warning, including ones that would normally be filtered out.
                    warnings.simplefilter("always")
                    AutoModelForCausalLM.from_pretrained(
                        model_stub,
                        device_map="auto",
                        dtype="auto",
                        quantization_config=CompressedTensorsConfig(run_compressed=False),
                    )

                for caught in caught_warnings:
                    text = str(caught.message).lower()
                    self.assertNotIn(
                        "missing keys",
                        text,
                        f"'missing keys' found in warnings for model {model_stub}",
                    )
                    self.assertNotIn(
                        "unexpected keys",
                        text,
                        f"'unexpected keys' found in warnings for model {model_stub}",
                    )
|
||||
|
||||
|
||||
@require_compressed_tensors
@require_torch
class RunCompressedTest(unittest.TestCase):
    """Tests for loading compressed checkpoints with and without `run_compressed=False`."""

    tinyllama_w4a16 = "nm-testing/tinyllama-w4a16-compressed"
    tinyllama_w8a8 = "nm-testing/tinyllama-w8a8-compressed"

    prompt = "Paris is the capital of which country?"

    # Checkpoints every test in this class iterates over.
    stubs = [tinyllama_w4a16, tinyllama_w8a8]

    def tearDown(self):
        """Collect garbage and empty the backend cache after every test."""
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    @staticmethod
    def _count_compressed_modules(model):
        """Return how many modules of `model` report `QuantizationStatus.COMPRESSED`."""
        # Imported lazily: `compressed_tensors` is an optional dependency.
        from compressed_tensors import QuantizationStatus

        return sum(
            1 for m in model.modules() if getattr(m, "quantization_status", None) == QuantizationStatus.COMPRESSED
        )

    def test_default_run_compressed__True(self):
        """Loading without an explicit quantization config leaves some modules in COMPRESSED state."""
        for stub in self.stubs:
            # subTest reports the failing stub and still runs the remaining ones.
            with self.subTest(stub=stub):
                model = AutoModelForCausalLM.from_pretrained(
                    stub,
                )
                compressed_count = self._count_compressed_modules(model)

                # some linear modules are not compressed - ex. lm_head
                self.assertGreater(compressed_count, 0)

    def test_default_run_compressed__False(self):
        """Loading with `run_compressed=False` leaves no module in COMPRESSED state."""
        quantization_config = CompressedTensorsConfig(run_compressed=False)

        for stub in self.stubs:
            with self.subTest(stub=stub):
                model = AutoModelForCausalLM.from_pretrained(
                    stub,
                    quantization_config=quantization_config,
                )
                compressed_count = self._count_compressed_modules(model)

                # No modules should be in COMPRESSED state
                self.assertEqual(compressed_count, 0)

    def test_run_compressed_outputs_match(self):
        """Check that run_compressed=True/False output are the same"""
        from transformers import AutoTokenizer

        quantization_config = CompressedTensorsConfig(run_compressed=False)

        for stub in self.stubs:
            with self.subTest(stub=stub):
                tokenizer = AutoTokenizer.from_pretrained(stub)
                input_ids = tokenizer(self.prompt, return_tensors="pt").input_ids

                model_run_compressed__True = AutoModelForCausalLM.from_pretrained(
                    stub,
                )
                output_rc_true = model_run_compressed__True.generate(input_ids, max_new_tokens=100)

                model_run_compressed__False = AutoModelForCausalLM.from_pretrained(
                    stub,
                    quantization_config=quantization_config,
                )
                output_rc_false = model_run_compressed__False.generate(input_ids, max_new_tokens=100)

                # Compare decoded text so a failure shows a readable diff.
                self.assertEqual(
                    tokenizer.decode(output_rc_true[0]),
                    tokenizer.decode(output_rc_false[0]),
                )
|
||||
@@ -0,0 +1,88 @@
|
||||
import gc
|
||||
import unittest
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, CompressedTensorsConfig
|
||||
from transformers.testing_utils import backend_empty_cache, require_compressed_tensors, require_torch, torch_device
|
||||
from transformers.utils import is_torch_available
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
|
||||
@require_compressed_tensors
@require_torch
class CompressedTensorsTest(unittest.TestCase):
    """Config validation and perplexity checks for compressed TinyLlama checkpoints."""

    tinyllama_w4a16 = "nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-compressed"
    tinyllama_int8 = "nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-compressed"
    tinyllama_fp8 = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-compressed"
    tinyllama_w8a16 = "nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-compressed"

    # Text whose language-modeling loss is turned into a perplexity by `_test_quantized_model`.
    prompt = "The capital of France is Paris, the capital of Germany is Berlin"

    def tearDown(self):
        """Collect garbage and empty the backend cache after every test."""
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_config_args(self):
        """A raw quantization scheme in `config_groups` is rejected; a preset mapping is accepted."""
        # passing quant scheme directly is not allowed
        self.assertRaises(
            ValueError,
            CompressedTensorsConfig,
            config_groups={"weights": {"num_bits": 8}},
        )

        # This call is expected to construct without raising.
        CompressedTensorsConfig(
            config_groups={"FP8": ["Linear"]},
            ignore=["lm_head"],
            quantization_status="frozen",
        )

    def test_config_to_from_dict(self):
        """A config rebuilt from `to_dict()` output carries a `QuantizationConfig` instance."""
        from compressed_tensors import QuantizationConfig

        original = CompressedTensorsConfig(config_groups={"FP8": ["Linear"]})
        rebuilt = CompressedTensorsConfig.from_dict(original.to_dict())

        self.assertIsInstance(rebuilt.quantization_config, QuantizationConfig)

    def test_tinyllama_w4a16(self):
        self._test_quantized_model(model_name=self.tinyllama_w4a16, expected_perplexity=20.0)

    def test_tinyllama_int8(self):
        self._test_quantized_model(model_name=self.tinyllama_int8, expected_perplexity=30.0)

    def test_tinyllama_fp8(self):
        self._test_quantized_model(model_name=self.tinyllama_fp8, expected_perplexity=20.0)

    def test_tinyllama_w8a16(self):
        self._test_quantized_model(model_name=self.tinyllama_w8a16, expected_perplexity=20.0)

    def _test_quantized_model(self, model_name: str, expected_perplexity: float):
        """
        Load `model_name` and check its config, its scales and its perplexity on `self.prompt`.

        Args:
            model_name: Checkpoint identifier passed to `from_pretrained`.
            expected_perplexity: Upper bound (inclusive) for `exp(loss)` on the prompt.
        """
        # load model
        quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        device = quantized_model.device

        # check config
        self.assertIsNotNone(
            quantized_model.config.quantization_config,
            "quantization_config should not be None",
        )

        # check scales: at least one "scale" entry must hold something other than all ones
        found_nontrivial_scale = False
        for key, tensor in quantized_model.state_dict().items():
            if "scale" not in key:
                continue
            if torch.all(tensor == 1.0):
                continue
            found_nontrivial_scale = True
            break
        self.assertTrue(
            found_nontrivial_scale,
            "quantized model should load a non-trivial scale into the state dict",
        )

        # compute outputs with loss; the input ids double as labels
        encoded = tokenizer(self.prompt, return_tensors="pt")
        inputs = encoded.to(device)
        labels = inputs["input_ids"]
        with torch.no_grad():
            outputs = quantized_model(**inputs, labels=labels)

        # check perplexity
        perplexity = outputs.loss.exp()
        self.assertLessEqual(perplexity, expected_perplexity)
|
||||
Reference in New Issue
Block a user