first commit
Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled

This commit is contained in:
陈赣
2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions

View File

@@ -0,0 +1,257 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import importlib
import tempfile
import unittest
from unittest import skip
import pytest
from packaging import version
from transformers import AqlmConfig, AutoConfig, AutoModelForCausalLM, AutoTokenizer, OPTForCausalLM, StaticCache
from transformers.testing_utils import (
backend_empty_cache,
require_accelerate,
require_aqlm,
require_torch_accelerator,
require_torch_multi_accelerator,
slow,
torch_device,
)
from transformers.utils import is_aqlm_available, is_torch_available
if is_torch_available():
import torch
@require_torch_accelerator
class AqlmConfigTest(unittest.TestCase):
    """Checks that `AqlmConfig` converts to and from plain dictionaries consistently."""

    def test_to_dict(self):
        """
        Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
        """
        quantization_config = AqlmConfig()
        config_to_dict = quantization_config.to_dict()

        # Every serialized entry must mirror the attribute of the same name on the config.
        for key, value in config_to_dict.items():
            self.assertEqual(getattr(quantization_config, key), value)

    def test_from_dict(self):
        """
        Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict
        """
        # Named `config_dict` rather than `dict` so the builtin type is not shadowed.
        config_dict = {
            "in_group_size": 32,
            "num_codebooks": 8,
            "nbits_per_codebook": 8,
            "linear_weights_not_to_quantize": ["lm_head.weight"],
        }
        quantization_config = AqlmConfig.from_dict(config_dict)

        self.assertEqual(config_dict["in_group_size"], quantization_config.in_group_size)
        self.assertEqual(config_dict["num_codebooks"], quantization_config.num_codebooks)
        self.assertEqual(config_dict["nbits_per_codebook"], quantization_config.nbits_per_codebook)
        self.assertEqual(
            config_dict["linear_weights_not_to_quantize"], quantization_config.linear_weights_not_to_quantize
        )
@slow
@require_torch_accelerator
@require_aqlm
@require_accelerate
class AqlmTest(unittest.TestCase):
    """Slow integration tests that load an AQLM-quantized checkpoint onto `torch_device`."""

    model_name = "BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf"

    input_text = "Hello my name is"
    max_new_tokens = 32

    EXPECTED_OUTPUT = "Hello my name is Katie. I am a 20 year old college student. I am a very outgoing person. I love to have fun and be active. I"

    # Single source for the reason shared by every currently-disabled inference test.
    _INFERENCE_SKIP_REASON = (
        "inference doesn't work with quantized aqlm models using torch.Any type with recent torch versions. "
        "Waiting for the fix from AQLM side"
    )

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        """
        Setup quantized model
        """
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name,
            device_map=torch_device,
        )

    def tearDown(self):
        """Collect garbage and empty the accelerator cache after each test."""
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_quantized_model_conversion(self):
        """
        Simple test that checks if the quantized model has been converted properly
        """
        from aqlm import QuantizedLinear

        from transformers.integrations import replace_with_aqlm_linear

        model_id = "facebook/opt-350m"
        config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5")
        quantization_config = AqlmConfig()

        # The model is instantiated on the meta device, so no weights are allocated.
        with torch.device("meta"):
            model = OPTForCausalLM(config)

        nb_linears = sum(isinstance(module, torch.nn.Linear) for module in model.modules())

        model = replace_with_aqlm_linear(model, quantization_config=quantization_config)
        nb_aqlm_linear = sum(isinstance(module, QuantizedLinear) for module in model.modules())

        self.assertEqual(nb_linears, nb_aqlm_linear)

        # Try with `modules_to_not_convert`
        with torch.device("meta"):
            model = OPTForCausalLM(config)

        model = replace_with_aqlm_linear(
            model, quantization_config=quantization_config, modules_to_not_convert=["lm_head"]
        )
        nb_aqlm_linear = sum(isinstance(module, QuantizedLinear) for module in model.modules())

        # Exactly one linear layer (`lm_head`) is expected to be left unconverted.
        self.assertEqual(nb_linears - 1, nb_aqlm_linear)

    @skip(_INFERENCE_SKIP_REASON)
    def test_quantized_model(self):
        """
        Simple test that checks if the quantized model is working properly
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def test_raise_if_non_quantized(self):
        """Loading a non-quantized checkpoint with an `AqlmConfig` must raise a `ValueError`."""
        model_id = "facebook/opt-125m"
        quantization_config = AqlmConfig(bits=4)

        with self.assertRaises(ValueError):
            _ = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)

    @skip(_INFERENCE_SKIP_REASON)
    def test_save_pretrained(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)
            model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=torch_device)

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

            output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
            self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @skip(_INFERENCE_SKIP_REASON)
    @require_torch_multi_accelerator
    def test_quantized_model_multi_gpu(self):
        """
        Simple test that checks if the quantized model is working properly with multiple GPUs
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto")

        # assertEqual reports both sets when the placement differs from the expectation.
        self.assertEqual(set(quantized_model.hf_device_map.values()), {0, 1})

        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)

        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @unittest.skipUnless(
        is_aqlm_available() and version.parse(importlib.metadata.version("aqlm")) >= version.parse("1.0.3"),
        "test requires `aqlm>=1.0.3`",
    )
    @pytest.mark.torch_compile_test
    def test_quantized_model_compile(self):
        """
        Checks that token-by-token decoding through a `torch.compile`d step function, backed by a
        `StaticCache`, reproduces `EXPECTED_OUTPUT`.
        """

        # Sample tokens greedily
        def decode_one_tokens(model, cur_token, input_pos, past_key_values):
            logits = model(
                cur_token,
                position_ids=input_pos,
                past_key_values=past_key_values,
                return_dict=False,
                use_cache=True,
            )[0]
            new_token = torch.argmax(logits[:, [-1]], dim=-1).to(torch.int)

            return new_token

        # Tokenize the test input
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)["input_ids"]
        seq_length = input_ids.shape[1]

        # Setup static KV cache for generation
        past_key_values = StaticCache(
            config=self.quantized_model.config,
            batch_size=input_ids.shape[0],
            max_cache_len=seq_length + self.max_new_tokens + 1,
        )

        # Allocate token ids to be generated and copy prefix ids
        positions = torch.arange(seq_length, device=torch_device)
        generated_ids = torch.zeros(1, seq_length + self.max_new_tokens, dtype=torch.int, device=torch_device)
        generated_ids[:, positions] = input_ids.to(torch_device).to(torch.int)

        # Do a forward pass to fill the prefix cache and compile the kernels if necessary
        logits = self.quantized_model(
            input_ids,
            past_key_values=past_key_values,
            return_dict=False,
            use_cache=True,
        )[0]
        next_token = torch.argmax(logits[:, [-1]], dim=-1).to(torch.int)
        # The first generated token goes right after the prompt.
        generated_ids[:, [seq_length]] = next_token

        with torch.no_grad():
            # Compile the CUDA graph
            compiled_decode = torch.compile(decode_one_tokens, mode="reduce-overhead", fullgraph=True)

            # Generate tokens one by one; the first new token was already produced by the prefill above,
            # so the loop starts at 1 and writes from position `seq_length + 1`.
            positions = torch.tensor([seq_length + 1], device=torch_device)
            for _ in range(1, self.max_new_tokens):
                with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True):
                    next_token = compiled_decode(self.quantized_model, next_token.clone(), None, past_key_values)
                    generated_ids.index_copy_(1, positions, next_token)
                    positions += 1

        # Check generated text
        self.assertEqual(self.tokenizer.decode(generated_ids[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

View File

View File

@@ -0,0 +1,327 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AwqConfig, OPTForCausalLM
from transformers.testing_utils import (
backend_empty_cache,
require_accelerate,
require_gptqmodel,
require_torch_accelerator,
require_torch_gpu,
require_torch_multi_accelerator,
slow,
torch_device,
)
from transformers.utils import is_torch_available
from transformers.utils.quantization_config import AwqBackend
if is_torch_available():
import torch
@require_torch_accelerator
class AwqConfigTest(unittest.TestCase):
    """Checks `AwqConfig` argument validation and its conversion to and from dictionaries."""

    def test_wrong_backend(self):
        """
        Simple test that checks if a user passes a wrong backend an error is raised
        """
        # This should work fine
        _ = AwqConfig(bits=4)

        with self.assertRaises(ValueError):
            AwqConfig(bits=4, backend="")

        # These should work fine
        _ = AwqConfig(bits=4, version="GEMM")
        _ = AwqConfig(bits=4, version="gemm")

        with self.assertRaises(ValueError):
            AwqConfig(bits=4, backend="unexisting-backend")

    def test_to_dict(self):
        """
        Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
        """
        quantization_config = AwqConfig(bits=4)
        config_to_dict = quantization_config.to_dict()

        for key, value in config_to_dict.items():
            if key == "version":
                # "version" is a legacy field.
                # It will be written in to_dict() for compatibility, but AwqConfig will not have this field.
                self.assertFalse(hasattr(quantization_config, key))
            else:
                self.assertEqual(getattr(quantization_config, key), value)

    def test_from_dict(self):
        """
        Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict
        """
        # Named `config_dict` rather than `dict` so the builtin type is not shadowed.
        config_dict = {"bits": 2, "zero_point": False, "backend": "auto"}
        quantization_config = AwqConfig.from_dict(config_dict)

        self.assertEqual(config_dict["bits"], quantization_config.bits)
        self.assertEqual(config_dict["zero_point"], quantization_config.zero_point)
        self.assertEqual(config_dict["backend"], quantization_config.backend)
@slow
@require_torch_accelerator
@require_gptqmodel
@require_accelerate
class AwqTest(unittest.TestCase):
    """Slow integration tests that load AWQ-quantized checkpoints and compare generated text."""

    model_name = "TheBloke/Mistral-7B-v0.1-AWQ"
    dummy_transformers_model_name = "bigscience/bloom-560m"
    model_with_no_k_proj_quantized = "hf-internal-testing/opt-125m-awq-no-k-proj"

    input_text = "Hello my name is"

    # Any of these generations is accepted by the tests that compare against EXPECTED_OUTPUT.
    EXPECTED_OUTPUT = {
        "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish",
        "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish. I am",
        "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Exercise and Sport Science with a",
        "Hello my name is Katie and I am a 20 year old student from the UK. I am currently studying for a degree in English Literature and History at the University of York. I am a very out",
        "Hello my name is Katie and I am a 20 year old student from the UK. I am currently studying for a degree in English Literature and History at the University of York. I am a very creative",
    }

    EXPECTED_OUTPUT_BF16 = [
        "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish",
        "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Exercise and Sport Science with a",
    ]

    EXPECTED_OUTPUT_EXLLAMA = [
        "Hello my name is Katie and I am a 20 year old student from the UK. I am currently studying for a degree in English Literature and History at the University of York. I am a very out",
        "Hello my name is Katie and I am a 20 year old student from the UK. I am currently studying for a degree in English Literature and History at the University of York. I am a very creative",
        "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish",
    ]
    device_map = torch_device

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        """
        Setup quantized model
        """
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(cls.model_name, device_map=cls.device_map)

    def tearDown(self):
        """Collect garbage and empty the accelerator cache after each test."""
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def _generate_text(self, model):
        """Tokenize `input_text`, generate up to 40 new tokens with `model`, and return the decoded text."""
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
        output = model.generate(**input_ids, max_new_tokens=40)
        return self.tokenizer.decode(output[0], skip_special_tokens=True)

    def test_quantized_model_conversion(self):
        """
        Simple test that checks if the quantized model has been converted properly
        """
        from gptqmodel.nn_modules.qlinear import BaseQuantLinear

        from transformers.integrations.awq import replace_with_awq_linear

        model_id = "facebook/opt-350m"
        config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5")
        quantization_config = AwqConfig(bits=4)

        # The model is instantiated on the meta device, so no weights are allocated.
        with torch.device("meta"):
            model = OPTForCausalLM(config)

        nb_linears = sum(isinstance(module, torch.nn.Linear) for module in model.modules())

        model = replace_with_awq_linear(model, quantization_config=quantization_config)
        nb_awq_linear = sum(isinstance(module, BaseQuantLinear) for module in model.modules())

        self.assertEqual(nb_linears, nb_awq_linear)

        # Try with `modules_not_to_convert`
        with torch.device("meta"):
            model = OPTForCausalLM(config)

        model = replace_with_awq_linear(
            model, quantization_config=quantization_config, modules_to_not_convert=["lm_head"]
        )
        nb_awq_linear = sum(isinstance(module, BaseQuantLinear) for module in model.modules())

        # Exactly one linear layer (`lm_head`) is expected to be left unconverted.
        self.assertEqual(nb_linears - 1, nb_awq_linear)

    def test_quantized_model(self):
        """
        Simple test that checks if the quantized model is working properly
        """
        self.assertIn(self._generate_text(self.quantized_model), self.EXPECTED_OUTPUT)

    def test_raise_if_non_quantized(self):
        """Loading a non-quantized checkpoint with an `AwqConfig` must raise a `ValueError`."""
        model_id = "facebook/opt-125m"
        quantization_config = AwqConfig(bits=4)

        with self.assertRaises(ValueError):
            _ = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)

    def test_quantized_model_bf16(self):
        """
        Simple test that checks if the quantized model is working properly with bf16
        """
        quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype=torch.bfloat16).to(torch_device)

        self.assertIn(self._generate_text(quantized_model), self.EXPECTED_OUTPUT_BF16)

    @require_torch_gpu
    def test_quantized_model_exllama(self):
        """
        Simple test that checks if the quantized model is working properly with exllama backend
        """
        quantization_config = AwqConfig(backend=AwqBackend.EXLLAMA_V2)
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name, quantization_config=quantization_config, device_map=torch_device
        )

        self.assertIn(self._generate_text(quantized_model), self.EXPECTED_OUTPUT_EXLLAMA)

    def test_quantized_model_no_device_map(self):
        """
        Simple test that checks if the quantized model is working properly when it is loaded
        without `device_map` and moved to the device afterwards
        """
        quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name).to(torch_device)

        self.assertIn(self._generate_text(quantized_model), self.EXPECTED_OUTPUT)

    def test_save_pretrained(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        # Load a fresh model for saving — the shared self.quantized_model may have
        # already been in-place transformed by a prior generate() call, and saving
        # those transformed buffers then re-transforming on reload would corrupt data.
        fresh_model = AutoModelForCausalLM.from_pretrained(self.model_name)
        with tempfile.TemporaryDirectory() as tmpdirname:
            fresh_model.save_pretrained(tmpdirname)
            model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)

            self.assertIn(self._generate_text(model), self.EXPECTED_OUTPUT)

    @require_torch_multi_accelerator
    def test_quantized_model_multi_accelerator(self):
        """
        Simple test that checks if the quantized model is working properly with multiple GPUs
        """
        quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto")

        # The model must be spread over at least two distinct devices.
        self.assertGreaterEqual(len(set(quantized_model.hf_device_map.values())), 2)

        self.assertIn(self._generate_text(quantized_model), self.EXPECTED_OUTPUT)

    def test_quantized_model_no_k_proj_quantized(self):
        """
        Simple test that checks a checkpoint whose `k_proj` was left unquantized: `k_proj` must load as a
        plain `torch.nn.Linear`, `v_proj` must not, and generation must yield the expected token ids
        """
        dummy_input = torch.LongTensor([[0, 1, 0]]).to(torch_device)

        quantized_model = AutoModelForCausalLM.from_pretrained(self.model_with_no_k_proj_quantized).to(torch_device)

        first_attention = quantized_model.model.decoder.layers[0].self_attn
        self.assertIsInstance(first_attention.k_proj, torch.nn.Linear)
        self.assertNotIsInstance(first_attention.v_proj, torch.nn.Linear)

        expected_output = torch.LongTensor([[0, 1, 0, 50118, 50118, 133, 248, 12, 134, 16, 10, 372, 2031]]).to(
            torch_device
        )

        output = quantized_model.generate(dummy_input, max_new_tokens=10)
        # `torch.equal` also compares sizes, so a shorter generation fails the assertion
        # instead of raising a broadcasting error.
        self.assertTrue(torch.equal(output, expected_output))
@slow
@require_torch_accelerator
@require_gptqmodel
@require_accelerate
class AwqScaleTest(unittest.TestCase):
    """Slow integration test for the activation-scale module of an AWQ checkpoint."""

    model_name = "TechxGenus/starcoder2-3b-AWQ"

    def test_load_quantized_model(self):
        """
        Simple test that checks if the scales have been replaced in the quantized model
        """
        from gptqmodel.quantization.awq.modules.act import ScaledActivation

        # Use the class-level checkpoint name instead of repeating the literal.
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name, dtype=torch.float16, device_map=torch_device
        )
        self.assertIsInstance(quantized_model.model.layers[0].mlp.act, ScaledActivation)
@slow
@require_gptqmodel
@require_accelerate
class AwqTorchFusedTest(unittest.TestCase):
    """Slow integration test for the `TORCH_FUSED_AWQ` backend, loaded with `device_map="cpu"`."""

    model_name = "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ"

    def test_quantized_model_torch_fused(self):
        """
        Simple test that checks if the quantized model is working properly with torch_fused backend
        """
        quantization_config = AwqConfig(backend=AwqBackend.TORCH_FUSED_AWQ)
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=quantization_config,
            device_map="cpu",
        )
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        input_ids = tokenizer.encode("How to make a cake", return_tensors="pt")
        pad_token_id = tokenizer.eos_token_id
        output = model.generate(input_ids, do_sample=False, max_length=20, pad_token_id=pad_token_id)

        # Decode once and reuse the text for the assertion.
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

        expected_output = (
            "How to make a cake with a round tin?\nHow to make a cake with a round tin?\n1. Preheat the oven to 180°"
        )

        # With two strings, assertIn is a substring check: the generated text must occur
        # inside the longer reference string.
        self.assertIn(generated_text, expected_output)

View File

View File

@@ -0,0 +1,218 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest
import pytest
from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer
from transformers.testing_utils import (
backend_empty_cache,
backend_synchronize,
require_accelerate,
require_auto_round,
require_torch_accelerator,
require_torch_gpu,
require_torch_multi_accelerator,
slow,
torch_device,
)
from transformers.utils import is_torch_available
if is_torch_available():
import torch
@slow
@require_torch_accelerator
@require_auto_round
@require_accelerate
class AutoRoundTest(unittest.TestCase):
    """Slow integration tests that load AutoRound-quantized checkpoints and run generation."""

    model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc"
    input_text = "There is a girl who likes adventure,"

    ## Different backends may produce slight variations in output
    EXPECTED_OUTPUTS = {
        "There is a girl who likes adventure, and she has been exploring the world "
        "for many years. She travels to different countries and cultures, trying new "
        "things every day. One of her favorite places to visit is a small village in "
        "the mountains where",
        "There is a girl who likes adventure, and she has been exploring the world for many years. She has visited every country in Europe and has even traveled to some of the most remote parts of Africa. She enjoys hiking through the mountains and discovering",
        "There is a girl who likes adventure, and she has been exploring the world for many years. She has visited every country in Europe and has even traveled to some of the most remote parts of Africa. She has also climbed mountains and explored caves",
    }

    device_map = torch_device

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        """
        Load the tokenizer and the float16 quantized model shared by the tests
        """
        backend_synchronize(torch_device)
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name, device_map=cls.device_map, dtype=torch.float16
        )

    def tearDown(self):
        """Collect garbage and empty the accelerator cache after each test."""
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_quantized_model(self):
        """
        Generate with the shared quantized model and compare against the accepted outputs
        """
        encoded = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        generated = self.quantized_model.generate(**encoded, max_new_tokens=40, do_sample=False)
        decoded = self.tokenizer.decode(generated[0], skip_special_tokens=True)
        self.assertIn(decoded, self.EXPECTED_OUTPUTS)

    def test_raise_if_non_quantized(self):
        """Loading a non-quantized checkpoint with an `AutoRoundConfig` must raise a `ValueError`."""
        with self.assertRaises(ValueError):
            _ = AutoModelForCausalLM.from_pretrained(
                "facebook/opt-125m", quantization_config=AutoRoundConfig(bits=4)
            )

    def test_quantized_model_bf16(self):
        """
        Generate with the model loaded in bfloat16 through the triton backend
        """
        encoded = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        bf16_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            dtype=torch.bfloat16,
            device_map=self.device_map,
            quantization_config=AutoRoundConfig(backend="triton"),
        )

        generated = bf16_model.generate(**encoded, max_new_tokens=40, do_sample=False)
        decoded = self.tokenizer.decode(generated[0], skip_special_tokens=True)
        self.assertIn(decoded, self.EXPECTED_OUTPUTS)

    @pytest.mark.skip(reason="This test is temperarily disabled for CI machine's CPU is slow")
    def test_quantized_model_on_cpu(self):
        """
        Generate with the model loaded without a `device_map`, keeping the inputs off the accelerator
        """
        encoded = self.tokenizer(self.input_text, return_tensors="pt")

        cpu_model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto")
        generated = cpu_model.generate(**encoded, max_new_tokens=40, do_sample=False)
        decoded = self.tokenizer.decode(generated[0], skip_special_tokens=True)

        self.assertIn(decoded, self.EXPECTED_OUTPUTS)

    def test_save_pretrained(self):
        """
        Save the quantized model, reload it from disk, and check that generation still matches
        """
        ## some backends like marlin/ipex will repack the weight that caused the weight shape changed
        with tempfile.TemporaryDirectory() as tmpdirname:
            source_model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                device_map=self.device_map,
                dtype=torch.float16,
                quantization_config=AutoRoundConfig(backend="triton"),
            )

            source_model.save_pretrained(tmpdirname)
            reloaded_model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=torch_device)

            encoded = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

            generated = reloaded_model.generate(**encoded, max_new_tokens=40, do_sample=False)
            decoded = self.tokenizer.decode(generated[0], skip_special_tokens=True)
            self.assertIn(decoded, self.EXPECTED_OUTPUTS)

    @require_torch_multi_accelerator
    def test_quantized_model_multi_accelerator(self):
        """
        Generate with the model loaded using `device_map="auto"` on a multi-accelerator machine
        """
        encoded = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        sharded_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            device_map="auto",
            quantization_config=AutoRoundConfig(backend="triton"),
            dtype="auto",
        )

        generated = sharded_model.generate(**encoded, max_new_tokens=40, do_sample=False)
        decoded = self.tokenizer.decode(generated[0], skip_special_tokens=True)
        self.assertIn(decoded, self.EXPECTED_OUTPUTS)

    def test_convert_from_gptq(self):
        """
        Smoke test: load a GPTQ-format checkpoint with an `AutoRoundConfig` and run a short generation
        """
        gptq_checkpoint = "ybelkada/opt-125m-gptq-4bit"

        model = AutoModelForCausalLM.from_pretrained(
            gptq_checkpoint, device_map=torch_device, quantization_config=AutoRoundConfig(), dtype="auto"
        )

        tokenizer = AutoTokenizer.from_pretrained(gptq_checkpoint)
        prompt = "There is a girl who likes adventure,"
        encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
        # No assertion on the text: the test only requires generation and decoding to complete.
        generated = model.generate(**encoded, max_new_tokens=5)
        tokenizer.decode(generated[0])

    @pytest.mark.skip(reason="This test is temperarily disabled for CI machine's CPU is slow")
    def test_convert_from_awq_cpu(self):
        """
        Smoke test: load an AWQ-format checkpoint on CPU with an `AutoRoundConfig` and run a short generation
        """
        awq_checkpoint = "casperhansen/opt-125m-awq"

        model = AutoModelForCausalLM.from_pretrained(
            awq_checkpoint, device_map="cpu", quantization_config=AutoRoundConfig(), dtype="auto"
        )

        tokenizer = AutoTokenizer.from_pretrained(awq_checkpoint)
        prompt = "There is a girl who likes adventure,"
        encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
        # No assertion on the text: the test only requires generation and decoding to complete.
        generated = model.generate(**encoded, max_new_tokens=5)
        tokenizer.decode(generated[0])

    @require_torch_gpu
    def test_mixed_bits(self):
        """
        Smoke test: quantize a model with per-layer bit overrides, save it, reload it, and run a short generation
        """
        base_checkpoint = "facebook/opt-125m"
        model = AutoModelForCausalLM.from_pretrained(base_checkpoint, dtype="auto")
        tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

        # Per-layer overrides passed to AutoRound next to the global bits/group_size/sym settings.
        layer_config = {
            "model.decoder.layers.0.self_attn.k_proj": {"bits": 8},
            "model.decoder.layers.6.self_attn.out_proj": {"bits": 2, "group_size": 32},
        }

        from auto_round import AutoRound

        autoround = AutoRound(model, tokenizer, bits=4, group_size=128, sym=True, layer_config=layer_config)
        with tempfile.TemporaryDirectory() as tmpdirname:
            autoround.quantize_and_save(output_dir=tmpdirname)
            model = AutoModelForCausalLM.from_pretrained(tmpdirname, dtype=torch.float16, device_map=torch_device)

            prompt = "There is a girl who likes adventure,"
            encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
            generated = model.generate(**encoded, max_new_tokens=5)
            tokenizer.decode(generated[0])

View File

@@ -0,0 +1,259 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import unittest
from transformers import (
AutoConfig,
AutoModelForCausalLM,
AutoTokenizer,
BitNetQuantConfig,
OPTForCausalLM,
)
from transformers.testing_utils import (
backend_empty_cache,
require_accelerate,
require_torch_accelerator,
slow,
torch_device,
)
from transformers.utils import is_torch_available
if is_torch_available():
import torch
class BitNetPackedWeightsTest(unittest.TestCase):
    """Checks of the offline (pre-packed) BitNet weight path; no model checkpoint is downloaded."""

    def test_offline_autobitlinear_weight_conversion(self):
        """get_weight_conversions() must return a WeightConverter for autobitlinear+offline"""
        from transformers.quantizers.quantizer_bitnet import BitNetHfQuantizer

        quant_config = BitNetQuantConfig(linear_class="autobitlinear", quantization_mode="offline")
        converters = BitNetHfQuantizer(quant_config).get_weight_conversions()

        # Exactly one converter is expected, mapping "weight" onto "weight"
        self.assertEqual(len(converters), 1)
        self.assertEqual(converters[0].source_patterns, ["weight"])
        self.assertEqual(converters[0].target_patterns, ["weight"])

    def test_unpack_packed_weights(self):
        """BitNetDeserialize.convert() must unpack packed weights to the original ternary values"""
        from transformers.integrations.bitnet import AutoBitLinear, BitNetDeserialize, pack_weights

        out_features, in_features = 128, 64

        class SimpleModel(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.linear = AutoBitLinear(in_features=in_features, out_features=out_features, bias=False)

        model = SimpleModel()

        # Mimic a safetensors checkpoint: ternary weights {-1, 0, 1} packed into uint8, then cast to bfloat16
        ternary = torch.randint(-1, 2, (out_features, in_features)).to(torch.bfloat16)
        packed = pack_weights(ternary.clone().float()).to(torch.bfloat16)

        # packed shape is [out_features // 4, in_features]
        self.assertEqual(packed.shape[0], out_features // 4)

        deserializer = BitNetDeserialize(hf_quantizer=None)
        converted = deserializer.convert({"weight": packed}, model=model, full_layer_name="linear.weight")
        unpacked = converted["weight"]

        self.assertEqual(unpacked.shape, (out_features, in_features))
        self.assertTrue(torch.equal(unpacked, ternary))
@require_torch_accelerator
class BitNetQuantConfigTest(unittest.TestCase):
    """Consistency checks on `BitNetQuantConfig` itself."""

    def test_to_dict(self):
        """
        Every key produced by `to_dict()` must carry the same value as the attribute of that name on the config.
        """
        config = BitNetQuantConfig()
        for key, value in config.to_dict().items():
            self.assertEqual(getattr(config, key), value)
@slow
@require_torch_accelerator
@require_accelerate
class BitNetTest(unittest.TestCase):
    """Integration tests for BitNet layers and for a pre-quantized BitNet checkpoint."""

    model_name = "HF1BitLLM/Llama3-8B-1.58-100B-tokens"

    # called only once for all tests in this class
    @classmethod
    def setUpClass(cls):
        """
        Load the tokenizer and the quantized model shared by every test of the class.
        """
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name, dtype=torch.bfloat16, device_map=torch_device
        )

    def tearDown(self):
        """Collect garbage and empty the accelerator cache after each test."""
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_replace_with_bitlinear(self):
        """Every `torch.nn.Linear` counted before the replacement must be matched by a `BitLinear` afterwards."""
        from transformers.integrations import BitLinear, replace_with_bitnet_linear

        model_id = "facebook/opt-350m"
        config = AutoConfig.from_pretrained(model_id)

        # Build on the meta device so that no real weights are allocated
        with torch.device("meta"):
            model = OPTForCausalLM(config)

        nb_linears = sum(isinstance(module, torch.nn.Linear) for module in model.modules())

        model = replace_with_bitnet_linear(model)
        nb_bitnet_linear = sum(isinstance(module, BitLinear) for module in model.modules())

        self.assertEqual(nb_linears, nb_bitnet_linear)

    def test_quantized_model(self):
        """
        Simple test that checks if the quantized model is working properly
        """
        input_text = "What are we having for dinner?"
        expected_output = "What are we having for dinner? What are we going to do for fun? What are"

        input_ids = self.tokenizer(input_text, return_tensors="pt").to(torch_device)
        output = self.quantized_model.generate(**input_ids, max_new_tokens=11, do_sample=False)

        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), expected_output)

    def test_packing_unpacking(self):
        """
        Unpacking random packed bytes and packing them again must give back the original tensor.
        """
        from transformers.integrations import pack_weights, unpack_weights

        # `high` is exclusive in torch.randint: 256 is needed for byte value 255 to be drawn at all
        u = torch.randint(0, 256, (256, 256), dtype=torch.uint8)
        unpacked_u = unpack_weights(u, dtype=torch.bfloat16)
        repacked_u = pack_weights(unpacked_u)

        # One vectorized comparison instead of 65,536 per-element assertions
        self.assertEqual(repacked_u.shape, u.shape)
        self.assertTrue(bool((repacked_u == u).all()))

    def test_activation_quant(self):
        """
        test the activation function behaviour
        """
        from transformers.integrations import BitLinear

        layer = BitLinear(in_features=4, out_features=2, bias=False, dtype=torch.float32)
        layer.to(torch_device)

        input_tensor = torch.tensor([1.0, -1.0, -1.0, 1.0], dtype=torch.float32).to(torch_device)

        # Quantize the input tensor
        quantized_tensor, scale = layer.activation_quant(input_tensor)

        # Dividing the quantized values by the scale must give back the input
        for i in range(input_tensor.shape[0]):
            self.assertEqual(quantized_tensor[i] / scale, input_tensor[i])

        # Verify the scale tensor
        self.assertEqual(scale, 127)

    def test_weights_dtype(self):
        """
        All attention and MLP projection weights of the first layer must be stored as uint8 after loading.
        """
        first_layer = self.quantized_model.model.layers[0]
        projections = (
            first_layer.self_attn.q_proj,
            first_layer.self_attn.k_proj,
            first_layer.self_attn.v_proj,
            first_layer.self_attn.o_proj,
            first_layer.mlp.up_proj,
            first_layer.mlp.gate_proj,
            first_layer.mlp.down_proj,
        )
        for projection in projections:
            self.assertEqual(projection.weight.dtype, torch.uint8)

    def test_replace_with_bitlinear_shape(self):
        """
        test that the BitNet layer weight shapes are correct, and the weight_scale is correctly initialized to 1
        """
        from transformers.integrations import replace_with_bitnet_linear

        out_features = 1024
        in_features = 512

        class SimpleLinearModule(torch.nn.Module):
            """
            Simple class to test BitLinear
            """

            def __init__(
                self,
                in_features: int = in_features,
                out_features: int = out_features,
                bias: bool = False,
            ):
                super().__init__()
                self.linear = torch.nn.Linear(in_features=in_features, out_features=out_features, bias=bias)

            def forward(self, x):
                return self.linear(x)

        model = SimpleLinearModule()
        replace_with_bitnet_linear(model)

        # The first dimension of the replaced weight is out_features // 4
        self.assertEqual(list(model.linear.weight.shape), [out_features // 4, in_features])
        self.assertEqual(model.linear.weight_scale, 1)
@slow
@require_torch_accelerator
@require_accelerate
class BitNetSerializationTest(unittest.TestCase):
    """Round-trip test: saving and reloading a BitNet model must not change its logits."""

    def test_model_serialization(self):
        """
        Save the quantized model, reload it, and check that both produce identical logits.

        The model is saved into a temporary directory so that the test leaves nothing behind in
        the current working directory, even when it fails.
        """
        import tempfile

        model_name = "HF1BitLLM/Llama3-8B-1.58-100B-tokens"
        quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map=torch_device)
        input_tensor = torch.zeros((1, 8), dtype=torch.int32, device=torch_device)

        with torch.no_grad():
            logits_ref = quantized_model.forward(input_tensor).logits

        with tempfile.TemporaryDirectory() as saved_model_dir:
            # Save
            quantized_model.save_pretrained(saved_model_dir)

            # Remove old model
            del quantized_model
            backend_empty_cache(torch_device)

            # Load and check if the logits match; the directory stays alive until the comparison is done
            model_loaded = AutoModelForCausalLM.from_pretrained(saved_model_dir, device_map=torch_device)

            with torch.no_grad():
                logits_loaded = model_loaded.forward(input_tensor).logits

            self.assertEqual((logits_loaded - logits_ref).abs().mean().item(), 0)

View File

@@ -0,0 +1,120 @@
# Testing mixed int8 quantization
![HFxbitsandbytes.png](https://cdn-uploads.huggingface.co/production/uploads/1660567705337-62441d1d9fdefb55a0b7d12c.png)
The following is the recipe on how to effectively debug `bitsandbytes` integration on Hugging Face `transformers`.
## Library requirements
+ `transformers>=4.22.0`
+ `accelerate>=0.12.0`
+ `bitsandbytes>=0.31.5`.
## Hardware requirements
The following instructions are tested with 2 NVIDIA-Tesla T4 GPUs. To run `bitsandbytes` successfully you need a GPU that supports 8-bit tensor cores. Note that Turing, Ampere or newer architectures - e.g. T4, RTX20s, RTX30s, A40-A100, A6000 - should be supported.
## Virtual envs
```bash
conda create --name int8-testing python==3.8
pip install "bitsandbytes>=0.31.5"
pip install "accelerate>=0.12.0"
pip install "transformers>=4.23.0"
```
If `transformers>=4.23.0` is not released yet, then use:
```bash
pip install git+https://github.com/huggingface/transformers.git
```
## Troubleshooting
A list of common errors:
### Torch does not correctly do the operations on GPU
First check that:
```py
import torch
vec = torch.randn(1, 2, 3).to(0)
```
Works without any error. If not, install torch using `conda` like:
```bash
conda create --name int8-testing python==3.8
conda install pytorch torchvision torchaudio cudatoolkit=11.6 -c pytorch -c conda-forge
pip install "bitsandbytes>=0.31.5"
pip install "accelerate>=0.12.0"
pip install "transformers>=4.23.0"
```
For the latest pytorch instructions please see [this](https://pytorch.org/get-started/locally/)
and the snippet above should work.
### ` bitsandbytes operations are not supported under CPU!`
This happens when some `Linear` weights are placed on the CPU when using `accelerate`. Please check `model.hf_device_map` carefully and make sure that no `Linear` module is assigned to the CPU. It is fine to have the last module (usually the `lm_head`) set on CPU.
### `To use the type as a Parameter, please correct the detach() semantics defined by __torch_dispatch__() implementation.`
Use the latest version of `accelerate` with a command such as: `pip install -U accelerate` and the problem should be solved.
### `Parameter has no attribute .CB`
Same solution as above.
### `RuntimeError: CUDA error: an illegal memory access was encountered ... consider passing CUDA_LAUNCH_BLOCKING=1`
Run your script by prepending `CUDA_LAUNCH_BLOCKING=1` and you should observe an error as described in the next section.
### `CUDA illegal memory error: an illegal memory access at line...`:
Check the CUDA versions with:
```bash
nvcc --version
```
and confirm it is the same version as the one detected by `bitsandbytes`. If not, run:
```bash
ls -l $CONDA_PREFIX/lib/libcudart.so
```
or
```bash
ls -l $LD_LIBRARY_PATH
```
Check if `libcudart.so` has a correct symlink that is set. Sometimes `nvcc` detects the correct CUDA version but `bitsandbytes` doesn't. You have to make sure that the symlink that is set for the file `libcudart.so` is redirected to the correct CUDA file.
Here is an example of a badly configured CUDA installation:
`nvcc --version` gives:
![Screenshot 2022-08-15 at 15.12.23.png](https://cdn-uploads.huggingface.co/production/uploads/1660569220888-62441d1d9fdefb55a0b7d12c.png)
which means that the detected CUDA version is 11.3 but `bitsandbytes` outputs:
![image.png](https://cdn-uploads.huggingface.co/production/uploads/1660569284243-62441d1d9fdefb55a0b7d12c.png)
First check:
```bash
echo $LD_LIBRARY_PATH
```
If this contains multiple paths separated by `:`, then you have to make sure that the correct CUDA version is set, by running:
```bash
ls -l $path/libcudart.so
```
On each path (`$path`) separated by `:`.
If not, simply run
```bash
ls -l $LD_LIBRARY_PATH/libcudart.so
```
and you can see
![Screenshot 2022-08-15 at 15.12.33.png](https://cdn-uploads.huggingface.co/production/uploads/1660569176504-62441d1d9fdefb55a0b7d12c.png)
If you see that the file is linked to the wrong CUDA version (here 10.2), find the correct location for `libcudart.so` (`find / -name libcudart.so`) and replace the environment variable `LD_LIBRARY_PATH` with the one containing the correct `libcudart.so` file.

View File

View File

@@ -0,0 +1,870 @@
# Copyright 2022 The HuggingFace Team Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest
import pytest
from transformers import (
AutoConfig,
AutoModel,
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
AutoModelForSequenceClassification,
AutoTokenizer,
BitsAndBytesConfig,
pipeline,
set_seed,
)
from transformers.models.opt.modeling_opt import OPTAttention
from transformers.testing_utils import (
apply_skip_if_not_implemented,
backend_empty_cache,
backend_torch_accelerator_module,
is_bitsandbytes_available,
is_torch_available,
require_accelerate,
require_bitsandbytes,
require_torch,
require_torch_multi_accelerator,
slow,
torch_device,
)
def get_some_linear_layer(model):
    """
    Return one linear layer of `model`, picked according to `model.config.model_type`.

    - "gpt2": `transformer.h[0].mlp.c_fc`
    - "opt": `decoder.layers[0].fc1`, falling back to `model.decoder.layers[0].fc1` when the
      first path raises `AttributeError` (the model is wrapped, e.g. loaded as a causal LM)
    - "llama": `model.layers[0].mlp.gate_proj`
    - anything else: `transformer.h[0].mlp.dense_4h_to_h`
    """
    model_type = model.config.model_type

    if model_type == "gpt2":
        return model.transformer.h[0].mlp.c_fc

    if model_type == "llama":
        return model.model.layers[0].mlp.gate_proj

    if model_type != "opt":
        # Default layout for every other model type
        return model.transformer.h[0].mlp.dense_4h_to_h

    try:
        return model.decoder.layers[0].fc1
    except AttributeError:
        # for AutoModelforCausalLM
        return model.model.decoder.layers[0].fc1
if is_torch_available():
import torch
import torch.nn as nn
class LoRALayer(nn.Module):
    """Wraps a linear layer with LoRA-like adapter - Used for testing purposes only"""

    def __init__(self, module: nn.Module, rank: int):
        """
        Args:
            module: the wrapped layer; must expose `in_features`, `out_features` and `weight`.
            rank: inner dimension of the two-layer, bias-free adapter.
        """
        super().__init__()
        self.module = module

        fan_in, fan_out = module.in_features, module.out_features
        down_proj = nn.Linear(fan_in, rank, bias=False)
        up_proj = nn.Linear(rank, fan_out, bias=False)
        self.adapter = nn.Sequential(down_proj, up_proj)

        # The second projection starts at zero, so the adapter contributes nothing until it is trained
        small_std = (2.0 / (5 * min(fan_in, fan_out))) ** 0.5
        nn.init.normal_(down_proj.weight, std=small_std)
        nn.init.zeros_(up_proj.weight)

        # Keep the adapter on the same device as the wrapped layer's weight
        self.adapter.to(module.weight.device)

    def forward(self, input, *args, **kwargs):
        """Return the wrapped layer's output plus the adapter's output for the same `input`."""
        wrapped_output = self.module(input, *args, **kwargs)
        return wrapped_output + self.adapter(input)
if is_bitsandbytes_available():
import bitsandbytes as bnb
@require_bitsandbytes
@require_accelerate
@require_torch
@slow
class Base4bitTest(unittest.TestCase):
    """Shared constants and tokenizer setup for the 4-bit bitsandbytes test classes."""

    # Constants live on the class; model loading is left to the `setUp` of each subclass.
    # A relatively large model (>1b parameters) is needed, otherwise the quantization may not
    # behave as expected, hence this checkpoint.
    model_name = "bigscience/bloom-1b7"

    # Ratio obtained on a RTX Titan, so the number might slightly change on other hardware
    EXPECTED_RELATIVE_DIFFERENCE = 2.109659552692574

    input_text = "Hello my name is"
    EXPECTED_OUTPUTS = {
        "Hello my name is John and I am a professional photographer. I",
        "Hello my name is John.\nI am a friend of your father.\n",
        "Hello my name is John Doe, I am a student at the University",
        "Hello my name is John and I am 25 years old.",
        "Hello my name is John and I am a student at the University of",
        # Expected values on Intel XPU and NV A100
        "Hello my name is Alina. I have been working as a professional",
    }
    MAX_NEW_TOKENS = 10

    def setUp(self):
        """Load the tokenizer for `model_name`."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
@apply_skip_if_not_implemented
class Bnb4BitTest(Base4bitTest):
    """Checks of a 4-bit bitsandbytes model against an fp16 copy of the same checkpoint."""

    def setUp(self):
        """Load the fp16 reference model and its 4-bit counterpart."""
        super().setUp()

        self.model_fp16 = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            dtype=torch.float16,
            device_map="auto",
        )
        self.model_4bit = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            dtype=torch.float16,
            quantization_config=BitsAndBytesConfig(load_in_4bit=True),
            device_map="auto",
        )

    def tearDown(self):
        r"""
        Drop both models and empty the accelerator cache after each test, to free memory and avoid
        unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
        """
        del self.model_fp16
        del self.model_4bit

        gc.collect()
        backend_empty_cache(torch_device)

    def _load_dequantized_model(self):
        """Load `model_name` in 4-bit, call `dequantize()` on it and return the model."""
        quant_config = BitsAndBytesConfig(load_in_4bit=True)
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name, quantization_config=quant_config, device_map="auto"
        )
        model.dequantize()
        return model

    def _check_generation(self, model):
        """Generate 10 new tokens from `input_text` with `model` and check the text is an expected output."""
        tokens = self.tokenizer(self.input_text, return_tensors="pt")
        generated = model.generate(input_ids=tokens["input_ids"].to(model.device), max_new_tokens=10)
        decoded = self.tokenizer.decode(generated[0], skip_special_tokens=True)
        self.assertIn(decoded, self.EXPECTED_OUTPUTS)

    def test_quantization_num_parameters(self):
        r"""
        The 4-bit model must report the same number of parameters as the fp16 model.
        See: https://github.com/huggingface/transformers/issues/25978
        """
        self.assertEqual(self.model_4bit.num_parameters(), self.model_fp16.num_parameters())

    def test_compute_module_sizes(self):
        r"""
        Test if we compute the right module sizes needed to generate the device map.
        Also test if we get the right values for `total_byte_count` in `caching_allocator_warmup`.
        """
        from transformers.integrations.accelerate import compute_module_sizes
        from transformers.modeling_utils import expand_device_map, get_total_byte_count
        from transformers.quantizers import AutoHfQuantizer

        def tensor_names(module):
            """Names of all parameters followed by names of all buffers."""
            return [n for n, _ in module.named_parameters()] + [n for n, _ in module.named_buffers()]

        # The model is preprocessed by hand because the device_map calculation happens before the
        # weights are loaded. For normal weights that is fine, but for quantized weights the tensor
        # dtype might change during loading.
        with torch.device("meta"):
            model = AutoModelForCausalLM.from_config(self.model_fp16.config, dtype=torch.float16)
            model_size, _ = compute_module_sizes(model, only_modules=False)

            expanded_device_map = expand_device_map({"": torch_device}, tensor_names(model))
            byte_counts = get_total_byte_count(model, expanded_device_map)
            total_byte_count = list(byte_counts.values())[0]

            # testing prequantized = False should be enough, the shape should be the same whether it is pre-quantized or not
            hf_quantizer = AutoHfQuantizer.from_config(BitsAndBytesConfig(load_in_4bit=True), pre_quantized=False)
            hf_quantizer.preprocess_model(model=model, config=model.config, device_map=expanded_device_map)
            quantized_model_size, _ = compute_module_sizes(model, hf_quantizer, only_modules=False)

            # Names are collected again after the quantizer has preprocessed the model
            expanded_device_map = expand_device_map({"": torch_device}, tensor_names(model))
            quantized_byte_counts = get_total_byte_count(model, expanded_device_map, hf_quantizer)
            quantized_total_byte_count = list(quantized_byte_counts.values())[0]

        for name, module in model.named_modules():
            if not isinstance(module, bnb.nn.Linear4bit):
                continue
            # from 16 bits to 4 bits
            weight_key = f"{name}.weight"
            assert int(model_size[weight_key] // 4) == int(quantized_model_size[weight_key])

        # check that we get the same value, as we use `compute_module_sizes` in `get_total_byte_count`
        assert total_byte_count == model_size[""]
        assert quantized_total_byte_count == quantized_model_size[""]

        # we should at least have 2 times memory reduction in total
        assert model_size[""] > quantized_model_size[""] * 2

    def test_quantization_config_json_serialization(self):
        r"""
        The config of the 4-bit model must carry a `quantization_config` and serialize without error.
        """
        config = self.model_4bit.config

        self.assertTrue(hasattr(config, "quantization_config"))

        # Only checks that none of the serializers raises
        _ = config.to_dict()
        _ = config.to_diff_dict()
        _ = config.to_json_string()

    def test_memory_footprint(self):
        r"""
        Check the conversion through the fp16 / 4-bit memory footprint ratio and through the
        class of a linear layer's weight in the converted model.
        """
        from bitsandbytes.nn import Params4bit

        mem_fp16 = self.model_fp16.get_memory_footprint()
        mem_4bit = self.model_4bit.get_memory_footprint()
        footprint_ratio = mem_fp16 / mem_4bit

        self.assertAlmostEqual(footprint_ratio, self.EXPECTED_RELATIVE_DIFFERENCE, delta=1e-5)

        sample_linear = get_some_linear_layer(self.model_4bit)
        self.assertEqual(sample_linear.weight.__class__, Params4bit)

    def test_linear_are_4bit(self):
        r"""
        Every `torch.nn.Linear` of the 4-bit model whose name is neither "lm_head" nor listed in
        `T5PreTrainedModel._keep_in_fp32_modules` must hold its weight as uint8.
        """
        from transformers import T5PreTrainedModel

        self.model_fp16.get_memory_footprint()
        self.model_4bit.get_memory_footprint()

        for name, module in self.model_4bit.named_modules():
            if not isinstance(module, torch.nn.Linear):
                continue
            if name not in ["lm_head"] + T5PreTrainedModel._keep_in_fp32_modules:
                # 4-bit parameters are packed in uint8 variables
                self.assertTrue(module.weight.dtype == torch.uint8)

    def test_generate_quality(self):
        r"""
        Test the generation quality of the quantized model and see that we are matching the expected output.
        Given that we are operating on small numbers + the testing model is relatively small, we might not get
        the same output across GPUs. So we'll generate few tokens (5-10) and check their output.
        """
        self._check_generation(self.model_4bit)

    def test_generate_quality_config(self):
        r"""
        Test that loading the model with a config whose `load_in_4bit` is set after construction is equivalent
        """
        bnb_config = BitsAndBytesConfig()
        bnb_config.load_in_4bit = True

        model_4bit_from_config = AutoModelForCausalLM.from_pretrained(
            self.model_name, quantization_config=bnb_config, device_map="auto"
        )

        self._check_generation(model_4bit_from_config)

    def test_generate_quality_dequantize(self):
        r"""
        Test that loading the model and dequantizing it produces correct results
        """
        self._check_generation(self._load_dequantized_model())

    def test_clear_quantization_trace(self):
        r"""
        Test that dequantizing the model won't leave any attribute relative to quantization in the model's configuration
        """
        dequantized = self._load_dequantized_model()

        self.assertFalse(hasattr(dequantized, "hf_quantizer"))
        self.assertFalse(hasattr(dequantized.config, "quantization_config"))
        self.assertFalse(hasattr(dequantized, "quantization_method"))
        self.assertFalse(dequantized.is_quantized)

    def test_to_device_dequantized(self):
        r"""
        Test that dequantizing the model won't prevent converting it to a different dtype
        """
        dequantized = self._load_dequantized_model()
        dequantized.to(dtype=torch.float16)

    def test_device_assignment(self):
        """Moving the 4-bit model to CPU (and back to the accelerator) must keep its memory footprint."""
        mem_before = self.model_4bit.get_memory_footprint()

        # Move to CPU
        self.model_4bit.to("cpu")
        self.assertEqual(self.model_4bit.device.type, "cpu")
        self.assertAlmostEqual(self.model_4bit.get_memory_footprint(), mem_before)

        if torch_device in ["cuda", "xpu"]:
            # Move back to the accelerator
            self.model_4bit.to(torch_device)
            self.assertEqual(self.model_4bit.device.type, torch_device)
            self.assertAlmostEqual(self.model_4bit.get_memory_footprint(), mem_before)

    def test_device_and_dtype_assignment(self):
        r"""
        Test that casting the dtype of a 4-bit model raises `ValueError`, while moving it to a
        device and freely casting or moving the fp16 model keep working.
        """
        with self.assertRaises(ValueError):
            # Tries with a `dtype`
            self.model_4bit.to(torch.float16)

        with self.assertRaises(ValueError):
            # Tries to cast the 4-bit model to float32 using `float()`
            self.model_4bit.float()

        with self.assertRaises(ValueError):
            # Tries to cast the 4-bit model to float16 using `half()`
            self.model_4bit.half()

        # Test if we did not break anything
        self.model_4bit.to(torch.device(torch_device))

        tokens = self.tokenizer(self.input_text, return_tensors="pt")

        self.model_fp16 = self.model_fp16.to(torch.float32)
        _ = self.model_fp16.generate(
            input_ids=tokens["input_ids"].to(self.model_fp16.device),
            max_new_tokens=10,
        )

        # None of the following moves / casts of the non-quantized model may raise
        if torch_device in ["cuda", "xpu"]:
            _ = self.model_fp16.to(torch_device)

        _ = self.model_fp16.to("cpu")
        _ = self.model_fp16.half()
        _ = self.model_fp16.float()

    def test_fp32_4bit_conversion(self):
        r"""
        Test whether it is possible to mix both `4bit` and `fp32` weights when using `keep_in_fp32_modules` correctly.
        """
        model = AutoModelForSeq2SeqLM.from_pretrained(
            "google-t5/t5-small",
            quantization_config=BitsAndBytesConfig(load_in_4bit=True),
            device_map="auto",
        )
        wo_weight = model.decoder.block[0].layer[2].DenseReluDense.wo.weight
        self.assertTrue(wo_weight.dtype == torch.float32)

    def test_bnb_4bit_wrong_config(self):
        r"""
        Test whether creating a bnb config with unsupported values leads to errors.
        """
        with self.assertRaises(ValueError):
            _ = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_storage="add")
@require_bitsandbytes
@require_accelerate
@require_torch
@slow
@apply_skip_if_not_implemented
class Bnb4BitT5Test(unittest.TestCase):
    """4-bit loading and generation smoke tests on two small T5 checkpoints."""

    @classmethod
    def setUpClass(cls):
        """Set the checkpoint names, the tokenizer and the prompt shared by all tests."""
        cls.model_name = "google-t5/t5-small"
        cls.dense_act_model_name = "google/flan-t5-small"  # flan-t5 uses dense-act instead of dense-relu-dense
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.input_text = "Translate in German: Hello, my dog is cute"

    def tearDown(self):
        r"""
        Collect garbage and empty the accelerator cache after each test, to free memory and avoid
        unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
        """
        gc.collect()
        backend_empty_cache(torch_device)

    def test_inference_without_keep_in_fp32(self):
        r"""
        Test that 4-bit loading and generation work when `_keep_in_fp32_modules` is disabled.
        `flan-t5-small` uses `T5DenseGatedActDense` whereas `google-t5/t5-small` uses `T5DenseReluDense`. We need to test
        both cases.
        """
        from transformers import T5ForConditionalGeneration

        modules = T5ForConditionalGeneration._keep_in_fp32_modules
        T5ForConditionalGeneration._keep_in_fp32_modules = None

        # The class attribute is global state: restore it even when loading or generation fails,
        # otherwise the override would leak into every test that runs afterwards.
        try:
            # test with `google-t5/t5-small`, then with `flan-t5-small`
            for checkpoint in (self.model_name, self.dense_act_model_name):
                model = T5ForConditionalGeneration.from_pretrained(
                    checkpoint, quantization_config=BitsAndBytesConfig(load_in_4bit=True), device_map="auto"
                )
                encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device)
                _ = model.generate(**encoded_input)
        finally:
            T5ForConditionalGeneration._keep_in_fp32_modules = modules

    def test_inference_with_keep_in_fp32(self):
        r"""
        Test whether it is possible to mix both `4bit` and `fp32` weights when using `keep_in_fp32_modules` correctly.
        `flan-t5-small` uses `T5DenseGatedActDense` whereas `google-t5/t5-small` uses `T5DenseReluDense`. We need to test
        both cases.
        """
        from transformers import T5ForConditionalGeneration

        # test with `google-t5/t5-small`
        model = T5ForConditionalGeneration.from_pretrained(
            self.model_name, quantization_config=BitsAndBytesConfig(load_in_4bit=True), device_map="auto"
        )

        # there was a bug with decoders - this test checks that it is fixed
        self.assertIsInstance(model.decoder.block[0].layer[0].SelfAttention.q, bnb.nn.Linear4bit)

        encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device)
        _ = model.generate(**encoded_input)

        # test with `flan-t5-small`
        model = T5ForConditionalGeneration.from_pretrained(
            self.dense_act_model_name, quantization_config=BitsAndBytesConfig(load_in_4bit=True), device_map="auto"
        )
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device)
        _ = model.generate(**encoded_input)
@apply_skip_if_not_implemented
class Classes4BitModelTest(Base4bitTest):
    """Checks which weights are quantized when the same checkpoint is loaded through different auto classes."""

    def setUp(self):
        """Load a base, a sequence-classification, a causal-LM and a seq2seq model, all in 4-bit."""
        super().setUp()

        # model_name
        self.model_name = "bigscience/bloom-560m"
        self.seq_to_seq_name = "google-t5/t5-small"

        def load_4bit(auto_class, checkpoint):
            # A fresh quantization config is built for every model
            return auto_class.from_pretrained(
                checkpoint, quantization_config=BitsAndBytesConfig(load_in_4bit=True), device_map="auto"
            )

        # Different types of model
        self.base_model = load_4bit(AutoModel, self.model_name)
        # Sequence classification model
        self.sequence_model = load_4bit(AutoModelForSequenceClassification, self.model_name)
        # CausalLM model
        self.model_4bit = load_4bit(AutoModelForCausalLM, self.model_name)
        # Seq2seq model
        self.seq_to_seq_model = load_4bit(AutoModelForSeq2SeqLM, self.seq_to_seq_name)

    def tearDown(self):
        r"""
        Drop the four models and empty the accelerator cache after each test, to free memory and avoid
        unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
        """
        del self.base_model
        del self.sequence_model
        del self.model_4bit
        del self.seq_to_seq_model

        gc.collect()
        backend_empty_cache(torch_device)

    def test_correct_head_class(self):
        r"""
        A simple test to check if the last modules for some classes (AutoModelForCausalLM or SequenceClassification)
        are kept in their native class.
        """
        from bitsandbytes.nn import Params4bit

        # A weight inside the transformer body is quantized
        body_weight = self.base_model.h[-1].mlp.dense_4h_to_h.weight
        self.assertTrue(body_weight.__class__ == Params4bit)

        # Other heads should be nn.Parameter
        head_weights = (
            self.model_4bit.lm_head.weight,
            self.sequence_model.score.weight,
            self.seq_to_seq_model.lm_head.weight,
        )
        for head_weight in head_weights:
            self.assertTrue(head_weight.__class__ == torch.nn.Parameter)
@apply_skip_if_not_implemented
class Pipeline4BitTest(Base4bitTest):
    """Checks that a 4-bit quantized model can be driven through `pipeline`."""

    def setUp(self):
        super().setUp()

    def tearDown(self):
        r"""
        Drop the pipeline (when one was created) and empty the accelerator cache after each test, to free
        memory and avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
        """
        if hasattr(self, "pipe"):
            del self.pipe

        gc.collect()
        backend_empty_cache(torch_device)

    def test_pipeline(self):
        r"""
        The aim of this test is to verify that the mixed 4bit is compatible with `pipeline` from transformers. Since
        we used pipeline for inference speed benchmarking we want to make sure that this feature does not break anything
        on pipeline.
        """
        # float16 isn't supported on CPU, use bfloat16 instead
        if torch_device == "cpu":
            compute_dtype = torch.bfloat16
        else:
            compute_dtype = torch.float16

        self.pipe = pipeline(
            "text-generation",
            model=self.model_name,
            model_kwargs={
                "device_map": "auto",
                "quantization_config": BitsAndBytesConfig(load_in_4bit=True),
                "dtype": compute_dtype,
            },
            max_new_tokens=self.MAX_NEW_TOKENS,
        )

        # Avoid sampling different outputs
        set_seed(42)

        pipeline_output = self.pipe(self.input_text)
        generated_text = pipeline_output[0]["generated_text"]
        self.assertIn(generated_text, self.EXPECTED_OUTPUTS)
@require_torch_multi_accelerator
@apply_skip_if_not_implemented
class Bnb4bitTestMultiAccelerator(Base4bitTest):
    """Loads the 4-bit model with an explicit two-device `device_map` and runs generation on it."""

    def setUp(self):
        super().setUp()

    def test_multi_accelerator_loading(self):
        r"""
        This tests that the model has been loaded and can be used correctly on a multi-accelerator setup.
        Let's just try to load a model on 2 accelerators and see if it works. The model we test has ~2GB of total, 3GB should suffice
        """
        # Hand-written placement: every entry is pinned to device 0 or device 1.
        placement = {
            "transformer.word_embeddings": 0,
            "transformer.word_embeddings_layernorm": 0,
            "lm_head": 0,
            "transformer.h.0": 0,
            "transformer.h.1": 0,
            "transformer.h.2": 0,
            "transformer.h.3": 0,
            "transformer.h.4": 0,
            "transformer.h.5": 0,
            "transformer.h.6": 0,
            "transformer.h.7": 0,
            "transformer.h.8": 0,
            "transformer.h.9": 0,
            "transformer.h.10": 1,
            "transformer.h.11": 1,
            "transformer.h.12": 1,
            "transformer.h.13": 1,
            "transformer.h.14": 1,
            "transformer.h.15": 1,
            "transformer.h.16": 1,
            "transformer.h.17": 0,
            "transformer.h.18": 0,
            "transformer.h.19": 0,
            "transformer.h.20": 0,
            "transformer.h.21": 0,
            "transformer.h.22": 0,
            "transformer.h.23": 1,
            "transformer.ln_f": 0,
        }

        sharded_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=BitsAndBytesConfig(load_in_4bit=True),
            device_map=placement,
        )

        # Check correct device map
        devices_in_use = set(sharded_model.hf_device_map.values())
        self.assertEqual(devices_in_use, {0, 1})

        # Check that inference pass works on the model
        tokens = self.tokenizer(self.input_text, return_tensors="pt")
        prompt_ids = tokens["input_ids"].to(torch_device)
        generated = sharded_model.generate(input_ids=prompt_ids, max_new_tokens=10)

        decoded = self.tokenizer.decode(generated[0], skip_special_tokens=True)
        self.assertIn(decoded, self.EXPECTED_OUTPUTS)
@apply_skip_if_not_implemented
class Bnb4BitTestTraining(Base4bitTest):
    """Checks that gradients reach LoRA adapters attached to a frozen 4-bit OPT model."""

    def setUp(self):
        # The model name is set before delegating to the parent `setUp`.
        self.model_name = "facebook/opt-350m"
        super().setUp()

    def test_training(self):
        """
        Freeze a 4-bit model, wrap the attention q/k/v projections in `LoRALayer`, run one forward/backward
        pass under autocast and check that only the adapters received a gradient.
        """
        # Step 1: freeze all parameters
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=BitsAndBytesConfig(load_in_4bit=True),
            revision="refs/pr/40",
        )

        if torch_device not in ("cuda", "xpu"):
            self.assertTrue(all(p.device.type == "cpu" for p in model.parameters()))
        else:
            # The device map is only checked when the model exposes one.
            device_placement = getattr(model, "hf_device_map", None)
            if device_placement is not None:
                self.assertEqual(
                    set(device_placement.values()),
                    {backend_torch_accelerator_module(torch_device).current_device()},
                )

        for p in model.parameters():
            p.requires_grad = False  # freeze the model - train adapters later
            if p.ndim == 1:
                # cast the small parameters (e.g. layernorm) to fp32 for stability
                p.data = p.data.to(torch.float32)

        # Step 2: add adapters
        for _name, submodule in model.named_modules():
            if not isinstance(submodule, OPTAttention):
                continue
            for proj_name in ("q_proj", "k_proj", "v_proj"):
                wrapped = LoRALayer(getattr(submodule, proj_name), rank=16)
                setattr(submodule, proj_name, wrapped)

        # Step 3: dummy batch
        dummy_batch = self.tokenizer("Test batch ", return_tensors="pt").to(torch_device)

        # Step 4: Check if the gradient is not None
        with torch.autocast(torch_device):
            outputs = model.forward(**dummy_batch)
            outputs.logits.norm().backward()

        for submodule in model.modules():
            if isinstance(submodule, LoRALayer):
                adapter_grad = submodule.adapter[1].weight.grad
                self.assertTrue(adapter_grad is not None)
                self.assertTrue(submodule.adapter[1].weight.grad.norm().item() > 0)
            elif isinstance(submodule, nn.Embedding):
                # Frozen embeddings must not have accumulated any gradient.
                self.assertTrue(submodule.weight.grad is None)
@apply_skip_if_not_implemented
class Bnb4BitGPT2Test(Bnb4BitTest):
    """Re-runs the tests inherited from `Bnb4BitTest` with the GPT-2 XL checkpoint."""

    # Overrides of the parent's class attributes for this checkpoint.
    EXPECTED_RELATIVE_DIFFERENCE = 3.4483983748189027
    model_name = "openai-community/gpt2-xl"
@apply_skip_if_not_implemented
class Bnb4BitLlamaTest(Bnb4BitTest):
    """Re-runs the tests inherited from `Bnb4BitTest` with the TinyLlama chat checkpoint."""

    # Overrides of the parent's class attributes for this checkpoint.
    EXPECTED_RELATIVE_DIFFERENCE = 2.9461410686392764
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
@require_bitsandbytes
@require_accelerate
@require_torch
@slow
@apply_skip_if_not_implemented
class BaseSerializationTest(unittest.TestCase):
    """Save/reload round-trip checks for a model quantized in 4-bit."""

    model_name = "facebook/opt-125m"
    input_text = "Mars colonists' favorite meals are"

    def tearDown(self):
        gc.collect()
        backend_empty_cache(torch_device)

    def test_serialization(self, quant_type="nf4", double_quant=True):
        r"""
        Test whether it is possible to serialize a model in 4-bit. Uses most typical params as default.
        See ExtendedSerializationTest class for more params combinations.
        """
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        self.quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type=quant_type,
            bnb_4bit_use_double_quant=double_quant,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )

        # for now, we should be able to fetch those in from_pretrained directly
        revision = "refs/pr/49" if self.model_name == "facebook/opt-125m" else "main"

        original_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=self.quantization_config,
            device_map=torch_device,
            revision=revision,
        )

        with tempfile.TemporaryDirectory() as save_dir:
            original_model.save_pretrained(save_dir)

            saved_config = AutoConfig.from_pretrained(save_dir)
            self.assertTrue(hasattr(saved_config, "quantization_config"))

            reloaded_model = AutoModelForCausalLM.from_pretrained(save_dir, device_map=torch_device)

        # checking quantized linear module weight
        quantized_weight = get_some_linear_layer(reloaded_model).weight
        self.assertTrue(quantized_weight.__class__ == bnb.nn.Params4bit)
        self.assertTrue(hasattr(quantized_weight, "quant_state"))
        self.assertTrue(quantized_weight.quant_state.__class__ == bnb.functional.QuantState)

        # checking memory footprint
        footprint_ratio = original_model.get_memory_footprint() / reloaded_model.get_memory_footprint()
        self.assertAlmostEqual(footprint_ratio, 1, places=2)

        # Matching all parameters and their quant_state items:
        original_params = dict(original_model.named_parameters())
        reloaded_params = dict(reloaded_model.named_parameters())
        self.assertTrue(original_params.keys() == reloaded_params.keys())

        for param_name, expected in original_params.items():
            actual = reloaded_params[param_name]
            self.assertTrue(expected.shape == actual.shape)
            self.assertTrue(expected.device.type == actual.device.type)
            self.assertTrue(expected.device == actual.device)
            self.assertTrue(expected.dtype == actual.dtype)
            self.assertTrue(torch.equal(expected, actual.to(expected.device)))

            if not isinstance(expected, bnb.nn.modules.Params4bit):
                continue

            # Quant-state entries are compared pairwise, in the order `as_dict()` yields them.
            expected_state = expected.quant_state.as_dict().values()
            actual_state = actual.quant_state.as_dict().values()
            for v0, v1 in zip(expected_state, actual_state):
                if not isinstance(v0, torch.Tensor):
                    self.assertTrue(v0 == v1)
                elif v0.numel() != 0:
                    # The absmax will not be saved in the quant_state when using NF4 in CPU
                    self.assertTrue(torch.equal(v0, v1.to(v0.device)))

        # comparing forward() outputs
        encoded_input = tokenizer(self.input_text, return_tensors="pt", return_token_type_ids=False).to(torch_device)
        logits_original = original_model(**encoded_input)["logits"]
        logits_reloaded = reloaded_model(**encoded_input)["logits"]
        torch.testing.assert_close(logits_original, logits_reloaded, rtol=0.05, atol=0.05)

        # comparing generate() outputs
        encoded_input = tokenizer(self.input_text, return_tensors="pt", return_token_type_ids=False).to(torch_device)
        sequences_original = original_model.generate(**encoded_input, max_new_tokens=10)
        sequences_reloaded = reloaded_model.generate(**encoded_input, max_new_tokens=10)

        texts_original = [tokenizer.decode(seq, skip_special_tokens=True) for seq in sequences_original]
        texts_reloaded = [tokenizer.decode(seq, skip_special_tokens=True) for seq in sequences_reloaded]
        self.assertEqual(texts_original, texts_reloaded)
@apply_skip_if_not_implemented
class ExtendedSerializationTest(BaseSerializationTest):
    """
    tests more combinations of parameters
    """

    # nf4 with double quantization is already covered by the inherited test_serialization() defaults.

    def test_nf4_single_safe(self):
        self.test_serialization(double_quant=False, quant_type="nf4")

    def test_fp4_single_safe(self):
        self.test_serialization(double_quant=False, quant_type="fp4")

    def test_fp4_double_safe(self):
        self.test_serialization(double_quant=True, quant_type="fp4")
class BloomSerializationTest(BaseSerializationTest):
    """
    Runs the inherited BaseSerializationTest checks, unchanged, on a Bloom family checkpoint.
    """

    # Only the checkpoint differs from the parent class.
    model_name = "bigscience/bloom-560m"
class GPTSerializationTest(BaseSerializationTest):
    """
    Runs the inherited BaseSerializationTest checks, unchanged, on a GPT family checkpoint.
    """

    # Only the checkpoint differs from the parent class.
    model_name = "openai-community/gpt2-xl"
class LlamaSerializationTest(BaseSerializationTest):
    """
    Runs the inherited BaseSerializationTest checks, unchanged, on a Llama family checkpoint.
    """

    # Only the checkpoint differs from the parent class.
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
@require_bitsandbytes
@require_accelerate
@slow
@apply_skip_if_not_implemented
class Bnb4BitTestBasicConfigTest(unittest.TestCase):
    """Validation checks on `BitsAndBytesConfig` attribute assignment."""

    def test_set_load_in_8_bit(self):
        """Turning on `load_in_8bit` on a config created with `load_in_4bit=True` must raise a ValueError."""
        four_bit_config = BitsAndBytesConfig(load_in_4bit=True)

        expected_message = "load_in_4bit and load_in_8bit are both True"
        with self.assertRaisesRegex(ValueError, expected_message):
            four_bit_config.load_in_8bit = True
@require_bitsandbytes
@require_accelerate
@slow
@apply_skip_if_not_implemented
class Bnb4bitCompile(unittest.TestCase):
    """Generation with a static cache on a 4-bit model, with and without the compile flag forced on."""

    model_name = "hf-internal-testing/tiny-random-LlamaForCausalLM"
    input_text = "Hello my name is"

    def setUp(self):
        # Models and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model_4bit = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=BitsAndBytesConfig(load_in_4bit=True),
        )

    @pytest.mark.torch_compile_test
    def test_generate_compile(self):
        """
        Generate once with the quantizer untouched, then force `is_compileable` to True on the quantizer and
        expect the same generation call to raise.
        """
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt")

        def _generate_with_static_cache():
            # Both generation calls in this test use exactly these arguments.
            return self.model_4bit.generate(
                input_ids=encoded_input["input_ids"].to(self.model_4bit.device),
                max_new_tokens=10,
                cache_implementation="static",
            )

        # if nothing is set, compile will be disabled for bnb
        _generate_with_static_cache()

        # NOTE(review): the attribute override sits inside the `assertRaises(Exception)` block, so an error
        # raised by `object.__setattr__` itself would also satisfy the assertion - confirm this is intended.
        with self.assertRaises(Exception):
            # overwrite property
            object.__setattr__(self.model_4bit.hf_quantizer, "is_compileable", True)
            _generate_with_static_cache()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,176 @@
import gc
import unittest
import warnings
from transformers import AutoModelForCausalLM
from transformers.testing_utils import backend_empty_cache, require_compressed_tensors, require_torch, torch_device
from transformers.utils import is_torch_available
from transformers.utils.quantization_config import CompressedTensorsConfig
if is_torch_available():
import torch
@require_compressed_tensors
@require_torch
class StackCompressedModelTest(unittest.TestCase):
    """Compares compressed checkpoints with their uncompressed counterparts when loaded decompressed."""

    # (compressed, uncompressed) checkpoint pairs
    compressed_uncompressed_model_stubs = [
        (
            "nm-testing/llama2.c-stories42M-gsm8k-quantized-only-compressed",
            "nm-testing/llama2.c-stories42M-gsm8k-quantized-only-uncompressed",
        ),
    ]
    # Flat list of every stub, for tests that treat each checkpoint on its own.
    model_stubs = [stub for stub_pair in compressed_uncompressed_model_stubs for stub in stub_pair]
    prompt = "Paris is the capital of which country?"

    def tearDown(self):
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_compressed_uncompressed_model_shapes(self):
        """
        Verify that the weights of an uncompressed model and its decompressed compressed counterpart match.
        Note: Weights for sparsely compressed models may differ due to packing.
        """

        def _resolve_dotted(root, dotted_path):
            # Walk `dotted_path` one attribute at a time; None as soon as a segment is missing.
            current = root
            for segment in dotted_path.split("."):
                if not hasattr(current, segment):
                    return None
                current = getattr(current, segment)
            return current

        for compressed_model, uncompressed_model in self.compressed_uncompressed_model_stubs:
            with self.subTest(compressed_model=compressed_model, uncompressed_model=uncompressed_model):
                reference = AutoModelForCausalLM.from_pretrained(
                    uncompressed_model,
                    device_map="auto",
                    dtype="auto",
                    quantization_config=CompressedTensorsConfig(run_compressed=False),
                )
                decompressed = AutoModelForCausalLM.from_pretrained(
                    compressed_model,
                    device_map="auto",
                    dtype="auto",
                    quantization_config=CompressedTensorsConfig(run_compressed=False),
                )

                for module_name, leaf in reference.named_modules():
                    # Only modules without children are compared.
                    if len(list(leaf.children())) > 0:
                        continue

                    counterpart = _resolve_dotted(decompressed, module_name)
                    if counterpart is None or not hasattr(leaf, "weight"):
                        continue

                    torch.testing.assert_close(
                        leaf.weight.to(torch_device),
                        counterpart.weight.to(torch_device),
                        atol=0.2,
                        rtol=1e-5,
                        msg=f"Weight mismatch for module '{module_name}'.",
                    )

    def test_no_warnings_for_all_models(self):
        """
        Confirm that loading any model using compressed tensors does not trigger
        warnings about missing or unexpected keys.
        """
        for model_stub in self.model_stubs:
            with self.subTest(model_stub=model_stub):
                with warnings.catch_warnings(record=True) as recorded:
                    # Record every warning, including ones that would normally be shown only once.
                    warnings.simplefilter("always")
                    AutoModelForCausalLM.from_pretrained(
                        model_stub,
                        device_map="auto",
                        dtype="auto",
                        quantization_config=CompressedTensorsConfig(run_compressed=False),
                    )
                    for entry in recorded:
                        self.assertNotIn(
                            "missing keys",
                            str(entry.message).lower(),
                            f"'missing keys' found in warnings for model {model_stub}",
                        )
                        self.assertNotIn(
                            "unexpected keys",
                            str(entry.message).lower(),
                            f"'unexpected keys' found in warnings for model {model_stub}",
                        )
@require_compressed_tensors
@require_torch
class RunCompressedTest(unittest.TestCase):
    """Checks the effect of `CompressedTensorsConfig(run_compressed=...)` on loaded checkpoints."""

    tinyllama_w4a16 = "nm-testing/tinyllama-w4a16-compressed"
    tinyllama_w8a8 = "nm-testing/tinyllama-w8a8-compressed"
    prompt = "Paris is the capital of which country?"
    stubs = [tinyllama_w4a16, tinyllama_w8a8]

    def tearDown(self):
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    @staticmethod
    def _count_compressed_modules(model):
        """Return how many modules of `model` have `quantization_status == QuantizationStatus.COMPRESSED`."""
        from compressed_tensors import QuantizationStatus

        # Modules without a `quantization_status` attribute are simply not counted.
        return sum(
            1 for m in model.modules() if getattr(m, "quantization_status", None) == QuantizationStatus.COMPRESSED
        )

    def test_default_run_compressed__True(self):
        """Loading without an explicit quantization config must leave at least one module COMPRESSED."""
        for stub in self.stubs:
            with self.subTest(stub=stub):
                model = AutoModelForCausalLM.from_pretrained(
                    stub,
                )
                # some linear modules are not compressed - ex. lm_head
                self.assertGreater(self._count_compressed_modules(model), 0)

    def test_default_run_compressed__False(self):
        """Loading with `run_compressed=False` must leave no module in the COMPRESSED state."""
        quantization_config = CompressedTensorsConfig(run_compressed=False)

        for stub in self.stubs:
            with self.subTest(stub=stub):
                model = AutoModelForCausalLM.from_pretrained(
                    stub,
                    quantization_config=quantization_config,
                )
                # No modules should be in COMPRESSED state
                self.assertEqual(self._count_compressed_modules(model), 0)

    def test_run_compressed_outputs_match(self):
        """Check that run_compressed=True/False output are the same"""
        from transformers import AutoTokenizer

        quantization_config = CompressedTensorsConfig(run_compressed=False)

        for stub in self.stubs:
            with self.subTest(stub=stub):
                tokenizer = AutoTokenizer.from_pretrained(stub)
                input_ids = tokenizer(self.prompt, return_tensors="pt").input_ids

                model_run_compressed__True = AutoModelForCausalLM.from_pretrained(
                    stub,
                )
                output_rc_true = model_run_compressed__True.generate(input_ids, max_new_tokens=100)

                model_run_compressed__False = AutoModelForCausalLM.from_pretrained(
                    stub,
                    quantization_config=quantization_config,
                )
                output_rc_false = model_run_compressed__False.generate(input_ids, max_new_tokens=100)

                # assertEqual shows both decoded strings when they differ.
                self.assertEqual(tokenizer.decode(output_rc_true[0]), tokenizer.decode(output_rc_false[0]))

View File

@@ -0,0 +1,88 @@
import gc
import unittest
from transformers import AutoModelForCausalLM, AutoTokenizer, CompressedTensorsConfig
from transformers.testing_utils import backend_empty_cache, require_compressed_tensors, require_torch, torch_device
from transformers.utils import is_torch_available
if is_torch_available():
import torch
@require_compressed_tensors
@require_torch
class CompressedTensorsTest(unittest.TestCase):
    """Config checks and perplexity checks for compressed-tensors TinyLlama checkpoints."""

    tinyllama_w4a16 = "nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-compressed"
    tinyllama_int8 = "nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-compressed"
    tinyllama_fp8 = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-compressed"
    tinyllama_w8a16 = "nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-compressed"

    prompt = "The capital of France is Paris, the capital of Germany is Berlin"

    def tearDown(self):
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_config_args(self):
        """A raw quantization scheme in `config_groups` is rejected; the other constructor call is just run."""
        # passing quant scheme directly is not allowed
        invalid_groups = {"weights": {"num_bits": 8}}
        with self.assertRaises(ValueError):
            CompressedTensorsConfig(config_groups=invalid_groups)

        CompressedTensorsConfig(
            config_groups={"FP8": ["Linear"]},
            ignore=["lm_head"],
            quantization_status="frozen",
        )

    def test_config_to_from_dict(self):
        """A config rebuilt from `to_dict()` output must hold a `QuantizationConfig` instance."""
        from compressed_tensors import QuantizationConfig

        original = CompressedTensorsConfig(config_groups={"FP8": ["Linear"]})
        rebuilt = CompressedTensorsConfig.from_dict(original.to_dict())

        self.assertIsInstance(rebuilt.quantization_config, QuantizationConfig)

    def test_tinyllama_w4a16(self):
        self._test_quantized_model(self.tinyllama_w4a16, 20.0)

    def test_tinyllama_int8(self):
        self._test_quantized_model(self.tinyllama_int8, 30.0)

    def test_tinyllama_fp8(self):
        self._test_quantized_model(self.tinyllama_fp8, 20.0)

    def test_tinyllama_w8a16(self):
        self._test_quantized_model(self.tinyllama_w8a16, 20.0)

    def _test_quantized_model(self, model_name: str, expected_perplexity: float):
        """
        Load `model_name`, check its quantization config and scales, then check that the perplexity on
        `self.prompt` does not exceed `expected_perplexity`.
        """
        # load model
        quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model_device = quantized_model.device

        # check config
        self.assertIsNotNone(
            quantized_model.config.quantization_config,
            "quantization_config should not be None",
        )

        # check scales: at least one "scale" entry must contain a value other than 1.0
        has_nontrivial_scale = any(
            "scale" in key and not torch.all(tensor == 1.0)
            for key, tensor in quantized_model.state_dict().items()
        )
        self.assertTrue(
            has_nontrivial_scale,
            "quantized model should load a non-trivial scale into the state dict",
        )

        # compute outputs with loss (the input ids are passed as the labels)
        encoded = tokenizer(self.prompt, return_tensors="pt").to(model_device)
        with torch.no_grad():
            outputs = quantized_model(**encoded, labels=encoded["input_ids"])

        # check perplexity
        self.assertLessEqual(torch.exp(outputs.loss), expected_perplexity)

View File

@@ -0,0 +1,165 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, EetqConfig, OPTForCausalLM
from transformers.testing_utils import (
backend_empty_cache,
require_accelerate,
require_kernels,
require_torch_gpu,
require_torch_multi_gpu,
slow,
torch_device,
)
from transformers.utils import is_torch_available
if is_torch_available():
import torch
@require_torch_gpu
class EetqConfigTest(unittest.TestCase):
    """Round-trip checks between `EetqConfig` objects and plain dictionaries."""

    def test_to_dict(self):
        """
        Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
        """
        quantization_config = EetqConfig()
        config_to_dict = quantization_config.to_dict()

        for key, value in config_to_dict.items():
            self.assertEqual(getattr(quantization_config, key), value)

    def test_from_dict(self):
        """
        Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict
        """
        # Named `config_dict` rather than `dict` so the builtin is not shadowed.
        config_dict = {"modules_to_not_convert": ["lm_head.weight"], "quant_method": "eetq", "weights": "int8"}
        quantization_config = EetqConfig.from_dict(config_dict)

        self.assertEqual(config_dict["modules_to_not_convert"], quantization_config.modules_to_not_convert)
        self.assertEqual(config_dict["quant_method"], quantization_config.quant_method)
        self.assertEqual(config_dict["weights"], quantization_config.weights)
@slow
@require_torch_gpu
@require_accelerate
@require_kernels
class EetqTest(unittest.TestCase):
    """Conversion, generation and save/reload checks for a model quantized with EETQ."""

    model_name = "facebook/opt-350m"

    input_text = "What are we having for dinner?"
    max_new_tokens = 9

    EXPECTED_OUTPUT = "What are we having for dinner?\nI'm having a steak and a salad"

    device_map = "cuda"

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        """
        Setup quantized model
        """
        quantization_config = EetqConfig(weights="int8")
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name, device_map=cls.device_map, quantization_config=quantization_config
        )

    def tearDown(self):
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_quantized_model_conversion(self):
        """
        Simple test that checks if the quantized model has been converted properly
        """
        from transformers.integrations import replace_with_eetq_linear
        from transformers.integrations.eetq import EetqLinear

        model_id = "facebook/opt-350m"
        config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5")

        # The model is built on the meta device, so no weights are materialized.
        with torch.device("meta"):
            model = OPTForCausalLM(config)

        nb_linears = sum(isinstance(module, torch.nn.Linear) for module in model.modules())

        model = replace_with_eetq_linear(model)
        nb_eetq_linear = sum(isinstance(module, EetqLinear) for module in model.modules())

        self.assertEqual(nb_linears, nb_eetq_linear)

        # Try with `modules_to_not_convert`
        with torch.device("meta"):
            model = OPTForCausalLM(config)
        model = replace_with_eetq_linear(model, modules_to_not_convert=["fc1"])
        nb_eetq_linear = sum(isinstance(module, EetqLinear) for module in model.modules())

        # 24 corresponds to the `fc1` layers that were excluded from the conversion.
        self.assertEqual(nb_linears - 24, nb_eetq_linear)

    def test_quantized_model(self):
        """
        Simple test that checks if the quantized model is working properly
        """
        inputs = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        output = self.quantized_model.generate(**inputs, max_new_tokens=self.max_new_tokens)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def test_save_pretrained(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)

            model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)

            inputs = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

            output = model.generate(**inputs, max_new_tokens=self.max_new_tokens)
            self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @require_torch_multi_gpu
    def test_quantized_model_multi_gpu(self):
        """
        Simple test that checks if the quantized model is working properly with multiple GPUs
        set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUs
        """
        inputs = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
        quantization_config = EetqConfig()
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name, device_map="auto", quantization_config=quantization_config
        )
        # assertEqual reports the actual set of devices on failure.
        self.assertEqual(set(quantized_model.hf_device_map.values()), {0, 1})

        output = quantized_model.generate(**inputs, max_new_tokens=self.max_new_tokens)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

View File

@@ -0,0 +1,291 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest
from typing import Any
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, FbgemmFp8Config, OPTForCausalLM
from transformers.testing_utils import (
backend_empty_cache,
require_accelerate,
require_deterministic_for_xpu,
require_torch_accelerator,
require_torch_multi_accelerator,
slow,
torch_device,
)
from transformers.utils import (
is_fbgemm_gpu_available,
is_kernels_available,
is_torch_available,
is_torch_xpu_available,
)
if is_torch_available():
import torch
@require_torch_accelerator
class FbgemmFp8ConfigTest(unittest.TestCase):
    """Round-trip checks between `FbgemmFp8Config` objects and plain dictionaries."""

    def test_to_dict(self):
        """
        Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
        """
        quantization_config = FbgemmFp8Config()
        config_to_dict = quantization_config.to_dict()

        for key, value in config_to_dict.items():
            self.assertEqual(getattr(quantization_config, key), value)

    def test_from_dict(self):
        """
        Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict
        """
        # Named `config_dict` rather than `dict` so the builtin is not shadowed.
        config_dict = {"modules_to_not_convert": ["lm_head.weight"], "quant_method": "fbgemm_fp8"}
        quantization_config = FbgemmFp8Config.from_dict(config_dict)

        self.assertEqual(config_dict["modules_to_not_convert"], quantization_config.modules_to_not_convert)
        self.assertEqual(config_dict["quant_method"], quantization_config.quant_method)
@slow
@require_torch_accelerator
@unittest.skipIf(
not (is_torch_xpu_available() and is_kernels_available()) and not is_fbgemm_gpu_available(),
"test requires fbgemm-gpu or (xpu and kernels)",
)
@require_accelerate
class FbgemmFp8Test(unittest.TestCase):
model_name = "meta-llama/Meta-Llama-3-8B"
input_text = "What are we having for dinner?"
max_new_tokens = 9
EXPECTED_OUTPUT = set[Any](
[
"What are we having for dinner?\nI'm having a steak and a salad",
"What are we having for dinner? I dont know. What are we having",
"What are we having for dinner? I dont know, what are you having",
]
)
device_map = "xpu" if is_torch_xpu_available() else "cuda"
offload_device_map = {
"model.embed_tokens": 0,
"model.layers.0": 0,
"model.layers.1": 0,
"model.layers.2": 0,
"model.layers.3": 0,
"model.layers.4": 0,
"model.layers.5": 0,
"model.layers.6": 0,
"model.layers.7": 0,
"model.layers.8": 0,
"model.layers.9": 0,
"model.layers.10": 0,
"model.layers.11": 0,
"model.layers.12": 0,
"model.layers.13": 0,
"model.layers.14": 0,
"model.layers.15": 0,
"model.layers.16": "cpu",
"model.layers.17": "cpu",
"model.layers.18": "cpu",
"model.layers.19": "cpu",
"model.layers.20": "disk",
"model.layers.21": "disk",
"model.layers.22": "disk",
"model.layers.23": "disk",
"model.layers.24": "disk",
"model.layers.25": "disk",
"model.layers.26": "disk",
"model.layers.27": "disk",
"model.layers.28": "disk",
"model.layers.29": "disk",
"model.layers.30": "disk",
"model.layers.31": "disk",
"model.norm": "disk",
"lm_head": "disk",
}
# called only once for all test in this class
@classmethod
def setUpClass(cls):
"""
Setup quantized model
"""
quantization_config = FbgemmFp8Config()
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
cls.quantized_model = AutoModelForCausalLM.from_pretrained(
cls.model_name, device_map=cls.device_map, quantization_config=quantization_config
)
def tearDown(self):
gc.collect()
backend_empty_cache(torch_device)
gc.collect()
def test_quantized_model_conversion(self):
"""
Simple test that checks if the quantized model has been converted properly
"""
from transformers.integrations import FbgemmFp8Linear, replace_with_fbgemm_fp8_linear
model_id = "facebook/opt-350m"
config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5")
quantization_config = FbgemmFp8Config()
with torch.device("meta"):
model = OPTForCausalLM(config)
nb_linears = 0
for module in model.modules():
if isinstance(module, torch.nn.Linear):
nb_linears += 1
model = replace_with_fbgemm_fp8_linear(model, quantization_config=quantization_config)
nb_fbgemm_linear = 0
for module in model.modules():
if isinstance(module, FbgemmFp8Linear):
nb_fbgemm_linear += 1
self.assertEqual(nb_linears, nb_fbgemm_linear)
with torch.device("meta"):
model = OPTForCausalLM(config)
quantization_config = FbgemmFp8Config(modules_to_not_convert=["fc1"])
model = replace_with_fbgemm_fp8_linear(
model, modules_to_not_convert=["fc1"], quantization_config=quantization_config
)
nb_fbgemm_linear = 0
for module in model.modules():
if isinstance(module, FbgemmFp8Linear):
nb_fbgemm_linear += 1
self.assertEqual(nb_linears - 24, nb_fbgemm_linear)
@require_deterministic_for_xpu
def test_quantized_model(self):
"""
Simple test that checks if the quantized model is working properly
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
self.assertTrue(self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUT)
@require_deterministic_for_xpu
def test_save_pretrained(self):
"""
Simple test that checks if the quantized model is working properly after being saved and loaded
"""
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname)
model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
self.assertTrue(self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUT)
def test_change_loading_attributes(self):
"""
Simple test that checks if the quantized model is working properly after being saved and loaded
"""
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname)
quantization_config = FbgemmFp8Config(activation_scale_ub=1000.0)
model = AutoModelForCausalLM.from_pretrained(
tmpdirname, device_map=self.device_map, quantization_config=quantization_config
)
self.assertEqual(model.model.layers[1].mlp.down_proj.input_scale_ub.item(), 1000.0)
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
self.assertTrue(self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUT)
@require_torch_multi_accelerator
def test_quantized_model_multi_gpu(self):
"""
Simple test that checks if the quantized model is working properly with multiple GPUs
set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUs
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
quantization_config = FbgemmFp8Config()
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name,
device_map="auto",
quantization_config=quantization_config,
max_memory={0: "6GB", 1: "6GB"},
)
self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})
output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
self.assertTrue(self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUT)
def test_quantized_model_offload(self):
"""
Simple test that checks if the quantized model returns an error when loading with cpu/disk offloaded
"""
quantization_config = FbgemmFp8Config()
with self.assertRaisesRegex(
ValueError, "You are attempting to load an FP8 model with a device_map that contains a CPU or disk device."
):
AutoModelForCausalLM.from_pretrained(
self.model_name, device_map=self.offload_device_map, quantization_config=quantization_config
)
@require_deterministic_for_xpu
def test_save_pretrained_offload(self):
    """
    Simple test that checks if the saved quantized model is working properly with cpu/disk offload.
    """
    with tempfile.TemporaryDirectory() as tmpdirname:
        self.quantized_model.save_pretrained(tmpdirname)

        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
        # Reload the already-quantized checkpoint with the offloading device map.
        quantized_model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.offload_device_map)
        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
        # `assertIn` prints the decoded text when it is not an expected output.
        self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@require_torch_multi_accelerator
@require_deterministic_for_xpu
def test_save_pretrained_multi_gpu(self):
    """
    Simple test that checks if the quantized model is working properly after being saved and
    loaded back across two devices.
    """
    with tempfile.TemporaryDirectory() as tmpdirname:
        self.quantized_model.save_pretrained(tmpdirname)

        model = AutoModelForCausalLM.from_pretrained(
            tmpdirname, device_map="auto", max_memory={0: "6GB", 1: "6GB"}
        )
        # The reloaded model has to be spread over both devices.
        self.assertEqual(set(model.hf_device_map.values()), {0, 1})

        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
        output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
        self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

View File

@@ -0,0 +1,455 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest
from contextlib import ExitStack, contextmanager
from unittest.mock import patch
from parameterized import parameterized
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, FineGrainedFP8Config, OPTForCausalLM
from transformers.quantizers.quantizer_finegrained_fp8 import FineGrainedFP8HfQuantizer
from transformers.testing_utils import (
backend_empty_cache,
get_device_properties,
require_accelerate,
require_torch_accelerator,
require_torch_multi_accelerator,
slow,
torch_device,
)
from transformers.utils import is_torch_available
if is_torch_available():
import torch
@contextmanager
def _patch_no_accelerator():
    """
    Context manager that makes the accelerator availability checks report `False`.

    While active, `torch.cuda.is_available` is patched to return `False`; when `torch` exposes an
    `xpu` attribute, `torch.xpu.is_available` and the `is_torch_xpu_available` name used by the
    fine-grained FP8 quantizer module are patched the same way. All patches are undone on exit.
    """
    # NOTE(review): the nesting of the last patch under the `hasattr` check is reconstructed — confirm.
    with ExitStack() as stack:
        stack.enter_context(patch("torch.cuda.is_available", return_value=False))
        if hasattr(torch, "xpu"):
            stack.enter_context(patch("torch.xpu.is_available", return_value=False))
            stack.enter_context(
                patch("transformers.quantizers.quantizer_finegrained_fp8.is_torch_xpu_available", return_value=False)
            )
        yield
@require_torch_accelerator
class FineGrainedFP8ConfigTest(unittest.TestCase):
    """Round-trip checks between `FineGrainedFP8Config` objects and plain dictionaries."""

    def test_to_dict(self):
        """
        Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
        """
        quantization_config = FineGrainedFP8Config()
        config_to_dict = quantization_config.to_dict()

        for key in config_to_dict:
            self.assertEqual(getattr(quantization_config, key), config_to_dict[key])

    def test_from_dict(self):
        """
        Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict
        """
        # Named `config_dict` so the builtin `dict` is not shadowed.
        config_dict = {"modules_to_not_convert": ["lm_head.weight"], "quant_method": "fp8"}
        quantization_config = FineGrainedFP8Config.from_dict(config_dict)

        self.assertEqual(config_dict["modules_to_not_convert"], quantization_config.modules_to_not_convert)
        self.assertEqual(config_dict["quant_method"], quantization_config.quant_method)
@slow
@require_accelerate
@require_torch_accelerator
@unittest.skipIf(
    get_device_properties()[0] == "cuda"
    and (get_device_properties()[1] < 8 or (get_device_properties()[1] == 8 and get_device_properties()[2] < 9)),
    "Skipping FP8QuantizerTest because it is not supported on GPU with capability < 8.9",
)
class FP8QuantizerTest(unittest.TestCase):
    """
    Tests for fine-grained FP8 quantization: module conversion, generation, dequantization,
    save/load, multi-accelerator placement, offload handling and module size computation.
    """

    model_name = "meta-llama/Llama-3.2-1B"
    quantized_model_name = "hf-internal-testing/Llama-3.2-1B-Instruct-fp8"

    input_text = "Once upon a time"
    max_new_tokens = 10
    # Generations accepted for the quantized model
    EXPECTED_OUTPUTS = {
        "Once upon a time, there was a little girl who loved to play",
        "Once upon a time, there was a man who was very rich.",
    }
    EXPECTED_DEQUANTIZED_OUTPUT = "Once upon a time, in a small village nestled in the rolling hills"

    device_map = torch_device

    # Device map that places part of the model on the cpu; used by the offload tests
    offload_device_map = {
        "model.embed_tokens": 0,
        "model.layers.0": 0,
        "model.layers.1": 0,
        "model.layers.2": 0,
        "model.layers.3": 0,
        "model.layers.4": 0,
        "model.layers.5": 0,
        "model.layers.6": 0,
        "model.layers.7": "cpu",
        "model.layers.8": "cpu",
        "model.layers.9": "cpu",
        "model.layers.10": "cpu",
        "model.layers.11": "cpu",
        "model.layers.12": "cpu",
        "model.layers.13": "cpu",
        "model.layers.14": "cpu",
        "model.layers.15": "cpu",
        "model.rotary_emb": "cpu",
        "model.norm": "cpu",
        "lm_head": 0,
    }

    @classmethod
    def setUpClass(cls):
        """
        Setup quantized model
        """
        cls.quantization_config = FineGrainedFP8Config()
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name, device_map=cls.device_map, quantization_config=cls.quantization_config
        )

    def setUp(self):
        """
        Clear also on each setup (e.g. if a different model is used than the base cls one)
        """
        # NOTE: this hook must be spelled `setUp`; unittest never calls a method named `setup`.
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def tearDown(self):
        """Release memory held by models created inside the test."""
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    @parameterized.expand(
        [
            "hf-internal-testing/tiny-random-Qwen3MoeForCausalLM",
            "hf-internal-testing/tiny-random-MixtralForCausalLM",
        ]
    )
    def test_moe_conversion_doesnt_raise(self, model_id):
        """Loading a tiny MoE checkpoint with a (32, 32) weight block size must not raise."""
        quantization_config = FineGrainedFP8Config(weight_block_size=(32, 32))
        AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)

    def test_quantized_model_conversion(self):
        """
        Simple test that checks if the quantized model has been converted properly
        """
        from transformers.integrations import FP8Linear, replace_with_fp8_linear

        model_id = "facebook/opt-350m"
        config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5")
        quantization_config = FineGrainedFP8Config()

        with torch.device("meta"):
            model = OPTForCausalLM(config)

        nb_linears = 0
        for module in model.modules():
            if isinstance(module, torch.nn.Linear):
                nb_linears += 1

        model = replace_with_fp8_linear(model, quantization_config=quantization_config)
        nb_fp8_linear = 0
        for module in model.modules():
            if isinstance(module, FP8Linear):
                nb_fp8_linear += 1

        self.assertEqual(nb_linears, nb_fp8_linear)

        # Same conversion again, this time excluding the `fc1` modules
        with torch.device("meta"):
            model = OPTForCausalLM(config)
        quantization_config = FineGrainedFP8Config()
        model = replace_with_fp8_linear(model, modules_to_not_convert=["fc1"], quantization_config=quantization_config)
        nb_fp8_linear = 0
        for module in model.modules():
            if isinstance(module, FP8Linear):
                nb_fp8_linear += 1

        self.assertEqual(nb_linears - 24, nb_fp8_linear)

    def test_quantizer_validation_no_accelerator(self):
        """Test quantizer validation when CUDA/XPU is not available"""
        with _patch_no_accelerator():
            config = FineGrainedFP8Config()
            quantizer = FineGrainedFP8HfQuantizer(config)
            quantizer.pre_quantized = False

            with self.assertRaises(RuntimeError):
                quantizer.validate_environment()

    def test_dequantization_no_accelerator(self):
        """Test dequantization when CUDA/XPU is not available"""
        with _patch_no_accelerator():
            config = FineGrainedFP8Config()
            quantizer = FineGrainedFP8HfQuantizer(config)
            quantizer.pre_quantized = True
            quantizer.validate_environment()
            self.assertTrue(quantizer.quantization_config.dequantize)

    def test_quantized_model(self):
        """
        Simple test that checks if the quantized model is working properly
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(self.device_map)

        output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
        output_tokens = self.tokenizer.decode(output[0], skip_special_tokens=True)
        self.assertIn(output_tokens, self.EXPECTED_OUTPUTS)

    def test_dequantized_model(self):
        """
        Simple test that checks if the dequantized model is working properly
        """
        quantization_config = FineGrainedFP8Config(dequantize=True)
        dequantized_model = AutoModelForCausalLM.from_pretrained(
            self.quantized_model_name, device_map=self.device_map, quantization_config=quantization_config
        )
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(self.device_map)
        output = dequantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
        output_tokens = self.tokenizer.decode(output[0], skip_special_tokens=True)
        self.assertEqual(output_tokens, self.EXPECTED_DEQUANTIZED_OUTPUT)
        del dequantized_model

    def test_dequantize_when_no_accelerator(self):
        """
        Simple test that checks if the dequantized model is working properly when no accelerator is available
        """
        with _patch_no_accelerator():
            dequantized_model = AutoModelForCausalLM.from_pretrained(self.quantized_model_name, device_map="cpu")
            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to("cpu")
            output = dequantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
            output_tokens = self.tokenizer.decode(output[0], skip_special_tokens=True)
            self.assertEqual(output_tokens, self.EXPECTED_DEQUANTIZED_OUTPUT)
            del dequantized_model

    def test_save_pretrained(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)

            model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(self.device_map)

            output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
            self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)

    def test_weight_and_weight_scale_inv(self):
        """
        Simple test that checks if the weight and weight_scale_inv are working properly
        """
        weight = self.quantized_model.model.layers[0].self_attn.q_proj.weight
        weight_scale_inv = self.quantized_model.model.layers[0].self_attn.q_proj.weight_scale_inv
        self.assertEqual(weight.dtype, torch.float8_e4m3fn)
        self.assertEqual(weight_scale_inv.dtype, torch.float32)
        # One scale entry per 128x128 block of the weight
        self.assertEqual(weight.shape, (weight_scale_inv.shape[0] * 128, weight_scale_inv.shape[1] * 128))

    def test_block_size(self):
        """
        Simple test that checks if the block size is working properly
        """
        self.assertEqual(self.quantized_model.config.quantization_config.weight_block_size, (128, 128))
        quantization_config = FineGrainedFP8Config(weight_block_size=(32, 32))
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name, device_map=self.device_map, quantization_config=quantization_config
        )
        self.assertEqual(quantized_model.config.quantization_config.weight_block_size, (32, 32))

    @require_torch_multi_accelerator
    def test_quantized_model_multi_accelerators(self):
        """
        Simple test that checks if the quantized model is working properly with multiple accelerators
        set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUs; or set ZE_AFFINITY_MASK=0,1 if you
        have more than 2 XPUs.
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(self.device_map)
        quantization_config = FineGrainedFP8Config()
        # need to empty cache or set max_memory, otherwise we will use the reserved memory that was not allocated when computing max-memory
        # this will lead to put the entire model to device 0.
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            device_map="auto",
            quantization_config=quantization_config,
            max_memory={0: "1GB", 1: "10GB"},
        )
        self.assertEqual(set(quantized_model.hf_device_map.values()), {0, 1})

        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
        self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)

    @require_torch_multi_accelerator
    def test_save_pretrained_multi_accelerators(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)
            # need to empty cache or set max_memory, otherwise we will use the reserved memory that was not allocated when computing max-memory
            # this will lead to put the entire model to device 0.
            model = AutoModelForCausalLM.from_pretrained(
                tmpdirname, device_map="auto", max_memory={0: "1GB", 1: "10GB"}
            )
            self.assertEqual(set(model.hf_device_map.values()), {0, 1})

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(self.device_map)

            output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
            self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)

    def test_quantized_model_offload(self):
        """
        Simple test that checks if the quantized model returns an error when loading with cpu/disk offloaded
        """
        with self.assertRaisesRegex(
            ValueError, "You are attempting to load an FP8 model with a device_map that contains a cpu/disk device."
        ):
            AutoModelForCausalLM.from_pretrained(
                self.model_name, device_map=self.offload_device_map, quantization_config=self.quantization_config
            )

    def test_save_pretrained_offload(self):
        """
        Simple test that checks if the saved quantized model is working properly cpu/disk offload
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(self.device_map)

            quantized_model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.offload_device_map)
            output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
            self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)

    def test_compute_module_sizes(self):
        r"""
        Test if we compute the right module sizes needed to generate the device map.
        Also test if we get the right values for `total_byte_count` in `caching_allocator_warmup`.
        """
        from transformers.integrations import FP8Linear
        from transformers.integrations.accelerate import compute_module_sizes
        from transformers.modeling_utils import expand_device_map, get_total_byte_count
        from transformers.quantizers import AutoHfQuantizer

        # we need to preprocess the model like that because device_map calculation happens before we load the weights inside the model.
        # For normal weights, it's fine but for quantized weights, the tensors dtype might change during loading.
        with torch.device("meta"):
            config = AutoConfig.from_pretrained(self.model_name)
            model = AutoModelForCausalLM.from_config(config, dtype=torch.bfloat16)
            model_size, _ = compute_module_sizes(model, only_modules=False)

            expected_keys = [name for name, _ in model.named_parameters()] + [
                name for name, _ in model.named_buffers()
            ]
            expanded_device_map = expand_device_map({"": torch_device}, expected_keys)
            total_byte_count = list(get_total_byte_count(model, expanded_device_map).values())[0]

            # testing prequantized = False should be enough, the shape should be the same whether it is pre-quantized or not
            hf_quantizer = AutoHfQuantizer.from_config(FineGrainedFP8Config(), pre_quantized=False)
            hf_quantizer.preprocess_model(model=model, config=model.config)
            quantized_model_size, _ = compute_module_sizes(model, hf_quantizer, only_modules=False)

            expected_keys = [name for name, _ in model.named_parameters()] + [
                name for name, _ in model.named_buffers()
            ]
            expanded_device_map = expand_device_map({"": torch_device}, expected_keys)
            quantized_total_byte_count = list(get_total_byte_count(model, expanded_device_map, hf_quantizer).values())[
                0
            ]

        for name, module in model.named_modules():
            if isinstance(module, FP8Linear):
                # from 16 bits to 8 bits
                self.assertEqual(
                    int(model_size[f"{name}.weight"] // 2), int(quantized_model_size[f"{name}.weight"])
                )

        # check that we get the same value, as we use `compute_module_sizes` in `get_total_byte_count`
        self.assertEqual(total_byte_count, model_size[""])
        self.assertEqual(quantized_total_byte_count, quantized_model_size[""])

        # we should at least have 1.5 times memory reduction in total
        self.assertGreater(model_size[""], quantized_model_size[""] * 1.5)

    @parameterized.expand(["eager", "batched_mm", "grouped_mm", "deepgemm"])
    def test_quantized_moe_forward(self, experts_implementation):
        """
        Checks implicitly if the moe implementation is correct, i.e. it does not crash for cases
        where the indices go over `top_k` as shown within the Minimax M2 model
        """
        model = AutoModelForCausalLM.from_pretrained(
            "hf-internal-testing/MiniMax-M2-Tiny-FP8",  # single layer version
            experts_implementation=experts_implementation,
            device_map=self.device_map,
        )
        self.assertEqual(model.config._experts_implementation, experts_implementation)

        tokenizer = AutoTokenizer.from_pretrained("MiniMaxAI/MiniMax-M2")
        messages = [
            {"role": "user", "content": [{"type": "text", "text": "What is your favourite condiment?"}]},
            {
                "role": "assistant",
                "content": [
                    {
                        "type": "text",
                        "text": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!",
                    }
                ],
            },
            {"role": "user", "content": [{"type": "text", "text": "Do you have mayonnaise recipes?"}]},
        ]
        model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to(
            self.device_map
        )

        # Only caring about this not crashing
        _ = model.generate(**model_inputs, max_new_tokens=24)
@require_torch_accelerator
@unittest.skipIf(
    get_device_properties()[0] == "cuda"
    and (get_device_properties()[1] < 8 or (get_device_properties()[1] == 8 and get_device_properties()[2] < 9)),
    "Skipping FP8LinearTest because it is not supported on GPU with capability < 8.9",
)
class FP8LinearTest(unittest.TestCase):
    """Output shape checks for the `FP8Linear` layer."""

    device = torch_device

    def test_linear_preserves_shape(self):
        """
        With equal input and output feature sizes, the output shape equals the input shape.
        """
        from transformers.integrations import FP8Linear

        layer = FP8Linear(256, 256, block_size=(128, 128)).to(self.device)
        inputs = torch.rand((1, 5, 256)).to(self.device)

        outputs = layer(inputs)
        self.assertEqual(outputs.shape, inputs.shape)

    def test_linear_with_diff_feature_size_preserves_shape(self):
        """
        With different input and output feature sizes, the last dimension becomes the output size.
        """
        from transformers.integrations import FP8Linear

        layer = FP8Linear(128, 256, block_size=(128, 128)).to(self.device)
        inputs = torch.rand((1, 5, 128)).to(self.device)

        outputs = layer(inputs)
        self.assertEqual(outputs.shape, (1, 5, 256))

View File

@@ -0,0 +1,185 @@
# Copyright 2026 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest
from transformers import AutoModelForCausalLM, AutoTokenizer, FourOverSixConfig
from transformers.testing_utils import (
backend_empty_cache,
require_accelerate,
require_fouroversix,
require_torch_accelerator,
require_torch_multi_accelerator,
slow,
torch_device,
)
@require_torch_accelerator
class FourOverSixConfigTest(unittest.TestCase):
    """Round-trip checks between `FourOverSixConfig` objects and plain dictionaries."""

    def test_to_dict(self):
        """
        Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
        """
        quantization_config = FourOverSixConfig()
        config_to_dict = quantization_config.to_dict()

        for key in config_to_dict:
            self.assertEqual(getattr(quantization_config, key), config_to_dict[key])

    def test_from_dict(self):
        """
        Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict
        """
        # Named `config_dict` so the builtin `dict` is not shadowed.
        config_dict = {
            "scale_rule": "mse",
            "quant_method": "fouroversix",
        }
        quantization_config = FourOverSixConfig.from_dict(config_dict)

        self.assertEqual(config_dict["scale_rule"], quantization_config.scale_rule)
        self.assertEqual(config_dict["quant_method"], quantization_config.quant_method)
@slow
@require_torch_accelerator
@require_fouroversix
@require_accelerate
class FourOverSixBaseTest(unittest.TestCase):
    """
    Shared generation and save/load tests for FourOverSix quantization.

    Subclasses provide the quantization config through `getQuantizationConfig`; the base class
    itself is skipped because it has no config to test.
    """

    model_name = "unsloth/Llama-3.2-1B"

    input_text = "1 2 3 4"
    max_new_tokens = 4

    EXPECTED_OUTPUT = "1 2 3 4 5 6"

    device_map = torch_device

    @classmethod
    def getQuantizationConfig(cls):
        """
        Return the quantization config to test.

        Raises:
            unittest.SkipTest: always, in the base class. `unittest.skip(...)` only builds a
                decorator and would silently return `None` here, so the exception is raised
                directly; raised from `setUpClass`, it skips the whole class.
        """
        raise unittest.SkipTest("Subclass must implement this method")

    # Called only once for all tests in this class
    @classmethod
    def setUpClass(cls):
        """
        Setup quantized model
        """
        cls.quantization_config = cls.getQuantizationConfig()
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name,
            device_map=cls.device_map,
            quantization_config=cls.quantization_config,
        )

    def tearDown(self):
        """Release memory held by models created inside the test."""
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_quantized_model(self):
        """
        Simple test that checks if the quantized model is working properly
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        self.assertEqual(
            self.tokenizer.decode(output[0], skip_special_tokens=True),
            self.EXPECTED_OUTPUT,
        )

    def test_save_pretrained(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)

            model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

            output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
            self.assertEqual(
                self.tokenizer.decode(output[0], skip_special_tokens=True),
                self.EXPECTED_OUTPUT,
            )

    @require_torch_multi_accelerator
    def test_quantized_model_multi_accelerator(self):
        """
        Simple test that checks if the quantized model is working properly with multiple accelerators.
        Set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 CUDA GPUs.
        """
        # Use `torch_device` like the other tests instead of a hard-coded "cuda:0".
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            device_map="auto",
            quantization_config=self.quantization_config,
            max_memory={0: "1GB", 1: "10GB"},
        )
        self.assertEqual(set(quantized_model.hf_device_map.values()), {0, 1})

        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        self.assertEqual(
            self.tokenizer.decode(output[0], skip_special_tokens=True),
            self.EXPECTED_OUTPUT,
        )

    @require_torch_multi_accelerator
    def test_save_pretrained_multi_accelerator(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)

            model = AutoModelForCausalLM.from_pretrained(
                tmpdirname,
                device_map="sequential",
                max_memory={0: "1GB", 1: "10GB"},
            )
            self.assertEqual(set(model.hf_device_map.values()), {0, 1})

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

            output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
            self.assertEqual(
                self.tokenizer.decode(output[0], skip_special_tokens=True),
                self.EXPECTED_OUTPUT,
            )
class FourOverSixMSETest(FourOverSixBaseTest):
    """Runs the shared FourOverSix tests with a default-constructed `FourOverSixConfig`."""

    @classmethod
    def getQuantizationConfig(cls):
        """Build the config used by `setUpClass`."""
        default_config = FourOverSixConfig()
        return default_config
class FourOverSixStatic6Test(FourOverSixBaseTest):
    """Runs the shared FourOverSix tests with `scale_rule="static_6"`."""

    @classmethod
    def getQuantizationConfig(cls):
        """Build the config used by `setUpClass`."""
        static_config = FourOverSixConfig(scale_rule="static_6")
        return static_config
class FourOverSixKeepMasterWeightsTest(FourOverSixBaseTest):
    """Runs the shared FourOverSix tests with `keep_master_weights=True`."""

    @classmethod
    def getQuantizationConfig(cls):
        """Build the config used by `setUpClass`."""
        master_weights_config = FourOverSixConfig(keep_master_weights=True)
        return master_weights_config

View File

@@ -0,0 +1,207 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, FPQuantConfig
from transformers.testing_utils import (
backend_empty_cache,
require_accelerate,
require_fp_quant,
require_qutlass,
require_torch_accelerator,
require_torch_multi_accelerator,
slow,
torch_device,
)
@require_torch_accelerator
class FPQuantConfigTest(unittest.TestCase):
    """Round-trip checks between `FPQuantConfig` objects and plain dictionaries."""

    def test_to_dict(self):
        """
        Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
        """
        quantization_config = FPQuantConfig()
        config_to_dict = quantization_config.to_dict()

        for key in config_to_dict:
            self.assertEqual(getattr(quantization_config, key), config_to_dict[key])

    def test_from_dict(self):
        """
        Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict
        """
        # Named `config_dict` so the builtin `dict` is not shadowed.
        config_dict = {"modules_to_not_convert": ["embed_tokens", "lm_head"], "quant_method": "fp_quant"}
        quantization_config = FPQuantConfig.from_dict(config_dict)

        self.assertEqual(config_dict["modules_to_not_convert"], quantization_config.modules_to_not_convert)
        self.assertEqual(config_dict["quant_method"], quantization_config.quant_method)
@slow
@require_torch_accelerator
@require_fp_quant
@require_accelerate
class FPQuantBaseTest(unittest.TestCase):
    """
    Shared generation and save/load tests for FP-Quant quantization.

    Subclasses provide the quantization config through `getQuantizationConfig`; the base class
    itself is skipped because it has no config to test.
    """

    model_name = "unsloth/Llama-3.2-1B"

    input_text = "1 2 3 4"
    max_new_tokens = 4

    EXPECTED_OUTPUT = "1 2 3 4 5 6"

    device_map = torch_device

    @classmethod
    def getQuantizationConfig(cls):
        """
        Return the quantization config to test.

        Raises:
            unittest.SkipTest: always, in the base class. `unittest.skip(...)` only builds a
                decorator and would silently return `None` here, so the exception is raised
                directly; raised from `setUpClass`, it skips the whole class.
        """
        raise unittest.SkipTest("Subclass must implement this method")

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        """
        Setup quantized model
        """
        cls.quantization_config = cls.getQuantizationConfig()
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name, device_map=cls.device_map, quantization_config=cls.quantization_config
        )

    def tearDown(self):
        """Release memory held by models created inside the test."""
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_quantized_model(self):
        """
        Simple test that checks if the quantized model is working properly
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def test_save_pretrained(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)

            model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

            output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
            self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @require_torch_multi_accelerator
    def test_quantized_model_multi_accelerator(self):
        """
        Simple test that checks if the quantized model is working properly with multiple accelerators.
        Set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 CUDA GPUs. Or set ZE_AFFINITY_MASK=0,1
        if you have more than 2 Intel XPUs.
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name, device_map="auto", quantization_config=self.quantization_config
        )
        self.assertEqual(set(quantized_model.hf_device_map.values()), {0, 1})

        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @require_torch_multi_accelerator
    def test_save_pretrained_multi_accelerator(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)

            model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map="auto")
            self.assertEqual(set(model.hf_device_map.values()), {0, 1})

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

            output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
            self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
class FPQuantMXFP4PseudoquantTest(FPQuantBaseTest):
    """Runs the shared FP-Quant tests with `forward_dtype="mxfp4"` and pseudoquantization enabled."""

    @classmethod
    def getQuantizationConfig(cls):
        """Build the config used by `setUpClass`."""
        pseudoquant_config = FPQuantConfig(forward_dtype="mxfp4", pseudoquantization=True)
        return pseudoquant_config

    @unittest.skip("Pseudoquant Triton kernels do not support multi-GPU")
    def test_quantized_model_multi_accelerator(self):
        """Overrides the inherited multi-accelerator test so that it is skipped."""

    @unittest.skip("Pseudoquant Triton kernels do not support multi-GPU")
    def test_save_pretrained_multi_accelerator(self):
        """Overrides the inherited multi-accelerator save/load test so that it is skipped."""
@unittest.skipUnless(
    torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 9,
    "NVFP4 pseudoquantization requires compute capability >= 9.0 (Hopper or newer)",
)
class FPQuantNVFP4PseudoquantTest(FPQuantBaseTest):
    """Runs the shared FP-Quant tests with `forward_dtype="nvfp4"` and pseudoquantization enabled."""

    @classmethod
    def getQuantizationConfig(cls):
        """Build the config used by `setUpClass`."""
        pseudoquant_config = FPQuantConfig(forward_dtype="nvfp4", pseudoquantization=True)
        return pseudoquant_config

    @unittest.skip("Pseudoquant Triton kernels do not support multi-GPU")
    def test_quantized_model_multi_accelerator(self):
        """Overrides the inherited multi-accelerator test so that it is skipped."""

    @unittest.skip("Pseudoquant Triton kernels do not support multi-GPU")
    def test_save_pretrained_multi_accelerator(self):
        """Overrides the inherited multi-accelerator save/load test so that it is skipped."""
@require_qutlass
class FPQuantMXFP4Test(FPQuantBaseTest):
    """Runs the shared FP-Quant tests with `forward_dtype="mxfp4"` and pseudoquantization disabled."""

    @classmethod
    def getQuantizationConfig(cls):
        """Build the config used by `setUpClass`."""
        mxfp4_config = FPQuantConfig(forward_dtype="mxfp4", pseudoquantization=False)
        return mxfp4_config
@require_qutlass
class FPQuantNVFP4Test(FPQuantBaseTest):
    """Runs the shared FP-Quant tests with `forward_dtype="nvfp4"` and pseudoquantization disabled."""

    @classmethod
    def getQuantizationConfig(cls):
        """Build the config used by `setUpClass`."""
        nvfp4_config = FPQuantConfig(forward_dtype="nvfp4", pseudoquantization=False)
        return nvfp4_config
@require_qutlass
class FPQuantMXFP4GS128Test(FPQuantBaseTest):
    """Runs the shared FP-Quant tests with `forward_dtype="mxfp4"` and `hadamard_group_size=128`."""

    @classmethod
    def getQuantizationConfig(cls):
        """Build the config used by `setUpClass`."""
        mxfp4_gs128_config = FPQuantConfig(forward_dtype="mxfp4", pseudoquantization=False, hadamard_group_size=128)
        return mxfp4_gs128_config
@require_qutlass
class FPQuantNVFP4GS128Test(FPQuantBaseTest):
    """Runs the shared FP-Quant tests with `forward_dtype="nvfp4"` and `hadamard_group_size=128`."""

    @classmethod
    def getQuantizationConfig(cls):
        """Build the config used by `setUpClass`."""
        nvfp4_gs128_config = FPQuantConfig(forward_dtype="nvfp4", pseudoquantization=False, hadamard_group_size=128)
        return nvfp4_gs128_config

View File

File diff suppressed because it is too large Load Diff

View File

View File

@@ -0,0 +1,453 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tempfile
import unittest
import pytest
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, GPTQConfig
from transformers.testing_utils import (
is_torch_available,
require_accelerate,
require_gptqmodel,
require_optimum,
require_torch_gpu,
require_torch_multi_gpu,
slow,
)
from transformers.utils import is_gptqmodel_available
if is_torch_available():
import torch
if is_gptqmodel_available():
from gptqmodel import BACKEND
from gptqmodel.quantization import METHOD
from gptqmodel.utils.importer import hf_select_quant_linear_v2
class GPTQConfigTest(unittest.TestCase):
    """Argument-validation and (de)serialization checks for `GPTQConfig`."""

    def test_bits(self):
        """Unsupported `bits` values are rejected; supported ones construct."""
        # One `assertRaises` per value: inside a single context nothing after
        # the first raising call would ever execute.
        for bad_bits in ("", 1):
            with self.subTest(bits=bad_bits), self.assertRaises(ValueError):
                GPTQConfig(bits=bad_bits)
        GPTQConfig(bits=2)
        GPTQConfig(bits=4)

    def test_dataset(self):
        """An unknown dataset name is rejected; a known one constructs."""
        with self.assertRaises(ValueError):
            GPTQConfig(bits=2, dataset="auto_gpt")
        GPTQConfig(bits=2, dataset="c4")

    def test_damp_percent(self):
        """Out-of-range or non-numeric `damp_percent` is rejected; a valid fraction constructs."""
        for bad_damp in (10, -1):
            with self.subTest(damp_percent=bad_damp), self.assertRaises(ValueError):
                GPTQConfig(bits=2, damp_percent=bad_damp)
        # NOTE(review): a string may fail the numeric range comparison with
        # TypeError before any ValueError is raised -- accept either; confirm
        # against GPTQConfig's validation.
        with self.assertRaises((ValueError, TypeError)):
            GPTQConfig(bits=2, damp_percent="0")
        GPTQConfig(bits=2, damp_percent=0.01)

    def test_to_dict(self):
        """`to_dict` runs on a minimal config without raising."""
        quantization_config = GPTQConfig(bits=2)
        quantization_config.to_dict()

    def test_from_dict(self):
        """`from_dict` carries `bits` over from the input mapping."""
        config_dict = {"bits": 2}  # renamed from `dict` to stop shadowing the builtin
        quantization_config = GPTQConfig.from_dict(config_dict)
        self.assertEqual(config_dict["bits"], quantization_config.bits)

    @require_optimum
    @require_gptqmodel
    def test_optimum_config(self):
        """`bits` survives a round-trip through optimum's `GPTQQuantizer` dict format."""
        from optimum.gptq import GPTQQuantizer

        config = GPTQConfig(bits=2)
        optimum_config = GPTQQuantizer.from_dict(config.to_dict_optimum())
        self.assertEqual(optimum_config.bits, config.bits)
        new_config = GPTQConfig.from_dict_optimum(optimum_config.to_dict())
        self.assertEqual(optimum_config.bits, new_config.bits)
@slow
@require_optimum
@require_gptqmodel
class GPTQTest(unittest.TestCase):
    """
    Quantize a small causal LM with GPTQ once per class, then check memory footprint, layer classes,
    generation output and save/load round-trips. Subclasses override `device_map`.
    """

    model_name = "bigscience/bloom-560m"
    input_text = "Hello my name is"

    # flaky test: gptqmodel kernels are not always bitwise deterministic even between transformer/torch versions,
    # so any of the continuations below is accepted.
    EXPECTED_OUTPUTS = {
        "Hello my name is John and I am a professional photographer. I",
        "Hello my name is John, I am a professional photographer and I",
        "Hello my name is John, I am a student in the University of",
        "Hello my name is John and I am a very good looking man.",
        "Hello my name is Alyson, I am a student in the",
        "Hello my name is Alyson and I am a very sweet,",
        "Hello my name is Aiden, I am a student at the University",
        "Hello my name is Nate and I am a member of the N",
        "Hello my name is Nellie and I am a student at the",
        "Hello my name is Nate and I am a new member of the",
        "Hello my name is Nils, I am a student of the University",
        "Hello my name is John and I am a very friendly and caring",
        "Hello my name is Nils, I am a student in the field",
        "Hello my name is Michael, I am a professional photographer and I",
    }

    # this seems a little small considering that we are doing 4bit quant, but we have a small model and we don't
    # quantize the embeddings
    EXPECTED_RELATIVE_DIFFERENCE = 1.664253062

    # Quantization hyper-parameters forwarded to `GPTQConfig` in `setUpClass`.
    bits = 4
    sym = True
    group_size = 128
    desc_act = False
    act_group_aware = True

    # Calibration text passed as `dataset` to `GPTQConfig`.
    dataset = [
        "gptqmodel is an easy-to-use model quantization library with user-friendly APIs, based on the GPTQ algorithm."
    ]

    device_map = "cpu"

    # called only once for all tests in this class
    @classmethod
    def setUpClass(cls):
        """
        Load the fp16 reference model (recording its memory footprint), the tokenizer and the config,
        then load the same checkpoint again with a `GPTQConfig` to obtain the quantized model.
        """
        cls.model_fp16 = AutoModelForCausalLM.from_pretrained(
            cls.model_name,
            dtype=torch.float16,
            device_map=cls.device_map,
        )
        cls.mem_fp16 = cls.model_fp16.get_memory_footprint()

        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True)
        cls.config = AutoConfig.from_pretrained(cls.model_name)

        cls.quantization_config = GPTQConfig(
            bits=cls.bits,
            dataset=cls.dataset,
            tokenizer=cls.tokenizer,
            group_size=cls.group_size,
            desc_act=cls.desc_act,
            act_group_aware=cls.act_group_aware,
            sym=cls.sym,
            backend=BACKEND.AUTO,
        )

        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name,
            dtype=torch.float16,
            device_map=cls.device_map,
            quantization_config=cls.quantization_config,
        )

    def test_memory_footprint(self):
        r"""
        The fp16 / quantized memory-footprint ratio must match `EXPECTED_RELATIVE_DIFFERENCE`
        to 4 decimal places.
        """
        quantized_footprint = self.quantized_model.get_memory_footprint()
        ratio = self.mem_fp16 / quantized_footprint
        self.assertAlmostEqual(ratio, self.EXPECTED_RELATIVE_DIFFERENCE, places=4)

    def test_device_and_dtype_assignment(self):
        r"""
        Moving the quantized model to a device is allowed; casting it to a dtype must raise `ValueError`.
        """
        # This should work
        if self.device_map in (None, "cpu"):
            _ = self.quantized_model.to(0)

        with self.assertRaises(ValueError):
            # Tries with a `dtype`
            self.quantized_model.to(torch.float16)

    def test_quantized_layers_class(self):
        """
        The class of a converted linear layer must be the one `hf_select_quant_linear_v2` selects
        for this test's quantization settings.
        """
        if not hasattr(self.config, "quantization_config"):
            checkpoint_format, meta = "gptq", None
        else:
            stored_config = self.config.quantization_config
            checkpoint_format = stored_config.get("checkpoint_format")
            meta = self.config.quantization_config.get("meta")

        QuantLinear = hf_select_quant_linear_v2(
            bits=self.bits,
            group_size=self.group_size,
            desc_act=self.desc_act,
            sym=self.sym,
            device_map=self.device_map,
            format=checkpoint_format,
            quant_method=METHOD.GPTQ,
            meta=meta,
            backend=self.quantization_config.backend,
            pack=True,
        )
        converted_layer = self.quantized_model.transformer.h[0].mlp.dense_4h_to_h
        self.assertEqual(converted_layer.__class__, QuantLinear)

    def check_inference_correctness(self, model):
        r"""
        Generate 10 new tokens from `input_text` with `model` and assert that the decoded text is one of
        `EXPECTED_OUTPUTS`. Several outputs are accepted because results may differ across hardware.
        """
        # Check that inference pass works on the model
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
        prompt_ids = encoded_input["input_ids"].to(model.device)

        # Check the exactness of the results
        output_sequences = model.generate(input_ids=prompt_ids, max_new_tokens=10)

        # Get the generation
        generated_text = self.tokenizer.decode(output_sequences[0], skip_special_tokens=True)
        self.assertIn(generated_text, self.EXPECTED_OUTPUTS)

    def check_quantized_layers_type(self, model, value):
        """Assert that the `QUANT_TYPE` of a converted linear layer of `model` equals `value`."""
        converted_layer = model.transformer.h[0].mlp.dense_4h_to_h
        self.assertEqual(converted_layer.QUANT_TYPE, value)

    def test_generate_quality(self):
        """
        Generation from the class-level quantized model must produce one of the expected outputs.
        """
        if self.device_map is None:
            self.check_inference_correctness(self.quantized_model.to(0))
            return

        # Another test may have moved the shared model off the CPU; move it back first.
        if self.device_map == "cpu" and self.quantized_model.device.type != "cpu":
            self.quantized_model.to("cpu")
        self.check_inference_correctness(self.quantized_model)

    def test_serialization(self):
        """
        Save the quantized model and tokenizer, reload them, and check layer type and generation.
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.tokenizer.save_pretrained(tmpdirname)
            self.quantized_model.save_pretrained(tmpdirname)

            quant_type = "torch_fused" if self.device_map == "cpu" else "exllamav2"
            quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)

            self.check_quantized_layers_type(quantized_model_from_saved, quant_type)
            self.check_inference_correctness(quantized_model_from_saved)

    @require_accelerate
    def test_serialization_big_model_inference(self):
        """
        Save the quantized model, reload it with a device map (falling back to "auto"), and check generation.
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)

            device_map = self.device_map or "auto"
            quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=device_map)

            self.check_inference_correctness(quantized_model_from_saved)
@require_torch_gpu
class GPTQTestCUDA(GPTQTest):
    """`GPTQTest` with everything placed on device 0, plus a reload test that passes a fresh config."""

    device_map = {"": 0}

    def test_change_loading_attributes(self):
        """
        Save the quantized model and reload it while passing a new `GPTQConfig`; the reloaded model
        must keep the expected `bits`, layer type and generation output.
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)

            override_config = GPTQConfig(bits=self.bits)
            quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
                tmpdirname,
                quantization_config=override_config,
                device_map=self.device_map,
            )
            self.assertEqual(quantized_model_from_saved.config.quantization_config.bits, self.bits)

            quant_type = "torch" if self.device_map == "cpu" else "exllamav2"
            self.check_quantized_layers_type(quantized_model_from_saved, quant_type)
            self.check_inference_correctness(quantized_model_from_saved)
@require_accelerate
@require_torch_multi_gpu
class GPTQTestDeviceMap(GPTQTestCUDA):
    """Re-run the `GPTQTestCUDA` suite with `device_map="auto"` on a multi-GPU machine."""

    device_map = "auto"
@slow
@require_optimum
@require_gptqmodel
@require_torch_gpu
@require_accelerate
class GPTQTestActOrderExllamaV2(unittest.TestCase):
    """
    Test GPTQ model with exllamav2 kernel and desc_act=True (also known as act-order).
    More information on those arguments here:
    https://huggingface.co/docs/transformers/main_classes/quantization#transformers.GPTQConfig
    """

    # `act_group_aware` == `True` requires `desc_act` == `False` when both are explicitly set
    desc_act = True
    act_group_aware = False

    # flaky test: gptqmodel kernels are not always bitwise deterministic even between transformer/torch versions
    EXPECTED_OUTPUTS = {"Hello, how are you ? I'm doing good, thanks for asking."}

    # 4bit + act_order + 128g
    model_name = "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ"
    input_text = "Hello, how are you ?"

    @classmethod
    def setUpClass(cls):
        """
        Load the pre-quantized checkpoint on device 0 with the exllamav2 backend, then its tokenizer.
        """
        cls.quantization_config = GPTQConfig(
            bits=4,
            max_input_length=4028,
            desc_act=cls.desc_act,
            act_group_aware=cls.act_group_aware,
            backend=BACKEND.EXLLAMA_V2,
        )
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name,
            dtype=torch.float16,
            device_map={"": 0},
            quantization_config=cls.quantization_config,
        )
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True)

    def check_inference_correctness(self, model):
        """
        Generate 10 new tokens from `input_text` with `model` (inputs on device 0) and assert that the
        decoded text is one of `EXPECTED_OUTPUTS`.
        """
        # Check that inference pass works on the model
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
        prompt_ids = encoded_input["input_ids"].to(0)

        # Check the exactness of the results
        output_sequences = model.generate(input_ids=prompt_ids, max_new_tokens=10)

        # Get the generation
        generated_text = self.tokenizer.decode(output_sequences[0], skip_special_tokens=True)
        self.assertIn(generated_text, self.EXPECTED_OUTPUTS)

    def test_quantized_layers_type(self):
        """The loaded attention projection must report the exllamav2 `QUANT_TYPE`."""
        k_proj = self.quantized_model.model.layers[0].self_attn.k_proj
        self.assertEqual(k_proj.QUANT_TYPE, "exllamav2")

    def test_generate_quality(self):
        """
        Generation from the class-level quantized model must produce one of the expected outputs.
        """
        self.check_inference_correctness(self.quantized_model)
@slow
@require_optimum
@require_gptqmodel
@require_torch_gpu
@require_accelerate
class GPTQTestExllamaV2(unittest.TestCase):
    """
    Test a GPTQ model with the exllamav2 kernel, leaving `desc_act` / `act_group_aware` at the
    `GPTQConfig` defaults (only `bits` and `backend` are set here).
    More information on those arguments here:
    https://huggingface.co/docs/transformers/main_classes/quantization#transformers.GPTQConfig
    """

    # flaky test: gptqmodel kernels are not always bitwise deterministic even between transformer/torch versions
    EXPECTED_OUTPUTS = {"Hello, how are you ? I'm doing good, thanks for asking."}

    # 4bit + act_order + 128g
    model_name = "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ"
    input_text = "Hello, how are you ?"

    @classmethod
    def setUpClass(cls):
        """
        Load the pre-quantized checkpoint on device 0 with the exllamav2 backend, then its tokenizer.
        """
        cls.quantization_config = GPTQConfig(bits=4, backend=BACKEND.EXLLAMA_V2)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name,
            dtype=torch.float16,
            device_map={"": 0},
            quantization_config=cls.quantization_config,
        )
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True)

    def test_quantized_layers_type(self):
        """The loaded attention projection must report the exllamav2 `QUANT_TYPE`."""
        k_proj = self.quantized_model.model.layers[0].self_attn.k_proj
        self.assertEqual(k_proj.QUANT_TYPE, "exllamav2")

    def check_inference_correctness(self, model):
        """
        Generate 10 new tokens from `input_text` with `model` (inputs on device 0) and assert that the
        decoded text is one of `EXPECTED_OUTPUTS`.
        """
        # Check that inference pass works on the model
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
        prompt_ids = encoded_input["input_ids"].to(0)

        # Check the exactness of the results
        output_sequences = model.generate(input_ids=prompt_ids, max_new_tokens=10)

        # Get the generation
        generated_text = self.tokenizer.decode(output_sequences[0], skip_special_tokens=True)
        self.assertIn(generated_text, self.EXPECTED_OUTPUTS)

    def test_generate_quality(self):
        """
        Generation from the class-level quantized model must produce one of the expected outputs.
        """
        self.check_inference_correctness(self.quantized_model)
# fail when run all together
@pytest.mark.skip
@require_accelerate
@require_torch_multi_gpu
class GPTQTestDeviceMapCPUOffload(GPTQTest):
    """`GPTQTest` with a hand-written device map that spreads layers over device 0, device 1 and the CPU."""

    # Layer placement: h.0-h.9 -> 0, h.10-h.16 -> 1, h.17 -> 0, h.18-h.22 -> "cpu", h.23 -> 1.
    device_map = {
        "transformer.word_embeddings": 0,
        "transformer.word_embeddings_layernorm": 0,
        "lm_head": 0,
        **{f"transformer.h.{layer}": 0 for layer in range(10)},
        **{f"transformer.h.{layer}": 1 for layer in range(10, 17)},
        "transformer.h.17": 0,
        **{f"transformer.h.{layer}": "cpu" for layer in range(18, 23)},
        "transformer.h.23": 1,
        "transformer.ln_f": 0,
    }

View File

View File

@@ -0,0 +1,194 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, HiggsConfig, OPTForCausalLM
from transformers.testing_utils import (
backend_empty_cache,
require_accelerate,
require_flute_hadamard,
require_torch_gpu,
require_torch_multi_gpu,
slow,
torch_device,
)
from transformers.utils import is_torch_available
if is_torch_available():
import torch
@require_torch_gpu
class HiggsConfigTest(unittest.TestCase):
    """Dict round-trip checks for `HiggsConfig`."""

    def test_to_dict(self):
        """
        Every key/value produced by `to_dict` must match the attribute of the same name on the config object.
        """
        quantization_config = HiggsConfig()
        config_to_dict = quantization_config.to_dict()

        for key, value in config_to_dict.items():
            self.assertEqual(getattr(quantization_config, key), value)

    def test_from_dict(self):
        """
        A config built with `from_dict` must expose the values given in the input dict.
        """
        # Named `config_dict` rather than `dict` so the builtin is not shadowed.
        config_dict = {"modules_to_not_convert": ["embed_tokens", "lm_head"], "quant_method": "higgs"}
        quantization_config = HiggsConfig.from_dict(config_dict)

        self.assertEqual(config_dict["modules_to_not_convert"], quantization_config.modules_to_not_convert)
        self.assertEqual(config_dict["quant_method"], quantization_config.quant_method)
@slow
@require_torch_gpu
@require_flute_hadamard
@require_accelerate
class HiggsTest(unittest.TestCase):
    """End-to-end checks for HIGGS quantization: layer replacement, generation, save/load and multi-GPU."""

    model_name = "unsloth/Llama-3.2-1B"

    input_text = "Font test: A quick brown fox jumps over the"
    max_new_tokens = 2
    EXPECTED_OUTPUT = "Font test: A quick brown fox jumps over the lazy dog"

    device_map = "cuda"

    # called only once for all tests in this class
    @classmethod
    def setUpClass(cls):
        """
        Load the tokenizer and the HIGGS-quantized model shared by the tests.
        """
        quantization_config = HiggsConfig()
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name, device_map=cls.device_map, quantization_config=quantization_config
        )

    def tearDown(self):
        """Collect garbage and empty the accelerator cache after each test."""
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    @staticmethod
    def _count_modules(model, module_type):
        """Return how many sub-modules of `model` are instances of `module_type`."""
        return sum(1 for module in model.modules() if isinstance(module, module_type))

    def _assert_expected_generation(self, model):
        """Generate `max_new_tokens` tokens from `input_text` with `model` and compare to `EXPECTED_OUTPUT`."""
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
        output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def test_quantized_model_conversion(self):
        """
        `replace_with_higgs_linear` must convert the expected number of `nn.Linear` modules, both with the
        default config and when `fc1` is excluded via `modules_to_not_convert`.
        """
        from transformers.integrations import HiggsLinear, replace_with_higgs_linear

        model_id = "facebook/opt-350m"
        config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5")
        quantization_config = HiggsConfig()

        # Build on the meta device so no real weights are allocated.
        with torch.device("meta"):
            model = OPTForCausalLM(config)

        nb_linears = self._count_modules(model, torch.nn.Linear)

        model, _ = replace_with_higgs_linear(model, quantization_config=quantization_config)
        nb_higgs_linear = self._count_modules(model, HiggsLinear)

        # With the default config exactly one linear layer is left unconverted.
        self.assertEqual(nb_linears - 1, nb_higgs_linear)

        with torch.device("meta"):
            model = OPTForCausalLM(config)
        quantization_config = HiggsConfig(modules_to_not_convert=["fc1"])
        model, _ = replace_with_higgs_linear(model, quantization_config=quantization_config)
        nb_higgs_linear = self._count_modules(model, HiggsLinear)

        # Excluding `fc1` leaves 24 linear layers unconverted.
        self.assertEqual(nb_linears - 24, nb_higgs_linear)

    def test_quantized_model(self):
        """
        The class-level quantized model must generate the expected text.
        """
        self._assert_expected_generation(self.quantized_model)

    def test_save_pretrained(self):
        """
        The quantized model must generate the expected text after being saved and reloaded.
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)

            model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)

            self._assert_expected_generation(model)

    @require_torch_multi_gpu
    def test_quantized_model_multi_gpu(self):
        """
        A model quantized with `device_map="auto"` must span devices 0 and 1 and generate the expected text.
        Set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUs.
        """
        quantization_config = HiggsConfig()
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name, device_map="auto", quantization_config=quantization_config
        )
        # `assertEqual` reports the actual device set on failure, unlike `assertTrue(a == b)`.
        self.assertEqual(set(quantized_model.hf_device_map.values()), {0, 1})

        self._assert_expected_generation(quantized_model)

    @require_torch_multi_gpu
    def test_save_pretrained_multi_gpu(self):
        """
        A saved model reloaded with `device_map="auto"` must span devices 0 and 1 and generate the expected text.
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)

            model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map="auto")
            self.assertEqual(set(model.hf_device_map.values()), {0, 1})

            self._assert_expected_generation(model)

    @unittest.skip("This will almost surely OOM. Enable when switched to a smaller model")
    def test_dequantize(self):
        """
        The model must still generate the expected text after `dequantize()`.
        """
        self.quantized_model.dequantize()
        self._assert_expected_generation(self.quantized_model)

View File

@@ -0,0 +1,319 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import unittest
from unittest import skip
import accelerate
from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig
from transformers.testing_utils import (
backend_empty_cache,
require_accelerate,
require_deterministic_for_xpu,
require_hqq,
require_torch_accelerator,
require_torch_multi_accelerator,
slow,
torch_device,
)
from transformers.utils import is_hqq_available, is_torch_available
if is_torch_available():
import torch
if is_hqq_available():
from hqq.core.quantize import HQQBackend, HQQLinear
class HQQLLMRunner:
    """Load a causal LM with an HQQ quantization config, plus its tokenizer, for use in the tests."""

    def __init__(self, model_id, quant_config, compute_dtype, device, cache_dir=None):
        """
        Args:
            model_id: checkpoint identifier passed to `from_pretrained`.
            quant_config: quantization config forwarded as `quantization_config`.
            compute_dtype: forwarded as `dtype` when loading the model.
            device: forwarded as `device_map` when loading the model.
            cache_dir: optional cache directory for both model and tokenizer.
        """
        load_kwargs = {
            "dtype": compute_dtype,
            "device_map": device,
            "quantization_config": quant_config,
            "cache_dir": cache_dir,
        }
        self.model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
        self.tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
        self.device = self.model.device
        # Select the PyTorch backend for HQQ linear layers.
        HQQLinear.set_backend(HQQBackend.PYTORCH)
def cleanup():
    """Empty the accelerator cache for `torch_device`, then run the garbage collector."""
    backend_empty_cache(torch_device)
    gc.collect()
def check_hqqlayer(test_module, hqq_layer, batch_size=1, context_size=1024):
    """
    Run random inputs through `hqq_layer` and assert, via `test_module`, that the output's last
    dimension matches the dequantized weight's first dimension and that the output dtype is the
    layer's `compute_dtype`.
    """
    # Test HQQ layer
    W_dequant = hqq_layer.dequantize()  # Reconstructed weights

    input_shape = (batch_size, context_size, hqq_layer.meta["shape"][1])
    random_values = torch.randn(
        input_shape,
        device=hqq_layer.device,
        dtype=hqq_layer.compute_dtype,
    )
    inputs = random_values / 10.0

    with torch.no_grad():
        outputs = hqq_layer(inputs)

    test_module.assertEqual(outputs.shape[-1], W_dequant.shape[0])
    test_module.assertEqual(outputs.dtype, hqq_layer.compute_dtype)

    # Drop the tensors before emptying the cache.
    del W_dequant, inputs, random_values, outputs
    cleanup()
def check_forward(test_module, model, batch_size=1, context_size=1024):
    """
    Run an all-zero int32 token batch through `model` and assert, via `test_module`, that the
    logits' first two dimensions are `(batch_size, context_size)`.
    """
    # Test forward pass
    token_ids = torch.zeros([batch_size, context_size], device=model.device, dtype=torch.int32)
    with torch.no_grad():
        out = model(token_ids).logits

    logits_batch, logits_length = out.shape[0], out.shape[1]
    test_module.assertEqual(logits_batch, batch_size)
    test_module.assertEqual(logits_length, context_size)
    cleanup()
# Default checkpoint loaded by the HQQ tests below that do not name their own model.
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
@require_torch_accelerator
@require_hqq
class HqqConfigTest(unittest.TestCase):
    """Serialization check for `HqqConfig`."""

    def test_to_dict(self):
        """
        `to_dict` must expose the same `quant_config` as the config object.
        """
        quantization_config = HqqConfig()
        serialized = quantization_config.to_dict()

        self.assertEqual(quantization_config.quant_config, serialized["quant_config"])
@slow
@require_torch_accelerator
@require_accelerate
@require_hqq
@skip("skip for now until we add back support")
class HQQTest(unittest.TestCase):
    """HQQ checks on a single accelerator: layer output, forward pass, device/dtype moves, weight dtype."""

    def tearDown(self):
        """Free accelerator memory after each test."""
        cleanup()

    def test_fp16_quantized_model(self):
        """
        Simple LLM model testing fp16
        """
        quant_config = HqqConfig(nbits=8, group_size=64)
        hqq_runner = HQQLLMRunner(
            model_id=MODEL_ID,
            quant_config=quant_config,
            compute_dtype=torch.float16,
            device=torch_device,
        )

        v_proj = hqq_runner.model.model.layers[0].self_attn.v_proj
        check_hqqlayer(self, v_proj)
        check_forward(self, hqq_runner.model)

    def test_quantized_model_to_new_device_and_new_dtype(self):
        """
        Simple LLM model testing different devices and dtypes
        """
        quant_config = HqqConfig(nbits=8, group_size=64)
        hqq_runner = HQQLLMRunner(
            model_id=MODEL_ID,
            quant_config=quant_config,
            compute_dtype=torch.float16,
            device=torch_device,
        )

        # Baseline: freshly loaded model on the test device.
        check_hqqlayer(self, hqq_runner.model.model.layers[0].self_attn.v_proj)
        check_forward(self, hqq_runner.model)

        # Remove `accelerate` hooks so the model can be moved to a new device
        accelerate.hooks.remove_hook_from_module(hqq_runner.model, recurse=True)

        # Move to CPU and bfloat16 in one call, then re-check.
        hqq_runner.model.to("cpu", torch.bfloat16)
        check_hqqlayer(self, hqq_runner.model.model.layers[0].self_attn.v_proj)
        check_forward(self, hqq_runner.model)

        # Move back to the test device and re-check.
        hqq_runner.model.to(torch_device)
        check_hqqlayer(self, hqq_runner.model.model.layers[0].self_attn.v_proj)
        check_forward(self, hqq_runner.model)

    def test_quantized_model_fake_weight_dtype(self):
        """The `weight` attribute exposed by a quantized layer must report the fp16 compute dtype."""
        quant_config = HqqConfig(nbits=8, group_size=64)
        hqq_runner = HQQLLMRunner(
            model_id=MODEL_ID,
            quant_config=quant_config,
            compute_dtype=torch.float16,
            device=torch_device,
        )

        # We use a hack to inject a fake weight to HQQLinear. Check that it works
        fake_weight = hqq_runner.model.model.layers[0].self_attn.v_proj.weight
        self.assertEqual(fake_weight.dtype, torch.float16)
@slow
@require_torch_accelerator
@require_torch_multi_accelerator
@require_accelerate
@require_hqq
@skip("skip for now until we add back support")
class HQQTestMultiGPU(unittest.TestCase):
    """HQQ checks with the model loaded using `device_map="auto"` on a multi-accelerator machine."""

    def tearDown(self):
        """Free accelerator memory after each test."""
        cleanup()

    def test_fp16_quantized_model_multipgpu(self):
        """
        Simple LLM model testing fp16 with multi-gpu
        """
        quant_config = HqqConfig(nbits=8, group_size=64)
        hqq_runner = HQQLLMRunner(
            model_id=MODEL_ID,
            quant_config=quant_config,
            compute_dtype=torch.float16,
            device="auto",
        )

        v_proj = hqq_runner.model.model.layers[0].self_attn.v_proj
        check_hqqlayer(self, v_proj)
        check_forward(self, hqq_runner.model)
@slow
@require_torch_accelerator
@require_accelerate
@require_hqq
@skip("skip for now until we add back support")
class HQQTestBias(unittest.TestCase):
    """HQQ checks on `facebook/opt-125m` (described by the tests as a model with bias)."""

    def tearDown(self):
        """Free accelerator memory after each test."""
        cleanup()

    def test_fp16_quantized_model(self):
        """
        Simple LLM model testing fp16 with bias
        """
        quant_config = HqqConfig(nbits=8, group_size=64)
        hqq_runner = HQQLLMRunner(
            model_id="facebook/opt-125m",
            quant_config=quant_config,
            compute_dtype=torch.float16,
            device=torch_device,
        )

        v_proj = hqq_runner.model.model.decoder.layers[0].self_attn.v_proj
        check_hqqlayer(self, v_proj)
        check_forward(self, hqq_runner.model)

    @require_deterministic_for_xpu
    def test_save_and_load_quantized_model(self):
        """
        Test saving and loading a quantized model with bias
        """
        import tempfile

        quant_config = HqqConfig(nbits=8, group_size=64)
        hqq_runner = HQQLLMRunner(
            model_id="facebook/opt-125m",
            quant_config=quant_config,
            compute_dtype=torch.float16,
            device=torch_device,
        )

        input_tensor = torch.zeros((1, 8), dtype=torch.int32, device=torch_device)

        # Get reference logits
        with torch.no_grad():
            logits_ref = hqq_runner.model.forward(input_tensor).logits

        with tempfile.TemporaryDirectory() as tmpdirname:
            hqq_runner.model.save_pretrained(tmpdirname)

            # Drop the original model before loading the saved copy.
            del hqq_runner.model
            backend_empty_cache(torch_device)

            model_loaded = AutoModelForCausalLM.from_pretrained(
                tmpdirname,
                dtype=torch.float16,
                device_map=torch_device,
            )

            with torch.no_grad():
                logits_loaded = model_loaded.forward(input_tensor).logits

            # The reloaded model must reproduce the reference logits exactly.
            mean_abs_diff = (logits_loaded - logits_ref).abs().mean().item()
            self.assertEqual(mean_abs_diff, 0)
@slow
@require_torch_accelerator
@require_accelerate
@require_hqq
@skip("skip for now until we add back support")
class HQQSerializationTest(unittest.TestCase):
    """Save/load and per-module (dynamic) quantization checks for HQQ."""

    def tearDown(self):
        """Free accelerator memory after each test."""
        cleanup()

    def test_model_serialization(self):
        """
        Simple HQQ LLM save/load test
        """
        import tempfile

        quant_config = HqqConfig(nbits=4, group_size=64)

        hqq_runner = HQQLLMRunner(
            model_id=MODEL_ID, quant_config=quant_config, compute_dtype=torch.float16, device=torch_device
        )

        input_tensor = torch.zeros((1, 8), dtype=torch.int32, device=torch_device)

        with torch.no_grad():
            logits_ref = hqq_runner.model.forward(input_tensor).logits

        # Save into a temporary directory so the test leaves nothing behind in
        # the working directory (previously a fixed "quant_model" folder).
        with tempfile.TemporaryDirectory() as saved_model_dir:
            hqq_runner.model.save_pretrained(saved_model_dir)

            # Remove old model
            del hqq_runner.model
            backend_empty_cache(torch_device)

            # Load and check if the logits match
            model_loaded = AutoModelForCausalLM.from_pretrained(
                saved_model_dir,
                dtype=torch.float16,
                device_map=torch_device,
            )

            with torch.no_grad():
                logits_loaded = model_loaded.forward(input_tensor).logits

            self.assertEqual((logits_loaded - logits_ref).abs().mean().item(), 0)

    def test_model_serialization_dynamic_quant_with_skip(self):
        """
        Simple HQQ LLM test with a per-module dynamic config and skipped modules
        """
        q4_config = {"nbits": 4, "group_size": 64}
        q3_config = {"nbits": 3, "group_size": 64}

        quant_config = HqqConfig(
            dynamic_config={
                "self_attn.q_proj": q4_config,
                "self_attn.k_proj": q4_config,
                "self_attn.v_proj": q4_config,
                "self_attn.o_proj": q4_config,
                "mlp.gate_proj": q3_config,
                "mlp.up_proj": q3_config,
            },
            skip_modules=["lm_head", "down_proj"],
        )

        hqq_runner = HQQLLMRunner(
            model_id=MODEL_ID, quant_config=quant_config, compute_dtype=torch.float16, device=torch_device
        )

        model = hqq_runner.model

        input_tensor = torch.zeros((1, 8), dtype=torch.int32, device=torch_device)

        # Smoke-test the forward pass; the logits themselves are not inspected.
        with torch.no_grad():
            model.forward(input_tensor).logits

        # Skipped module stays a plain linear layer; the others use their configured bit widths.
        self.assertIsInstance(model.model.layers[1].mlp.down_proj, torch.nn.Linear)
        self.assertEqual(model.model.layers[1].self_attn.v_proj.quant_config["weight_quant_params"]["nbits"], 4)
        self.assertEqual(model.model.layers[1].mlp.gate_proj.quant_config["weight_quant_params"]["nbits"], 3)

View File

@@ -0,0 +1,642 @@
# Copyright 2026 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import unittest
from contextlib import ExitStack, contextmanager
from unittest.mock import patch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, MetalConfig, OPTForCausalLM
from transformers.quantizers.quantizer_metal import MetalHfQuantizer
from transformers.testing_utils import (
require_torch,
slow,
torch_device,
)
from transformers.utils import is_torch_available
if is_torch_available():
import torch
import torch.nn as nn
@contextmanager
def _patch_mps_available(available: bool = True):
    """Make ``torch.backends.mps.is_available`` return ``available`` for the duration of the context."""
    mps_patch = patch("torch.backends.mps.is_available", return_value=available)
    with mps_patch:
        yield
@contextmanager
def _patch_no_mps():
    """Shorthand for ``_patch_mps_available(False)``: pretend no MPS device is present."""
    mps_absent = _patch_mps_available(False)
    with mps_absent:
        yield
@contextmanager
def _patch_has_mps():
    """Pretend an MPS device is present and that the quantizer's ``is_kernels_available`` returns True."""
    kernels_patch = patch(
        "transformers.quantizers.quantizer_metal.is_kernels_available",
        return_value=True,
    )
    with ExitStack() as stack:
        # Enter both patches; ExitStack undoes them in reverse order on exit.
        stack.enter_context(_patch_mps_available(True))
        stack.enter_context(kernels_patch)
        yield
@require_torch
class MetalConfigTest(unittest.TestCase):
    """Unit tests for ``MetalConfig`` (no device / model needed)."""

    def test_default_values(self):
        """A config built without arguments exposes the expected defaults."""
        config = MetalConfig()

        self.assertEqual(config.bits, 4)
        self.assertEqual(config.group_size, 64)
        self.assertIsNone(config.modules_to_not_convert)
        self.assertFalse(config.dequantize)
        self.assertEqual(config.quant_method, "metal")

    def test_custom_values(self):
        """Explicit constructor arguments are stored on the config."""
        config = MetalConfig(
            bits=8,
            group_size=32,
            modules_to_not_convert=["lm_head"],
            dequantize=True,
        )

        self.assertEqual(config.bits, 8)
        self.assertEqual(config.group_size, 32)
        self.assertEqual(config.modules_to_not_convert, ["lm_head"])
        self.assertTrue(config.dequantize)

    def test_invalid_bits_raises(self):
        """Each of the listed unsupported bit widths raises ``ValueError``."""
        unsupported_bits = (0, 1, 3, 5, 6, 7, 16)
        for bad_bits in unsupported_bits:
            with self.assertRaises(ValueError, msg=f"bits={bad_bits} should raise"):
                MetalConfig(bits=bad_bits)

    def test_valid_bits(self):
        """Bit widths 2, 4 and 8 are accepted and stored."""
        for bits in (2, 4, 8):
            self.assertEqual(MetalConfig(bits=bits).bits, bits)

    def test_invalid_group_size_raises(self):
        """Zero and negative group sizes raise ``ValueError``."""
        for bad_group_size in (0, -1):
            with self.assertRaises(ValueError):
                MetalConfig(group_size=bad_group_size)

    def test_to_dict(self):
        """``to_dict`` includes the quant method and the constructor arguments."""
        config = MetalConfig(bits=4, group_size=64, modules_to_not_convert=["lm_head"])
        serialized = config.to_dict()

        self.assertEqual(serialized["quant_method"], "metal")
        self.assertEqual(serialized["bits"], 4)
        self.assertEqual(serialized["group_size"], 64)
        self.assertEqual(serialized["modules_to_not_convert"], ["lm_head"])

    def test_from_dict(self):
        """``from_dict`` restores ``bits`` and ``group_size`` from a plain dict."""
        source = {"quant_method": "metal", "bits": 8, "group_size": 32, "modules_to_not_convert": None}
        config = MetalConfig.from_dict(source)

        self.assertEqual(config.bits, 8)
        self.assertEqual(config.group_size, 32)

    def test_to_dict_from_dict(self):
        """A ``to_dict`` / ``from_dict`` round-trip preserves the compared fields."""
        original = MetalConfig(bits=2, group_size=128, modules_to_not_convert=["lm_head"])
        restored = MetalConfig.from_dict(original.to_dict())

        self.assertEqual(original.bits, restored.bits)
        self.assertEqual(original.group_size, restored.group_size)
        self.assertEqual(original.modules_to_not_convert, restored.modules_to_not_convert)

    def test_get_loading_attributes(self):
        """``get_loading_attributes`` exposes ``dequantize`` with the configured value."""
        config = MetalConfig(dequantize=True)
        attrs = config.get_loading_attributes()

        self.assertIn("dequantize", attrs)
        self.assertTrue(attrs["dequantize"])
@require_torch
class MetalQuantizerEnvironmentTest(unittest.TestCase):
    """Exercise ``MetalHfQuantizer.validate_environment`` and a few simple quantizer properties."""

    def test_no_mps_prequantized_triggers_dequantize(self):
        """Without MPS, validating a pre-quantized setup should switch ``dequantize`` on."""
        with _patch_no_mps():
            hf_quantizer = MetalHfQuantizer(MetalConfig())
            hf_quantizer.pre_quantized = True
            hf_quantizer.validate_environment()
            self.assertTrue(hf_quantizer.quantization_config.dequantize)

    def test_no_mps_not_prequantized_raises(self):
        """Without MPS, quantizing on the fly should raise ``RuntimeError``."""
        with _patch_no_mps():
            hf_quantizer = MetalHfQuantizer(MetalConfig())
            hf_quantizer.pre_quantized = False
            with self.assertRaises(RuntimeError):
                hf_quantizer.validate_environment()

    def test_dequantize_flag_skips_mps_check(self):
        """With ``dequantize=True``, validation should not raise even though MPS is patched away."""
        with _patch_no_mps():
            hf_quantizer = MetalHfQuantizer(MetalConfig(dequantize=True))
            hf_quantizer.pre_quantized = True
            hf_quantizer.validate_environment()

    def test_missing_kernels_raises(self):
        """If ``is_kernels_available`` reports ``False``, validation should raise ``ImportError``."""
        with _patch_mps_available(True), patch(
            "transformers.quantizers.quantizer_metal.is_kernels_available", return_value=False
        ):
            hf_quantizer = MetalHfQuantizer(MetalConfig())
            hf_quantizer.pre_quantized = False
            with self.assertRaises(ImportError):
                hf_quantizer.validate_environment()

    def test_cpu_in_device_map_not_prequantized_raises(self):
        """Quantizing on the fly with a ``cpu`` entry in ``device_map`` should raise ``ValueError``."""
        with _patch_has_mps():
            hf_quantizer = MetalHfQuantizer(MetalConfig())
            hf_quantizer.pre_quantized = False
            with self.assertRaises(ValueError):
                hf_quantizer.validate_environment(device_map={"model": "cpu"})

    def test_disk_in_device_map_not_prequantized_raises(self):
        """Quantizing on the fly with a ``disk`` entry in ``device_map`` should raise ``ValueError``."""
        with _patch_has_mps():
            hf_quantizer = MetalHfQuantizer(MetalConfig())
            hf_quantizer.pre_quantized = False
            with self.assertRaises(ValueError):
                hf_quantizer.validate_environment(device_map={"model": "disk"})

    def test_update_device_map_defaults_to_mps(self):
        """A ``None`` device map is replaced by one placing everything on ``mps``."""
        hf_quantizer = MetalHfQuantizer(MetalConfig())
        self.assertEqual(hf_quantizer.update_device_map(None), {"": "mps"})

    def test_is_serializable(self):
        """The quantizer reports itself as serializable."""
        hf_quantizer = MetalHfQuantizer(MetalConfig())
        self.assertTrue(hf_quantizer.is_serializable())

    def test_is_not_trainable(self):
        """The quantizer reports itself as not trainable."""
        hf_quantizer = MetalHfQuantizer(MetalConfig())
        self.assertFalse(hf_quantizer.is_trainable)
@require_torch
class AffineQuantizeDequantizeTest(unittest.TestCase):
    """Exercise the low-level ``_affine_quantize_tensor`` / ``_affine_dequantize_tensor`` functions."""

    def _roundtrip(self, bits, group_size, N=64, K=256, dtype=None):
        """Quantize a random ``(N, K)`` weight, check the packed layout, then dequantize it.

        Args:
            bits: Bit width passed to the quantize / dequantize functions.
            group_size: Group size passed to the quantize / dequantize functions.
            N: Number of rows of the random weight.
            K: Number of columns of the random weight.
            dtype: dtype of the random weight. ``None`` (the default) means ``torch.float32``.
                The default is resolved in the body rather than in the signature because
                default arguments are evaluated when the class is defined, and ``torch`` is
                only imported when it is available; a ``torch.float32`` default would make
                the whole module fail to import without torch.

        Returns:
            Tuple ``(original, dequantized)``, both converted with ``.float()``.
        """
        from transformers.integrations.metal_quantization import _affine_dequantize_tensor, _affine_quantize_tensor

        if dtype is None:
            dtype = torch.float32

        weight = torch.randn(N, K, dtype=dtype)
        w_packed, scales, biases = _affine_quantize_tensor(weight, group_size, bits)

        # Each uint32 word holds ``32 // bits`` quantized values; scales/biases are per group.
        self.assertEqual(w_packed.dtype, torch.uint32)
        self.assertEqual(w_packed.shape, (N, K // (32 // bits)))
        self.assertEqual(scales.shape, (N, K // group_size))
        self.assertEqual(biases.shape, (N, K // group_size))

        w_deq = _affine_dequantize_tensor(w_packed, scales, biases, group_size, bits)
        self.assertEqual(w_deq.shape, (N, K))
        return weight.float(), w_deq.float()

    def test_roundtrip_4bit_gs64(self):
        """4-bit, group size 64: maximum absolute round-trip error stays below 0.30."""
        orig, deq = self._roundtrip(bits=4, group_size=64)
        max_err = (orig - deq).abs().max().item()
        self.assertLess(max_err, 0.30, "4-bit gs=64 round-trip error too large")

    def test_roundtrip_4bit_gs128(self):
        """4-bit, group size 128: maximum absolute round-trip error stays below 0.5."""
        orig, deq = self._roundtrip(bits=4, group_size=128)
        max_err = (orig - deq).abs().max().item()
        self.assertLess(max_err, 0.5, "4-bit gs=128 round-trip error too large")

    def test_roundtrip_8bit_gs64(self):
        """8-bit, group size 64: maximum absolute round-trip error stays below 0.02."""
        orig, deq = self._roundtrip(bits=8, group_size=64)
        max_err = (orig - deq).abs().max().item()
        self.assertLess(max_err, 0.02, "8-bit gs=64 round-trip error too large")

    def test_roundtrip_2bit_gs64(self):
        """2-bit, group size 64: maximum absolute round-trip error stays below 1.50."""
        orig, deq = self._roundtrip(bits=2, group_size=64)
        max_err = (orig - deq).abs().max().item()
        self.assertLess(max_err, 1.50, "2-bit gs=64 round-trip error too large")

    def test_quantize_shapes_2bit(self):
        """2-bit quantization packs 16 values per word and yields one scale per group."""
        from transformers.integrations.metal_quantization import _affine_quantize_tensor

        N, K = 32, 128
        weight = torch.randn(N, K)
        # The biases output is not inspected in this test.
        w_packed, scales, _ = _affine_quantize_tensor(weight, group_size=64, bits=2)
        elems_per_int = 32 // 2
        self.assertEqual(w_packed.shape, (N, K // elems_per_int))
        self.assertEqual(scales.shape, (N, K // 64))

    def test_quantize_preserves_device(self):
        """Quantizing a CPU tensor returns packed weights, scales and biases on CPU."""
        from transformers.integrations.metal_quantization import _affine_quantize_tensor

        weight = torch.randn(32, 128, device="cpu")
        w_packed, scales, biases = _affine_quantize_tensor(weight, group_size=64, bits=4)
        self.assertEqual(w_packed.device.type, "cpu")
        self.assertEqual(scales.device.type, "cpu")
        self.assertEqual(biases.device.type, "cpu")

    def test_dequantize_returns_correct_dtype(self):
        """Regression: dequantize should always return float32 (caller casts to target dtype)."""
        from transformers.integrations.metal_quantization import _affine_dequantize_tensor, _affine_quantize_tensor

        weight = torch.randn(32, 128, dtype=torch.bfloat16)
        w_packed, scales, biases = _affine_quantize_tensor(weight, group_size=64, bits=4)
        w_deq = _affine_dequantize_tensor(w_packed, scales, biases, group_size=64, bits=4)
        self.assertEqual(w_deq.dtype, torch.float32)
@require_torch
class MetalLinearTest(unittest.TestCase):
    """Exercise the ``MetalLinear`` module directly (CPU, no kernel calls)."""

    def test_prequantized_weight_shape(self):
        """Default construction: uint32 weight packed along K, scales/qbiases shaped per group."""
        from transformers.integrations.metal_quantization import MetalLinear

        in_features, out_features, bits, group_size = 256, 128, 4, 64
        linear = MetalLinear(in_features=in_features, out_features=out_features, bits=bits, group_size=group_size)

        values_per_word = 32 // bits
        n_groups = in_features // group_size
        self.assertEqual(linear.weight.shape, (out_features, in_features // values_per_word))
        self.assertEqual(linear.weight.dtype, torch.uint32)
        self.assertEqual(linear.scales.shape, (out_features, n_groups))
        self.assertEqual(linear.qbiases.shape, (out_features, n_groups))

    def test_quantize_on_the_fly_weight_shape(self):
        """With ``dtype=None`` the weight keeps its full ``(out, in)`` shape and is not uint32."""
        from transformers.integrations.metal_quantization import MetalLinear

        linear = MetalLinear(in_features=256, out_features=128, bits=4, group_size=64, dtype=None)
        self.assertEqual(linear.weight.shape, (128, 256))
        self.assertNotEqual(linear.weight.dtype, torch.uint32)

    def test_no_bias_by_default(self):
        """No bias is created unless requested."""
        from transformers.integrations.metal_quantization import MetalLinear

        linear = MetalLinear(in_features=128, out_features=64, bits=4, group_size=64)
        self.assertIsNone(linear.bias)

    def test_with_bias(self):
        """``bias=True`` creates a bias of shape ``(out_features,)``."""
        from transformers.integrations.metal_quantization import MetalLinear

        linear = MetalLinear(in_features=128, out_features=64, bias=True, bits=4, group_size=64)
        self.assertIsNotNone(linear.bias)
        self.assertEqual(linear.bias.shape, (64,))

    def test_forward_fallback_when_not_uint32(self):
        """With a non-uint32 weight, the forward pass runs on CPU and yields the expected output shape."""
        from transformers.integrations.metal_quantization import MetalLinear

        linear = MetalLinear(in_features=128, out_features=64, bits=4, group_size=64, dtype=None)
        linear.weight = nn.Parameter(torch.randn(64, 128))
        activations = torch.randn(2, 5, 128)
        result = linear(activations)
        self.assertEqual(result.shape, (2, 5, 64))

    def test_forward_fallback_with_bias(self):
        """Same fallback forward pass, with a float bias attached."""
        from transformers.integrations.metal_quantization import MetalLinear

        linear = MetalLinear(in_features=128, out_features=64, bias=True, bits=4, group_size=64, dtype=None)
        linear.weight = nn.Parameter(torch.randn(64, 128))
        linear.bias = nn.Parameter(torch.randn(64))
        activations = torch.randn(1, 10, 128)
        result = linear(activations)
        self.assertEqual(result.shape, (1, 10, 64))

    def test_prequantized_shapes_8bit(self):
        """8-bit packing stores 4 values per uint32 word."""
        from transformers.integrations.metal_quantization import MetalLinear

        linear = MetalLinear(in_features=256, out_features=128, bits=8, group_size=64)
        values_per_word = 32 // 8
        self.assertEqual(linear.weight.shape, (128, 256 // values_per_word))

    def test_prequantized_shapes_2bit(self):
        """2-bit packing stores 16 values per uint32 word."""
        from transformers.integrations.metal_quantization import MetalLinear

        linear = MetalLinear(in_features=256, out_features=128, bits=2, group_size=64)
        values_per_word = 32 // 2
        self.assertEqual(linear.weight.shape, (128, 256 // values_per_word))
@require_torch
class ReplaceWithMetalLinearTest(unittest.TestCase):
    """Exercise ``replace_with_metal_linear`` on a tiny OPT model built on the meta device."""

    def _make_small_model(self):
        """Build ``OPTForCausalLM`` from the tiny test config, with parameters on the meta device."""
        tiny_config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-OPTForCausalLM")
        with torch.device("meta"):
            return OPTForCausalLM(tiny_config)

    def test_all_linears_replaced(self):
        """Without exclusions, the number of ``MetalLinear`` modules equals the original ``nn.Linear`` count."""
        from transformers.integrations.metal_quantization import MetalLinear, replace_with_metal_linear

        model = self._make_small_model()
        linear_count = len([mod for mod in model.modules() if isinstance(mod, nn.Linear)])
        self.assertGreater(linear_count, 0)

        replace_with_metal_linear(model, quantization_config=MetalConfig(bits=4, group_size=64), pre_quantized=True)
        metal_count = len([mod for mod in model.modules() if isinstance(mod, MetalLinear)])
        self.assertEqual(linear_count, metal_count)

    def test_modules_to_not_convert(self):
        """Excluding ``lm_head`` leaves it unconverted and the ``MetalLinear`` count one short."""
        from transformers.integrations.metal_quantization import MetalLinear, replace_with_metal_linear

        model = self._make_small_model()
        metal_config = MetalConfig(bits=4, group_size=64)
        replace_with_metal_linear(
            model, modules_to_not_convert=["lm_head"], quantization_config=metal_config, pre_quantized=True
        )
        self.assertNotIsInstance(model.lm_head, MetalLinear)

        # Both counts are taken after the replacement.
        metal_count = len([mod for mod in model.modules() if isinstance(mod, MetalLinear)])
        linear_count = len([mod for mod in model.modules() if isinstance(mod, nn.Linear)])
        self.assertEqual(metal_count, linear_count - 1)

    def test_dequantize_skips_replacement(self):
        """With ``dequantize=True`` no module is swapped for ``MetalLinear``."""
        from transformers.integrations.metal_quantization import MetalLinear, replace_with_metal_linear

        model = self._make_small_model()
        metal_config = MetalConfig(bits=4, group_size=64, dequantize=True)
        replace_with_metal_linear(model, quantization_config=metal_config, pre_quantized=True)
        metal_count = len([mod for mod in model.modules() if isinstance(mod, MetalLinear)])
        self.assertEqual(metal_count, 0, "No modules should be replaced when dequantize=True")

    def test_prequantized_dtype_is_uint32(self):
        """In pre-quantized mode the first ``MetalLinear`` found holds a uint32 weight."""
        from transformers.integrations.metal_quantization import MetalLinear, replace_with_metal_linear

        model = self._make_small_model()
        replace_with_metal_linear(model, quantization_config=MetalConfig(bits=4, group_size=64), pre_quantized=True)

        # Only the first match is inspected; nothing is asserted when there is none.
        first_metal = next((mod for mod in model.modules() if isinstance(mod, MetalLinear)), None)
        if first_metal is not None:
            self.assertEqual(first_metal.weight.dtype, torch.uint32)

    def test_quantize_on_the_fly_dtype_is_not_uint32(self):
        """In quantize-on-the-fly mode the first ``MetalLinear`` found does not hold a uint32 weight."""
        from transformers.integrations.metal_quantization import MetalLinear, replace_with_metal_linear

        model = self._make_small_model()
        replace_with_metal_linear(model, quantization_config=MetalConfig(bits=4, group_size=64), pre_quantized=False)

        # Only the first match is inspected; nothing is asserted when there is none.
        first_metal = next((mod for mod in model.modules() if isinstance(mod, MetalLinear)), None)
        if first_metal is not None:
            self.assertNotEqual(first_metal.weight.dtype, torch.uint32)
@require_torch
class MetalConversionOpsTest(unittest.TestCase):
    """Exercise the ``MetalQuantize`` and ``MetalDequantize`` weight conversion operations."""

    def _make_quantizer(self, bits=4, group_size=64):
        """Return a ``MetalHfQuantizer`` for the given settings with ``pre_quantized`` set to ``False``."""
        hf_quantizer = MetalHfQuantizer(MetalConfig(bits=bits, group_size=group_size))
        hf_quantizer.pre_quantized = False
        return hf_quantizer

    @staticmethod
    def _dequantize_layer(dq_quantizer, quantized):
        """Run ``MetalDequantize`` on the ``layer.*`` tensors produced by ``MetalQuantize``."""
        from transformers.integrations.metal_quantization import MetalDequantize

        dequantize_op = MetalDequantize(dq_quantizer)
        collected = {
            "weight$": [quantized["layer.weight"]],
            "scales": [quantized["layer.scales"]],
            "qbiases": [quantized["layer.qbiases"]],
        }
        return dequantize_op.convert(collected, full_layer_name="layer.weight")

    def test_metal_quantize_produces_correct_keys(self):
        """Quantizing ``<prefix>.weight`` yields weight, scales and qbiases keys, with a uint32 weight."""
        from transformers.integrations.metal_quantization import MetalQuantize

        quantize_op = MetalQuantize(self._make_quantizer())
        converted = quantize_op.convert({"model.layer.weight": torch.randn(64, 256)})

        for expected_key in ("model.layer.weight", "model.layer.scales", "model.layer.qbiases"):
            self.assertIn(expected_key, converted)
        self.assertEqual(converted["model.layer.weight"].dtype, torch.uint32)

    def test_metal_quantize_preserves_original_dtype(self):
        """Scales and qbiases come back in the dtype of the input weight."""
        from transformers.integrations.metal_quantization import MetalQuantize

        quantize_op = MetalQuantize(self._make_quantizer())
        for dtype in (torch.float32, torch.float16, torch.bfloat16):
            converted = quantize_op.convert({"layer.weight": torch.randn(64, 256, dtype=dtype)})
            self.assertEqual(converted["layer.scales"].dtype, dtype, f"scales dtype mismatch for input {dtype}")
            self.assertEqual(converted["layer.qbiases"].dtype, dtype, f"qbiases dtype mismatch for input {dtype}")

    def test_metal_dequantize_returns_target_dtype(self):
        """MetalDequantize should return a tensor in the same dtype as the scales."""
        from transformers.integrations.metal_quantization import MetalQuantize

        quantizer = self._make_quantizer()
        for dtype in (torch.float16, torch.bfloat16):
            weight = torch.randn(64, 256, dtype=dtype)
            quantized = MetalQuantize(quantizer).convert({"layer.weight": weight})

            # A second quantizer configured as "pre-quantized checkpoint, dequantize on load".
            dq_quantizer = self._make_quantizer()
            dq_quantizer.pre_quantized = True
            dq_quantizer.quantization_config.dequantize = True

            restored = self._dequantize_layer(dq_quantizer, quantized)
            self.assertEqual(
                restored["layer.weight"].dtype, dtype, f"dequantized dtype should match scales ({dtype})"
            )

    def test_quantize_then_dequantize_roundtrip(self):
        """Quantize then dequantize: maximum absolute error against the input stays below 0.5."""
        from transformers.integrations.metal_quantization import MetalQuantize

        quantize_op = MetalQuantize(self._make_quantizer(bits=4, group_size=64))
        weight = torch.randn(64, 256)
        quantized = quantize_op.convert({"layer.weight": weight})

        restored = self._dequantize_layer(self._make_quantizer(bits=4, group_size=64), quantized)
        recovered = restored["layer.weight"].float()
        worst_error = (weight - recovered).abs().max().item()
        self.assertLess(worst_error, 0.5, "Quantize -> Dequantize round-trip error too large")
@require_torch
class MetalWeightConversionsTest(unittest.TestCase):
    """Exercise ``MetalHfQuantizer.get_weight_conversions`` for combinations of its two flags."""

    def test_get_weight_conversions_empty_when_not_dequantize(self):
        """Pre-quantized without ``dequantize``: no conversions are returned."""
        hf_quantizer = MetalHfQuantizer(MetalConfig())
        hf_quantizer.pre_quantized = True
        self.assertEqual(hf_quantizer.get_weight_conversions(), [])

    def test_get_weight_conversions_has_entry_when_dequantize(self):
        """Pre-quantized with ``dequantize=True``: exactly one conversion is returned."""
        hf_quantizer = MetalHfQuantizer(MetalConfig(dequantize=True))
        hf_quantizer.pre_quantized = True
        self.assertEqual(len(hf_quantizer.get_weight_conversions()), 1)

    def test_get_weight_conversions_empty_when_not_prequantized(self):
        """Not pre-quantized, even with ``dequantize=True``: no conversions are returned."""
        hf_quantizer = MetalHfQuantizer(MetalConfig(dequantize=True))
        hf_quantizer.pre_quantized = False
        self.assertEqual(hf_quantizer.get_weight_conversions(), [])
@require_torch
class MetalModelConversionTest(unittest.TestCase):
    """Check module conversion and ``param_needs_quantization`` on a model built on the meta device."""

    def setUp(self):
        gc.collect()

    def tearDown(self):
        gc.collect()

    @staticmethod
    def _meta_opt_model():
        """Build the tiny OPT test model with its parameters on the meta device."""
        tiny_config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-OPTForCausalLM")
        with torch.device("meta"):
            return OPTForCausalLM(tiny_config)

    def test_quantized_model_conversion(self):
        """Every ``nn.Linear`` counted before conversion is a ``MetalLinear`` afterwards."""
        from transformers.integrations.metal_quantization import MetalLinear, replace_with_metal_linear

        model = self._meta_opt_model()
        metal_config = MetalConfig(bits=4, group_size=64)

        linear_count = len([mod for mod in model.modules() if isinstance(mod, nn.Linear)])
        model = replace_with_metal_linear(model, quantization_config=metal_config, pre_quantized=True)
        metal_count = len([mod for mod in model.modules() if isinstance(mod, MetalLinear)])
        self.assertEqual(linear_count, metal_count)

    def test_quantized_model_conversion_with_exclusion(self):
        """Converted modules plus the excluded ``out_proj`` linears add up to the original linear count."""
        from transformers.integrations.metal_quantization import MetalLinear, replace_with_metal_linear

        model = self._meta_opt_model()
        metal_config = MetalConfig(bits=4, group_size=64)

        linear_count = len([mod for mod in model.modules() if isinstance(mod, nn.Linear)])
        model = replace_with_metal_linear(
            model, modules_to_not_convert=["out_proj"], quantization_config=metal_config, pre_quantized=True
        )
        metal_count = len([mod for mod in model.modules() if isinstance(mod, MetalLinear)])
        excluded_count = len(
            [mod for name, mod in model.named_modules() if "out_proj" in name and isinstance(mod, nn.Linear)]
        )
        self.assertEqual(metal_count + excluded_count, linear_count)

    def test_param_needs_quantization(self):
        """On-the-fly mode: only ``weight`` of a ``MetalLinear`` needs quantization, not scales/qbiases."""
        from transformers.integrations.metal_quantization import MetalLinear, replace_with_metal_linear

        model = self._meta_opt_model()
        metal_config = MetalConfig(bits=4, group_size=64)
        replace_with_metal_linear(model, quantization_config=metal_config, pre_quantized=False)

        hf_quantizer = MetalHfQuantizer(metal_config)
        hf_quantizer.pre_quantized = False

        for name, mod in model.named_modules():
            if not isinstance(mod, MetalLinear):
                continue
            self.assertTrue(hf_quantizer.param_needs_quantization(model, f"{name}.weight"))
            self.assertFalse(hf_quantizer.param_needs_quantization(model, f"{name}.scales"))
            self.assertFalse(hf_quantizer.param_needs_quantization(model, f"{name}.qbiases"))

    def test_param_needs_quantization_prequantized_is_false(self):
        """Pre-quantized mode: ``weight`` of a ``MetalLinear`` must not be flagged for quantization."""
        from transformers.integrations.metal_quantization import MetalLinear, replace_with_metal_linear

        model = self._meta_opt_model()
        metal_config = MetalConfig(bits=4, group_size=64)
        replace_with_metal_linear(model, quantization_config=metal_config, pre_quantized=True)

        hf_quantizer = MetalHfQuantizer(metal_config)
        hf_quantizer.pre_quantized = True

        for name, mod in model.named_modules():
            if not isinstance(mod, MetalLinear):
                continue
            self.assertFalse(
                hf_quantizer.param_needs_quantization(model, f"{name}.weight"),
                "Pre-quantized weights should not be re-quantized",
            )
@slow
@require_torch
class MetalSlowIntegrationTest(unittest.TestCase):
    """Slow tests that actually load a Metal-quantized checkpoint.

    MPS is patched as unavailable while the model is loaded, so no MPS device is required.
    """

    model_id = "medmekk/Llama-3.2-1B-Instruct-metal"

    def setUp(self):
        gc.collect()

    def tearDown(self):
        gc.collect()

    def test_load_prequantized_dequantize_on_cpu(self):
        """Load the checkpoint on CPU with ``dequantize=True``; no parameter may remain uint32."""
        with _patch_no_mps():
            metal_config = MetalConfig(dequantize=True)
            model = AutoModelForCausalLM.from_pretrained(
                self.model_id, quantization_config=metal_config, device_map="cpu"
            )

        self.assertIsNotNone(model)
        for weight in model.parameters():
            self.assertNotEqual(weight.dtype, torch.uint32, "All weights should be dequantized")

    def test_quantized_model(self):
        """Load the checkpoint on ``torch_device`` and compare greedy generation with a fixed string."""
        with _patch_no_mps():
            metal_config = MetalConfig(bits=4, group_size=64)
            model = AutoModelForCausalLM.from_pretrained(
                self.model_id, quantization_config=metal_config, device_map=torch_device
            )

        tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        self.assertIsNotNone(model)

        prompt = "Hello, how are you?"
        EXPECTED_OUTPUT = "Hello, how are you? I'm doing well, thanks for asking. I"

        prompt_ids = tokenizer.encode(prompt, return_tensors="pt").to(torch_device)
        generated = model.generate(prompt_ids, max_new_tokens=10, do_sample=False)
        decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
        self.assertEqual(decoded, EXPECTED_OUTPUT)

View File

View File

@@ -0,0 +1,591 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest
from contextlib import ExitStack, contextmanager
from unittest.mock import patch
from transformers import AutoTokenizer, GptOssForCausalLM, Mxfp4Config
from transformers.testing_utils import (
require_kernels,
require_torch,
require_torch_gpu,
require_torch_large_accelerator,
require_triton,
slow,
torch_device,
)
from transformers.utils import (
is_torch_available,
)
if is_torch_available():
    import torch

    # Select REQUIRE_TRITON_MXFP4 from the active backend: CUDA asks for triton>=3.4.0,
    # XPU and CPU ask for triton>=3.5.0, and any other device gets an unconditional skip.
    if torch.cuda.is_available():
        REQUIRE_TRITON_MXFP4 = require_triton(min_version="3.4.0")
    elif (hasattr(torch, "xpu") and torch.xpu.is_available()) or torch_device == "cpu":
        REQUIRE_TRITON_MXFP4 = require_triton(min_version="3.5.0")
    else:
        REQUIRE_TRITON_MXFP4 = unittest.skip("test requires CUDA or XPU")
def _empty_accelerator_cache():
if torch.cuda.is_available():
torch.cuda.empty_cache()
elif hasattr(torch, "xpu") and torch.xpu.is_available():
torch.xpu.empty_cache()
@contextmanager
def _patch_no_accelerator():
with ExitStack() as stack:
stack.enter_context(patch("torch.cuda.is_available", return_value=False))
if hasattr(torch, "xpu"):
stack.enter_context(patch("torch.xpu.is_available", return_value=False))
stack.enter_context(patch("torch.accelerator.current_accelerator", return_value=None))
yield
class Mxfp4ConfigTest(unittest.TestCase):
    """Exercise ``Mxfp4Config`` construction and (de)serialization."""

    def test_basic_config_creation(self):
        """A config built without arguments carries the expected defaults."""
        cfg = Mxfp4Config()
        self.assertEqual(cfg.quant_method.value, "mxfp4")
        self.assertIsNone(cfg.modules_to_not_convert)
        self.assertFalse(cfg.dequantize)

    def test_config_with_modules_to_not_convert(self):
        """The list of modules to skip is stored as given."""
        skipped = ["model.layers.*.self_attn", "lm_head"]
        cfg = Mxfp4Config(modules_to_not_convert=skipped)
        self.assertEqual(cfg.modules_to_not_convert, skipped)

    def test_config_with_dequantize(self):
        """``dequantize=True`` is stored on the config."""
        self.assertTrue(Mxfp4Config(dequantize=True).dequantize)

    def test_get_loading_attributes(self):
        """``get_loading_attributes`` reports ``dequantize`` with its configured value."""
        loading_attrs = Mxfp4Config(dequantize=True).get_loading_attributes()
        self.assertEqual(loading_attrs["dequantize"], True)

    def test_to_dict(self):
        """``to_dict`` keeps the quant method and skipped modules but omits ``dequantize``."""
        serialized = Mxfp4Config(modules_to_not_convert=["lm_head"], dequantize=True).to_dict()
        self.assertEqual(serialized["quant_method"], "mxfp4")
        self.assertEqual(serialized["modules_to_not_convert"], ["lm_head"])
        # ``dequantize`` is deliberately not kept in the serialized dict.
        self.assertNotIn("dequantize", serialized)

    def test_from_dict(self):
        """``from_dict`` restores skipped modules and the ``dequantize`` flag."""
        payload = {"quant_method": "mxfp4", "modules_to_not_convert": ["lm_head"], "dequantize": True}
        cfg = Mxfp4Config.from_dict(payload)
        self.assertEqual(cfg.modules_to_not_convert, ["lm_head"])
        self.assertTrue(cfg.dequantize)
class Mxfp4QuantizerTest(unittest.TestCase):
    """Exercise ``Mxfp4HfQuantizer.validate_environment`` and related quantizer properties."""

    def setUp(self):
        gc.collect()
        _empty_accelerator_cache()

        from transformers.utils.logging import warning_once

        # Reset the cache of ``warning_once`` before each test; presumably so a warning
        # already emitted by an earlier test is emitted again and can be captured here.
        warning_once.cache_clear()

    def test_quantizer_validation_no_torch(self):
        """Validation raises ``ImportError`` when torch is reported as unavailable."""
        with patch("transformers.quantizers.quantizer_mxfp4.is_torch_available", return_value=False):
            from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

            config = Mxfp4Config()
            quantizer = Mxfp4HfQuantizer(config)

            with self.assertRaises(ImportError):
                quantizer.validate_environment()

    def test_quantizer_validation_no_accelerator(self):
        """Validation does not raise when no CUDA/XPU accelerator is available."""
        with (
            _patch_no_accelerator(),
            patch("transformers.quantizers.quantizer_mxfp4.is_triton_available", return_value=True),
            patch("transformers.quantizers.quantizer_mxfp4.is_kernels_available", return_value=True),
            patch("transformers.quantizers.quantizer_mxfp4.Mxfp4HfQuantizer._lazy_import_kernels"),
        ):
            from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

            config = Mxfp4Config()
            quantizer = Mxfp4HfQuantizer(config)
            quantizer.pre_quantized = False

            # CPU is supported for MXFP4, so this call is expected to succeed.
            quantizer.validate_environment()

    @require_torch_gpu
    def test_quantizer_validation_low_compute_capability(self):
        """Quantizing on the fly on a CUDA device reporting capability (7, 0) raises ``ValueError``."""
        with patch("torch.cuda.get_device_capability", return_value=(7, 0)):
            from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

            config = Mxfp4Config()
            quantizer = Mxfp4HfQuantizer(config)
            quantizer.pre_quantized = False

            with self.assertRaises(ValueError):
                quantizer.validate_environment()

    @require_torch_gpu
    def test_quantizer_validation_low_compute_capability_with_prequantized(self):
        """Pre-quantized model on a CUDA device reporting capability (7, 0) falls back to dequantize."""
        with patch("torch.cuda.get_device_capability", return_value=(7, 0)):
            from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

            config = Mxfp4Config()
            quantizer = Mxfp4HfQuantizer(config)
            # Set explicitly, like the other pre-quantized tests in this class, so the
            # test does not depend on the quantizer's default for ``pre_quantized``.
            quantizer.pre_quantized = True

            # Should automatically set dequantize=True and warn
            quantizer.validate_environment()
            self.assertTrue(quantizer.quantization_config.dequantize)

    @require_torch_gpu
    def test_quantizer_validation_low_compute_capability_with_dequantize(self):
        """With ``dequantize=True`` a low compute capability must not cause a capability error."""
        with patch("torch.cuda.get_device_capability", return_value=(7, 0)):
            from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

            config = Mxfp4Config(dequantize=True)
            quantizer = Mxfp4HfQuantizer(config)

            # Only a ValueError mentioning "compute capability" fails the test; any other
            # ValueError is deliberately tolerated here.
            try:
                quantizer.validate_environment()
            except ValueError as e:
                if "compute capability" in str(e):
                    self.fail("Should not raise compute capability error when dequantize=True")

    def test_quantizer_validation_order_dequantize_before_accelerator_check(self):
        """Validation passes without CUDA/XPU both with ``dequantize=True`` and with ``dequantize=False``."""
        # Simulate a host with no CUDA/XPU accelerator while triton and kernels are
        # reported as installed and the lazy kernel import is stubbed out.
        with (
            _patch_no_accelerator(),
            patch("transformers.quantizers.quantizer_mxfp4.is_triton_available", return_value=True),
            patch("transformers.quantizers.quantizer_mxfp4.is_kernels_available", return_value=True),
            patch("transformers.quantizers.quantizer_mxfp4.Mxfp4HfQuantizer._lazy_import_kernels"),
        ):
            from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

            # dequantize=True: should pass even without CUDA/XPU.
            config = Mxfp4Config(dequantize=True)
            quantizer = Mxfp4HfQuantizer(config)
            quantizer.validate_environment()

            # dequantize=False, quantizing on the fly: also expected to pass, because
            # CPU is supported for MXFP4 (no error is asserted here).
            config = Mxfp4Config(dequantize=False)
            quantizer = Mxfp4HfQuantizer(config)
            quantizer.pre_quantized = False
            quantizer.validate_environment()

    def test_quantizer_validation_missing_triton(self):
        """Quantizing on the fly without triton and kernels raises ``ValueError``."""
        with (
            patch("transformers.quantizers.quantizer_mxfp4.is_triton_available", return_value=False),
            patch("transformers.quantizers.quantizer_mxfp4.is_kernels_available", return_value=False),
        ):
            from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

            config = Mxfp4Config()
            quantizer = Mxfp4HfQuantizer(config)
            quantizer.pre_quantized = False

            with self.assertRaises(ValueError):
                quantizer.validate_environment()

    def test_quantizer_validation_missing_triton_pre_quantized_no_dequantize(self):
        """Pre-quantized model without triton and kernels falls back to dequantize instead of raising."""
        with (
            patch("transformers.quantizers.quantizer_mxfp4.is_triton_available", return_value=False),
            patch("transformers.quantizers.quantizer_mxfp4.is_kernels_available", return_value=False),
        ):
            from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

            config = Mxfp4Config()
            quantizer = Mxfp4HfQuantizer(config)
            quantizer.pre_quantized = True

            # Should automatically set dequantize=True and warn
            quantizer.validate_environment()
            self.assertTrue(quantizer.quantization_config.dequantize)

    def test_is_trainable(self):
        """The MXFP4 quantizer reports itself as not trainable."""
        from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

        config = Mxfp4Config()
        quantizer = Mxfp4HfQuantizer(config)

        self.assertFalse(quantizer.is_trainable)

    @require_torch_gpu
    def test_warning_distinguishes_triton_from_kernels(self):
        """When only one dependency is missing, warning should mention it specifically."""
        from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

        # Missing kernels only -> warning should mention kernels
        config = Mxfp4Config()
        quantizer = Mxfp4HfQuantizer(config)
        quantizer.pre_quantized = True

        with (
            patch("transformers.quantizers.quantizer_mxfp4.is_triton_available", return_value=True),
            patch("transformers.quantizers.quantizer_mxfp4.is_kernels_available", return_value=False),
            self.assertLogs("transformers", level="WARNING") as cm,
        ):
            quantizer.validate_environment()

        warning_text = " ".join(cm.output)
        self.assertIn("kernels", warning_text.lower())
        self.assertTrue(quantizer.quantization_config.dequantize)

        # Missing triton only -> warning should mention triton
        config = Mxfp4Config()
        quantizer = Mxfp4HfQuantizer(config)
        quantizer.pre_quantized = True

        with (
            patch("transformers.quantizers.quantizer_mxfp4.is_triton_available", return_value=False),
            patch("transformers.quantizers.quantizer_mxfp4.is_kernels_available", return_value=True),
            self.assertLogs("transformers", level="WARNING") as cm,
        ):
            quantizer.validate_environment()

        warning_text = " ".join(cm.output)
        self.assertIn("triton", warning_text.lower())
        self.assertTrue(quantizer.quantization_config.dequantize)

    @require_torch_gpu
    def test_error_distinguishes_triton_from_kernels(self):
        """When quantizing without a dependency, ValueError should mention it specifically."""
        from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

        # Missing kernels only -> error should mention kernels
        config = Mxfp4Config()
        quantizer = Mxfp4HfQuantizer(config)
        quantizer.pre_quantized = False

        with (
            patch("transformers.quantizers.quantizer_mxfp4.is_triton_available", return_value=True),
            patch("transformers.quantizers.quantizer_mxfp4.is_kernels_available", return_value=False),
        ):
            with self.assertRaises(ValueError) as ctx:
                quantizer.validate_environment()
        self.assertIn("kernels", str(ctx.exception).lower())

        # Missing triton only -> error should mention triton
        config = Mxfp4Config()
        quantizer = Mxfp4HfQuantizer(config)
        quantizer.pre_quantized = False

        with (
            patch("transformers.quantizers.quantizer_mxfp4.is_triton_available", return_value=False),
            patch("transformers.quantizers.quantizer_mxfp4.is_kernels_available", return_value=True),
        ):
            with self.assertRaises(ValueError) as ctx:
                quantizer.validate_environment()
        self.assertIn("triton", str(ctx.exception).lower())
class Mxfp4IntegrationTest(unittest.TestCase):
    """Tests for the standalone helper functions used by the mxfp4 integration."""

    def test_should_convert_module(self):
        """Modules are converted by default and skipped only when they match an exclusion pattern."""
        from transformers.quantizers.quantizers_utils import should_convert_module

        # With no exclusion list (None or empty) the module must be converted.
        for no_exclusions in (None, []):
            self.assertTrue(should_convert_module("model", no_exclusions))

        # A name present in the exclusion list is skipped; an unrelated name is not.
        excluded_patterns = ["model.layers.*.self_attn", "lm_head"]
        self.assertFalse(should_convert_module("lm_head", excluded_patterns))
        self.assertTrue(should_convert_module("experts", excluded_patterns))

    @require_torch
    def test_convert_moe_packed_tensors(self):
        """Unpack random packed blocks/scales and check the resulting shape and dtype."""
        from transformers.integrations.mxfp4 import convert_moe_packed_tensors

        # Dummy packed inputs; only the output shape and dtype are checked, not the values.
        packed_blocks = torch.randint(0, 255, (2, 4, 8, 16), dtype=torch.uint8)
        packed_scales = torch.randint(100, 150, (2, 4, 8), dtype=torch.uint8)

        unpacked = convert_moe_packed_tensors(packed_blocks, packed_scales, dtype=torch.bfloat16)

        self.assertEqual(unpacked.shape, (2, 8 * 16 * 2, 4))
        self.assertEqual(unpacked.dtype, torch.bfloat16)

    @REQUIRE_TRITON_MXFP4
    @require_kernels
    @require_torch
    def test_quantize_to_mxfp4(self):
        """Quantize a random bfloat16 tensor and check that the quantized weights come back as uint8."""
        from transformers.integrations.mxfp4 import quantize_to_mxfp4
        from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

        quantizer = Mxfp4HfQuantizer(Mxfp4Config())

        # Dummy weight tensor allocated directly on the test device.
        target_device = torch.device(torch_device)
        weight = torch.randn(32, 64, 128, dtype=torch.bfloat16, device=target_device)

        quantized_weight, _weight_scale = quantize_to_mxfp4(weight, quantizer._lazy_import_kernels())

        # Only the dtype of the quantized weights is asserted here.
        self.assertEqual(quantized_weight.dtype, torch.uint8)
@require_torch
@require_torch_large_accelerator
@REQUIRE_TRITON_MXFP4
@require_kernels
@slow
class Mxfp4ModelTest(unittest.TestCase):
    """Test mxfp4 with actual models (requires specific model and hardware)."""

    # These should be paths to real OpenAI MoE models for proper testing
    model_name = "openai/gpt-oss-20b"
    input_text = "Once upon a time"

    # Completions of `input_text` accepted by `check_inference_correctness_quantized`.
    EXPECTED_OUTPUTS = {"Once upon a time, in a small town, there lived a young"}

    def setUp(self):
        """Run the garbage collector and empty the accelerator cache before each test."""
        gc.collect()
        _empty_accelerator_cache()

    def tearDown(self):
        """Run the garbage collector and empty the accelerator cache after each test."""
        gc.collect()
        _empty_accelerator_cache()

    def check_inference_correctness_quantized(self, model, tokenizer):
        """Generate 10 new tokens without sampling and assert the text is one of `EXPECTED_OUTPUTS`.

        Args:
            model: Loaded model; its `device` receives the encoded prompt and its `generate` is called.
            tokenizer: Tokenizer for `model`. Its `pad_token` is set to `eos_token` when it is `None`.
        """
        # Check that inference pass works on the model
        encoded_input = tokenizer(self.input_text, return_tensors="pt").to(model.device)

        # Set pad token if not set
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        with torch.no_grad():
            output_sequences = model.generate(
                **encoded_input,
                max_new_tokens=10,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
                use_cache=False,
            )

        generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
        self.assertIn(generated_text, self.EXPECTED_OUTPUTS)

    def test_gpt_oss_model_loading_quantized_with_device_map(self):
        """Test loading OpenAI MoE model with mxfp4 quantization and device_map"""
        model = GptOssForCausalLM.from_pretrained(
            self.model_name,
            dtype=torch.bfloat16,
            device_map="auto",
        )
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.check_inference_correctness_quantized(model, tokenizer)

    def test_gpt_oss_model_loading_dequantized_with_device_map(self):
        """Test loading OpenAI MoE model with mxfp4 dequantization and device_map"""
        quantization_config = Mxfp4Config(dequantize=True)

        # Test that config is properly set up
        self.assertTrue(quantization_config.dequantize)

        model = GptOssForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=quantization_config,
            dtype=torch.bfloat16,
            device_map="auto",
        )
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.check_inference_correctness_quantized(model, tokenizer)

    def test_model_device_map_validation(self):
        """Test that environment validation accepts a device map that places everything on CPU."""
        from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

        quantizer = Mxfp4HfQuantizer(Mxfp4Config())
        quantizer.pre_quantized = False

        # Test with CPU in device map (CPU already support mxfp4); the call must not raise.
        quantizer.validate_environment(device_map={"": "cpu"})

    def test_memory_footprint_comparison(self):
        """Test that the quantized model has a smaller memory footprint than the dequantized one."""
        # Expected: quantized < dequantized < unquantized memory usage
        quantization_config = Mxfp4Config(dequantize=True)
        quantized_model = GptOssForCausalLM.from_pretrained(
            self.model_name,
            dtype=torch.bfloat16,
            device_map="auto",
        )
        dequantized_model = GptOssForCausalLM.from_pretrained(
            self.model_name,
            dtype=torch.bfloat16,
            device_map="auto",
            quantization_config=quantization_config,
        )
        quantized_mem = quantized_model.get_memory_footprint()
        dequantized_mem = dequantized_model.get_memory_footprint()
        self.assertLess(quantized_mem, dequantized_mem)

    def test_save_mxfp4(self):
        """Test saving quantized OpenAI MoE model with device_map"""
        model = GptOssForCausalLM.from_pretrained(
            self.model_name,
            dtype=torch.bfloat16,
            device_map="auto",
        )
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        with tempfile.TemporaryDirectory() as tmp:
            # Save the model in mxfp4 format
            model.save_pretrained(tmp)
            _empty_accelerator_cache()
            gc.collect()

            # test quantized model
            loaded_model = GptOssForCausalLM.from_pretrained(
                tmp,
                dtype=torch.bfloat16,
                device_map="auto",
            )
            self.check_inference_correctness_quantized(loaded_model, tokenizer)

            # test dequantized model
            loaded_model = GptOssForCausalLM.from_pretrained(
                tmp,
                quantization_config=Mxfp4Config(dequantize=True),
                dtype=torch.bfloat16,
                device_map="auto",
            )
            self.check_inference_correctness_quantized(loaded_model, tokenizer)

    def test_save_mxfp4_non_quantized(self):
        """Test quantizing a bf16 OpenAI MoE checkpoint on load, saving it, and reloading it both ways."""
        non_quantized_model_name = "hf-internal-testing/gpt-oss-20b-bf16"
        tokenizer = AutoTokenizer.from_pretrained(non_quantized_model_name)
        loaded_model = GptOssForCausalLM.from_pretrained(
            non_quantized_model_name,
            quantization_config=Mxfp4Config(),
            dtype=torch.bfloat16,
            device_map="auto",
        )

        # save the quantized model
        with tempfile.TemporaryDirectory() as tmp:
            loaded_model.save_pretrained(tmp)
            _empty_accelerator_cache()
            gc.collect()

            # load it back to check that everything works as expected
            loaded_model = GptOssForCausalLM.from_pretrained(
                tmp,
                dtype=torch.bfloat16,
                device_map="auto",
            )
            self.check_inference_correctness_quantized(loaded_model, tokenizer)

            # Reload dequantized. The config field is `dequantize` (as used by the other
            # tests in this class); the previous `dequantized=True` spelling did not match it.
            loaded_model = GptOssForCausalLM.from_pretrained(
                tmp,
                quantization_config=Mxfp4Config(dequantize=True),
                dtype=torch.bfloat16,
                device_map="auto",
            )
            self.check_inference_correctness_quantized(loaded_model, tokenizer)

    def test_compute_module_sizes(self):
        r"""
        Test if we compute the right module sizes needed to generate the device map.
        Also test if we get the right values for `total_byte_count` in `caching_allocator_warmup`.
        """
        from transformers import AutoConfig, AutoModelForCausalLM
        from transformers.integrations import Mxfp4GptOssExperts
        from transformers.integrations.accelerate import compute_module_sizes
        from transformers.modeling_utils import expand_device_map, get_total_byte_count
        from transformers.quantizers import AutoHfQuantizer

        # we need to preprocess the model like that because device_map calculation happens before we load the weights inside the model.
        # For normal weights, it's fine but for quantized weights, the tensors dtype might change during loading.
        with torch.device("meta"):
            config = AutoConfig.from_pretrained(self.model_name)
            model = AutoModelForCausalLM.from_config(config, dtype=torch.bfloat16)
            model_size, _ = compute_module_sizes(model, only_modules=False)

            expected_keys = [name for name, _ in model.named_parameters()] + [
                name for name, _ in model.named_buffers()
            ]
            expanded_device_map = expand_device_map({"": torch_device}, expected_keys)
            total_byte_count = list(get_total_byte_count(model, expanded_device_map).values())[0]

            # testing prequantized = False should be enough, the shape should be the same whether it is pre-quantized or not
            hf_quantizer = AutoHfQuantizer.from_config(Mxfp4Config(), pre_quantized=False)
            hf_quantizer.preprocess_model(model=model, config=model.config)
            quantized_model_size, _ = compute_module_sizes(model, hf_quantizer, only_modules=False)

            # Parameter/buffer names are collected again after `preprocess_model` has run.
            expected_keys = [name for name, _ in model.named_parameters()] + [
                name for name, _ in model.named_buffers()
            ]
            expanded_device_map = expand_device_map({"": torch_device}, expected_keys)
            quantized_total_byte_count = list(
                get_total_byte_count(model, expanded_device_map, hf_quantizer).values()
            )[0]

        for name, module in model.named_modules():
            if isinstance(module, Mxfp4GptOssExperts):
                # from 16 bits to 4 bits
                self.assertEqual(
                    int(model_size[f"{name}.gate_up_proj"] // 4),
                    int(quantized_model_size[f"{name}.gate_up_proj"]),
                )
                self.assertEqual(
                    int(model_size[f"{name}.down_proj"] // 4),
                    int(quantized_model_size[f"{name}.down_proj"]),
                )

        # check that we get the same value, as we use `compute_module_sizes` in `get_total_byte_count`
        self.assertEqual(total_byte_count, model_size[""])
        self.assertEqual(quantized_total_byte_count, quantized_model_size[""])

        # we should at least have 3 times memory reduction in total for this model
        self.assertGreater(model_size[""], quantized_model_size[""] * 3)

View File

@@ -0,0 +1,328 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tempfile
import unittest
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, QuantoConfig
from transformers.testing_utils import (
require_accelerate,
require_optimum_quanto,
require_torch_accelerator,
slow,
torch_device,
)
from transformers.utils import is_optimum_quanto_available, is_torch_available
if is_torch_available():
import torch
if is_optimum_quanto_available():
from optimum.quanto import QLayerNorm, QLinear
from transformers.integrations.quanto import replace_with_quanto_layers
@require_optimum_quanto
@require_accelerate
class QuantoTestIntegration(unittest.TestCase):
    """Check that `replace_with_quanto_layers` swaps the expected modules of a meta-device model."""

    model_id = "HuggingFaceTB/SmolLM3-3B"

    def setUp(self):
        """Build the model on the meta device and record how many Linear / LayerNorm modules it has."""
        config = AutoConfig.from_pretrained(self.model_id)
        with torch.device("meta"):
            self.model = AutoModelForCausalLM.from_config(config)

        self.nb_linear = 0
        self.nb_layernorm = 0
        for module in self.model.modules():
            if isinstance(module, torch.nn.Linear):
                self.nb_linear += 1
            elif isinstance(module, torch.nn.LayerNorm):
                self.nb_layernorm += 1

    def test_weight_only_quantization_conversion(self):
        """
        Simple test that checks if the quantized model has been converted properly when using weight only quantization
        """
        # Try with weight only quantization
        quantization_config = QuantoConfig(weights="int8", activations=None)
        self.model = replace_with_quanto_layers(self.model, quantization_config=quantization_config)

        nb_qlinear = sum(isinstance(module, QLinear) for module in self.model.modules())
        self.assertEqual(self.nb_linear, nb_qlinear)

    def test_weight_and_activation_quantization_conversion(self):
        """
        Simple test that checks if the quantized model has been converted properly when using weight + activation quantization
        """
        # Try with weight + activation quantization
        quantization_config = QuantoConfig(weights="int8", activations="int8")
        self.model = replace_with_quanto_layers(self.model, quantization_config=quantization_config)

        nb_qlinear = 0
        nb_qlayernorm = 0
        for module in self.model.modules():
            if isinstance(module, QLinear):
                nb_qlinear += 1
            if isinstance(module, QLayerNorm):
                nb_qlayernorm += 1

        self.assertEqual(self.nb_linear, nb_qlinear)
        self.assertEqual(self.nb_layernorm, nb_qlayernorm)

    def test_conversion_with_modules_to_not_convert(self):
        """
        Simple test that checks if the quantized model has been converted properly when specifying modules_to_not_convert argument
        """
        # Try with weight + activation quantization
        quantization_config = QuantoConfig(weights="int8", activations="int8")
        self.model = replace_with_quanto_layers(
            self.model, quantization_config=quantization_config, modules_to_not_convert=["lm_head"]
        )

        # Only the QLinear count is asserted here; exactly one Linear (`lm_head`) is expected
        # to be left unconverted.
        nb_qlinear = sum(isinstance(module, QLinear) for module in self.model.modules())
        self.assertEqual(self.nb_linear - 1, nb_qlinear)
@slow
@require_torch_accelerator
@require_optimum_quanto
@require_accelerate
class QuantoQuantizationTest(unittest.TestCase):
    """
    Test 8-bit weights only quantization
    """

    model_name = "HuggingFaceTB/SmolLM2-135M"
    weights = "int8"
    activations = None
    device_map = "cpu"

    input_text = "Hello my name is"
    # NOTE(review): this is a `str`, so `assertIn(decoded, EXPECTED_OUTPUTS)` below is a substring
    # check rather than set membership — confirm whether that leniency is intended.
    EXPECTED_OUTPUTS = "Hello my name is John. I am a student of the University of"

    def setUp(self):
        """
        Setup quantized model
        """
        quantization_config = QuantoConfig(
            weights=self.weights,
            activations=self.activations,
        )

        self.quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            device_map=self.device_map,
            quantization_config=quantization_config,
            dtype=torch.float32,
        )

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # Truthy only when the model carries an `hf_device_map` with more than one entry;
        # in that case `check_inference_correctness` does not move the model itself.
        self.have_accelerate_hooks = (
            getattr(self.quantized_model, "hf_device_map", False) and len(self.quantized_model.hf_device_map) > 1
        )

    def check_inference_correctness(self, model, device):
        r"""
        Test the generation quality of the quantized model and see that we are matching the expected output.
        Given that we are operating on small numbers + the testing model is relatively small, we might not get
        the same output across GPUs. So we'll generate few tokens (5-10) and check their output.
        """
        if not self.have_accelerate_hooks:
            model.to(device)
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
        output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10)
        self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)

    def test_generate_quality_cpu(self):
        """
        Simple test to check the quality of the model on cpu by comparing the generated tokens with the expected tokens
        """
        self.check_inference_correctness(self.quantized_model, "cpu")

    def test_generate_quality_accelerator(self):
        """
        Simple test to check the quality of the model on accelerators by comparing the generated tokens with the expected tokens
        """
        self.check_inference_correctness(self.quantized_model, torch_device)

    def test_quantized_model_layers(self):
        """
        Suite of simple test to check if the layers are quantized and are working properly
        """
        from optimum.quanto import QBitsTensor, QModuleMixin, QTensor

        k_proj = self.quantized_model.model.layers[0].self_attn.k_proj

        # Test the type of the quantized layer
        self.assertIsInstance(k_proj, QModuleMixin)
        self.assertIsInstance(k_proj.weight, QTensor)
        if self.weights == "int4":
            self.assertIsInstance(k_proj.weight, QBitsTensor)

        # check that the lm_head was indeed not quantized, just like bnb
        self.assertIsInstance(self.quantized_model.lm_head, torch.nn.Linear)
        self.assertNotIsInstance(self.quantized_model.lm_head, QModuleMixin)

        if self.device_map in ["cpu", "cuda"]:
            self.assertEqual(
                self.quantized_model.model.layers[0].self_attn.k_proj.weight._data.device.type,
                self.device_map,
            )
            self.quantized_model.to(0)
        # The layer is looked up again because the model may have been moved just above.
        self.assertEqual(self.quantized_model.model.layers[0].self_attn.k_proj.weight._data.device.type, torch_device)

    def test_serialization_safetensors(self):
        """
        Test that saving the quantized model is refused with an explicit "not serializable" error
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            with self.assertRaises(ValueError) as e:
                self.quantized_model.save_pretrained(tmpdirname)
            self.assertIn("The model is quantized with quanto and is not serializable", str(e.exception))

    def check_same_model(self, model1, model2):
        """Assert both models have the same named parameters: keys, shape, device, dtype and values."""
        d0 = dict(model1.named_parameters())
        d1 = dict(model2.named_parameters())
        self.assertEqual(d0.keys(), d1.keys())
        for k in d0:
            self.assertEqual(d0[k].shape, d1[k].shape)
            self.assertEqual(d0[k].device.type, d1[k].device.type)
            self.assertEqual(d0[k].device, d1[k].device)
            self.assertEqual(d0[k].dtype, d1[k].dtype)
            self.assertTrue(torch.equal(d0[k], d1[k].to(d0[k].device)))

    def test_compare_with_quanto(self):
        """Quantize a fresh model directly with optimum-quanto and compare it with the transformers-quantized one."""
        from optimum.quanto import freeze, qint4, qint8, quantize

        w_mapping = {"int8": qint8, "int4": qint4}
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            device_map=self.device_map,
            dtype=torch.float32,
        )
        # we do not quantize the lm_head since we don't do that in transformers
        quantize(model.model, weights=w_mapping[self.weights])
        freeze(model.model)
        self.check_same_model(model, self.quantized_model)
        self.check_inference_correctness(model, device=torch_device)

    def test_compute_module_sizes(self):
        r"""
        Test if we compute the right module sizes needed to generate the device map.
        Also test if we get the right values for `total_byte_count` in `caching_allocator_warmup`.
        Note that `compute_module_sizes` is being used in `get_total_byte_count`
        """
        from transformers.integrations.accelerate import compute_module_sizes
        from transformers.modeling_utils import expand_device_map, get_total_byte_count
        from transformers.quantizers import AutoHfQuantizer

        # we need to preprocess the model like that because device_map calculation happens before we load the weights inside the model.
        # For normal weights, it's fine but for quantized weights, the tensors dtype might change during loading.
        with torch.device("meta"):
            config = AutoConfig.from_pretrained(self.model_name)
            model = AutoModelForCausalLM.from_config(config, dtype=torch.bfloat16)
            model_size, _ = compute_module_sizes(model, only_modules=False)

            expected_keys = [name for name, _ in model.named_parameters()] + [
                name for name, _ in model.named_buffers()
            ]
            expanded_device_map = expand_device_map({"": torch_device}, expected_keys)
            total_byte_count = list(get_total_byte_count(model, expanded_device_map).values())[0]

            # testing prequantized = False should be enough, the shape should be the same whether it is pre-quantized or not
            hf_quantizer = AutoHfQuantizer.from_config(QuantoConfig(weights="int4"), pre_quantized=False)
            hf_quantizer.preprocess_model(model=model, config=model.config)
            quantized_model_size, _ = compute_module_sizes(model, hf_quantizer, only_modules=False)

            # Parameter/buffer names are collected again after `preprocess_model` has run.
            expected_keys = [name for name, _ in model.named_parameters()] + [
                name for name, _ in model.named_buffers()
            ]
            expanded_device_map = expand_device_map({"": torch_device}, expected_keys)
            quantized_total_byte_count = list(
                get_total_byte_count(model, expanded_device_map, hf_quantizer).values()
            )[0]

        for name, module in model.named_modules():
            if isinstance(module, torch.nn.Linear) and "lm_head" not in name:
                # from 16 bits to 4 bits
                self.assertEqual(
                    int(model_size[f"{name}.weight"] // 4),
                    int(quantized_model_size[f"{name}.weight"]),
                )

        # check that we get the same value, as we use `compute_module_sizes` in `get_total_byte_count`
        self.assertEqual(total_byte_count, model_size[""])
        self.assertEqual(quantized_total_byte_count, quantized_model_size[""])

        # we should at least have 1.5 times memory reduction in total
        self.assertGreater(model_size[""], quantized_model_size[""] * 1.5)
class QuantoQuantizationQBitsTensorTest(QuantoQuantizationTest):
    """Re-run the `QuantoQuantizationTest` suite with `weights = "int4"` and its own expected text."""

    weights = "int4"
    EXPECTED_OUTPUTS = "Hello my name is joe and i am a little girl\n\n"
@require_torch_accelerator
class QuantoQuantizationActivationTest(unittest.TestCase):
    """Check that requesting activation quantization through transformers is rejected."""

    def test_quantize_activation(self):
        """Loading with `activations="int8"` must raise a `ValueError` carrying the unsupported-activations message."""
        int8_weights_and_activations = QuantoConfig(
            weights="int8",
            activations="int8",
        )

        with self.assertRaises(ValueError) as raised:
            AutoModelForCausalLM.from_pretrained(
                "HuggingFaceTB/SmolLM2-135M", quantization_config=int8_weights_and_activations
            )

        error_text = str(raised.exception)
        self.assertIn("We don't support quantizing the activations with transformers library", error_text)
@require_optimum_quanto
@require_torch_accelerator
class QuantoKVCacheQuantizationTest(unittest.TestCase):
    """Generation test using `cache_implementation="quantized"`."""

    @slow
    def test_quantized_cache(self):
        """Generate 40 new tokens for two prompts without sampling and compare with the reference completions."""
        checkpoint = "unsloth/Llama-3.2-1B-Instruct"

        prompts = [
            "Simply put, the theory of relativity states that ",
            "My favorite all time favorite condiment is ketchup.",
        ]
        EXPECTED_TEXT_COMPLETION = [
            "Simply put, the theory of relativity states that 1) time and space are not absolute, but are relative to the observer, and 2) the laws of physics are the same everywhere in the universe. This means that the speed of light is",
            "My favorite all time favorite condiment is ketchup. I love how it adds a sweet and tangy flavor to my food. I also enjoy using it as a dip for fries, burgers, and grilled meats. It's a classic condiment that never",
        ]

        # The batch is padded on the left with "</s>" as the pad token.
        tokenizer = AutoTokenizer.from_pretrained(checkpoint, pad_token="</s>", padding_side="left")
        model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="sequential", dtype=torch.float16)

        inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(torch_device)
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=40,
            do_sample=False,
            cache_implementation="quantized",
        )

        text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        self.assertEqual(EXPECTED_TEXT_COMPLETION, text)

View File

@@ -0,0 +1,152 @@
# Copyright 2025 Advanced Micro Devices, Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, QuarkConfig
from transformers.testing_utils import (
cleanup,
is_torch_available,
require_accelerate,
require_quark,
require_torch_gpu,
require_torch_multi_gpu,
slow,
torch_device,
)
from transformers.utils.import_utils import is_quark_available
if is_torch_available():
import torch
if is_quark_available():
from quark.torch.export.nn.modules.qparamslinear import QParamsLinear
@require_quark
class QuarkConfigTest(unittest.TestCase):
    """Smoke test for building a `QuarkConfig` from a hub checkpoint's quantization config."""

    def test_common_args(self):
        """`QuarkConfig` must accept the checkpoint's `quantization_config` entries as keyword arguments."""
        model_config = AutoConfig.from_pretrained("amd/Llama-3.1-8B-Instruct-w-int8-a-int8-sym-test")
        quantization_kwargs = model_config.quantization_config

        # The test passes as long as construction does not raise.
        QuarkConfig(**quantization_kwargs)
@slow
@require_quark
@require_torch_gpu
class QuarkTest(unittest.TestCase):
    """Compare a Quark-quantized checkpoint against its fp16 reference (memory, layer types, generation)."""

    reference_model_name = "unsloth/Meta-Llama-3.1-8B-Instruct"
    quantized_model_name = "amd/Llama-3.1-8B-Instruct-w-int8-a-int8-sym-test"

    input_text = "Today I am in Paris and"

    # Any of these decoded generations is accepted by `check_inference_correctness`.
    EXPECTED_OUTPUTS = {
        "Today I am in Paris and I am not in Paris, France\nToday I am in Paris, Illinois",
        "Today I am in Paris and I am enjoying the city of light. I am not just any ordinary Paris",
        "Today I am in Paris and I am enjoying my day off! The sun is shining, the birds are",
        "Today I am in Paris and I'm here to tell you about it. It's a beautiful day,",
        "Today I am in Paris and I am not in Paris at all! I am not in Paris, but",
        "Today I am in Paris and I am in Paris, but I am not in Paris\nToday I am",
        "Today I am in Paris and I am at the Luxembourg Congress Center\nToday I am in Paris and I",
        "Today I am in Paris and I'm at the Eiffel Tower, which is the tallest building in",
    }

    # Lower bound on (fp16 memory footprint) / (quantized memory footprint).
    EXPECTED_RELATIVE_DIFFERENCE = 1.66

    device_map = None

    @classmethod
    def setUpClass(cls):
        """
        Load the fp16 reference model (recording its memory footprint), the tokenizer and the quantized model
        """
        cls.model_fp16 = AutoModelForCausalLM.from_pretrained(
            cls.reference_model_name, dtype=torch.float16, device_map=cls.device_map
        )
        cls.mem_fp16 = cls.model_fp16.get_memory_footprint()

        cls.tokenizer = AutoTokenizer.from_pretrained(cls.reference_model_name, use_fast=True)

        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.quantized_model_name,
            dtype=torch.float16,
            device_map=cls.device_map,
        )

    def tearDown(self):
        r"""
        Runs after each test to free the accelerator memory and cache and to avoid unexpected behaviors.
        Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
        """
        cleanup(torch_device, gc_collect=True)

    def test_memory_footprint(self):
        """The fp16 footprint divided by the quantized footprint must exceed `EXPECTED_RELATIVE_DIFFERENCE`."""
        mem_quantized = self.quantized_model.get_memory_footprint()
        footprint_ratio = self.mem_fp16 / mem_quantized

        self.assertGreater(footprint_ratio, self.EXPECTED_RELATIVE_DIFFERENCE)

    def test_device_and_dtype_assignment(self):
        r"""
        Check that moving the quantized model to a device works when no device map is used,
        and that casting it to a dtype raises a `ValueError`.
        """
        # This should work
        if self.device_map is None:
            _ = self.quantized_model.to(0)

        with self.assertRaises(ValueError):
            # Tries with a `dtype`
            self.quantized_model.to(torch.float16)

    def test_quantized_layers_class(self):
        r"""
        A simple test to check if the model successfully changes the class type of the linear layers
        """
        first_gate_proj = self.quantized_model.model.layers[0].mlp.gate_proj
        self.assertIsInstance(first_gate_proj, QParamsLinear)

    def check_inference_correctness(self, model):
        r"""
        Generate exactly 15 new tokens from `input_text` without sampling (single beam, cache enabled)
        and check that the decoded text is one of `EXPECTED_OUTPUTS`.
        Several outputs are accepted because results may differ across GPUs.
        """
        # Check that inference pass works on the model
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt")

        gen_config = GenerationConfig(
            max_new_tokens=15,
            min_new_tokens=15,
            use_cache=True,
            num_beams=1,
            do_sample=False,
        )

        # The prompt ids are moved to device index 0 before generating.
        prompt_ids = encoded_input["input_ids"].to(0)
        output_sequences = model.generate(input_ids=prompt_ids, generation_config=gen_config)

        # Decode the first (only) sequence and compare against the accepted outputs.
        decoded_text = self.tokenizer.decode(output_sequences[0], skip_special_tokens=True)
        self.assertIn(decoded_text, self.EXPECTED_OUTPUTS)

    def test_generate_quality(self):
        """
        Simple test to check the quality of the model by comparing the generated tokens with the expected tokens
        """
        # Without a device map the model has to be moved to device 0 explicitly.
        if self.device_map is None:
            self.check_inference_correctness(self.quantized_model.to(0))
        else:
            self.check_inference_correctness(self.quantized_model)
@require_accelerate
@require_torch_multi_gpu
@require_quark
class QuarkTestDeviceMap(QuarkTest):
    """Re-run the `QuarkTest` suite with both models loaded using `device_map="auto"`."""

    device_map = "auto"

View File

@@ -0,0 +1,192 @@
# Copyright 2026 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest
from unittest.mock import patch
from transformers import AutoModelForCausalLM, AutoTokenizer, SinqConfig
from transformers.testing_utils import (
backend_empty_cache,
require_torch_gpu,
slow,
torch_device,
)
from transformers.utils import is_torch_available
if is_torch_available():
import torch
class SinqConfigTest(unittest.TestCase):
    """Unit tests for `SinqConfig` construction, (de)serialization and validation."""

    def test_default_config(self):
        """A config built without arguments exposes the expected default values."""
        default_config = SinqConfig()

        self.assertEqual(default_config.nbits, 4)
        self.assertEqual(default_config.group_size, 64)
        self.assertEqual(default_config.tiling_mode, "1D")
        self.assertEqual(default_config.method, "sinq")

    def test_custom_config(self):
        """Explicit constructor arguments are stored on the config unchanged."""
        custom_config = SinqConfig(
            nbits=8,
            group_size=128,
            tiling_mode="2D",
            method="sinq",
        )

        self.assertEqual(custom_config.nbits, 8)
        self.assertEqual(custom_config.group_size, 128)
        self.assertEqual(custom_config.tiling_mode, "2D")
        self.assertEqual(custom_config.method, "sinq")

    def test_modules_to_not_convert(self):
        """The `modules_to_not_convert` list is stored on the config as given."""
        skipped_modules = ["layer1", "layer2.weight"]
        config = SinqConfig(modules_to_not_convert=skipped_modules)

        self.assertEqual(config.modules_to_not_convert, skipped_modules)

    def test_to_dict(self):
        """Every entry of `to_dict()` equals the attribute of the same name on the config."""
        quantization_config = SinqConfig()

        for key, dict_value in quantization_config.to_dict().items():
            self.assertEqual(getattr(quantization_config, key), dict_value)

    def test_from_dict(self):
        """`from_dict` builds a config carrying the values from the dictionary."""
        source_dict = {
            "nbits": 8,
            "group_size": 128,
            "method": "sinq",
        }
        restored_config = SinqConfig.from_dict(source_dict)

        self.assertEqual(restored_config.nbits, 8)
        self.assertEqual(restored_config.group_size, 128)
        self.assertEqual(restored_config.method, "sinq")

    def test_method_validation(self):
        """An unknown `method` value is rejected with a `ValueError`."""
        with self.assertRaises(ValueError):
            SinqConfig(method="invalid_method")
@slow
@require_torch_gpu
class SinqTest(unittest.TestCase):
    """Integration tests for SINQ quantization on a real checkpoint."""

    model_name = "Qwen/Qwen3-0.6B"
    input_text = "What is the capital of France?"
    max_new_tokens = 10
    device_map = torch_device

    # Decoded generations accepted by `test_quantized_model`.
    EXPECTED_OUTPUTS = {
        "What is the capital of France? Paris.",
        "What is the capital of France? The capital of France is Paris.",
        "What is the capital of France? The capital of France is Paris. The statement is",
        "What is the capital of France? Paris is the capital and most populous city of France.",
    }

    @classmethod
    def setUpClass(cls):
        """Load the tokenizer and quantize the model once; both are shared by all tests of the class."""
        cls.quantization_config = SinqConfig(
            nbits=4,
            group_size=64,
            method="sinq",
            modules_to_not_convert=["lm_head"],
        )

        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name,
            torch_dtype=torch.bfloat16,
            quantization_config=cls.quantization_config,
        )

    def tearDown(self):
        """Collect garbage, empty the backend cache, then collect garbage again after each test."""
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_quantizer_validation_no_cuda(self):
        """With `torch.cuda.is_available` patched to False, validation logs a "No CUDA is available" warning."""
        from transformers.quantizers.quantizer_sinq import SinqHfQuantizer

        quantizer = SinqHfQuantizer(quantization_config=SinqConfig())

        with patch("torch.cuda.is_available", return_value=False), self.assertLogs(
            "transformers", level="WARNING"
        ) as captured_logs:
            quantizer.validate_environment()

        found_warning = any("No CUDA is available" in record for record in captured_logs.output)
        self.assertTrue(found_warning)

    def test_asinq_not_supported(self):
        """Validation raises a `ValueError` for `method="asinq"` when the model is not pre-quantized."""
        from transformers.quantizers.quantizer_sinq import SinqHfQuantizer

        quantizer = SinqHfQuantizer(quantization_config=SinqConfig(method="asinq"))
        quantizer.pre_quantized = False

        with self.assertRaises(ValueError):
            quantizer.validate_environment()

    def test_quantized_model_conversion(self):
        """At least one module is a `SINQLinear`, and `lm_head` is not one."""
        from sinq.sinqlinear_hf import SINQLinear

        nb_sinq_linear = sum(1 for module in self.quantized_model.modules() if isinstance(module, SINQLinear))

        self.assertGreater(nb_sinq_linear, 0)
        self.assertNotIsInstance(self.quantized_model.lm_head, SINQLinear)

    def test_quantized_model(self):
        """The quantized model generates without sampling and the decoded text is an expected output."""
        model_inputs = self.tokenizer(self.input_text, return_tensors="pt").to(self.device_map)
        generated = self.quantized_model.generate(
            **model_inputs,
            max_new_tokens=self.max_new_tokens,
            do_sample=False,
        )
        decoded = self.tokenizer.decode(generated[0], skip_special_tokens=True)

        self.assertIsNotNone(decoded)
        # The decoded text must be longer than the prompt string.
        self.assertGreater(len(decoded), len(self.input_text))
        self.assertIn(decoded, self.EXPECTED_OUTPUTS)

    def test_save_pretrained(self):
        """The quantized model can be saved, loaded back, and used to generate text longer than the prompt."""
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)

            reloaded_model = AutoModelForCausalLM.from_pretrained(
                tmpdirname,
                device_map=self.device_map,
            )

            model_inputs = self.tokenizer(self.input_text, return_tensors="pt").to(self.device_map)
            generated = reloaded_model.generate(
                **model_inputs,
                max_new_tokens=self.max_new_tokens,
                do_sample=False,
            )
            decoded = self.tokenizer.decode(generated[0], skip_special_tokens=True)

            self.assertIsNotNone(decoded)
            self.assertGreater(len(decoded), len(self.input_text))

            # Drop the local reference to the reloaded model before the test returns.
            del reloaded_model

View File

@@ -0,0 +1,239 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest
import pytest
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, SpQRConfig, StaticCache
from transformers.testing_utils import (
backend_empty_cache,
require_accelerate,
require_spqr,
require_torch_gpu,
require_torch_multi_gpu,
slow,
torch_device,
)
from transformers.utils import is_torch_available
if is_torch_available():
import torch
@require_torch_gpu
class SpQRConfigTest(unittest.TestCase):
    """Round-trip tests for `SpQRConfig` <-> plain dict conversion."""

    def test_to_dict(self):
        """
        Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
        """
        quantization_config = SpQRConfig()
        config_to_dict = quantization_config.to_dict()

        for key, value in config_to_dict.items():
            self.assertEqual(getattr(quantization_config, key), value)

    def test_from_dict(self):
        """
        Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict
        """
        # Named `config_dict` rather than `dict` so the builtin is not shadowed.
        config_dict = {
            "beta1": 16,
            "beta2": 16,
            "bits": 3,
            "modules_to_not_convert": ["lm_head.weight"],
            "shapes": {"model.layers.0.self_attn.q_proj.dense_weights.shape": 16},
        }
        quantization_config = SpQRConfig.from_dict(config_dict)

        self.assertEqual(config_dict["beta1"], quantization_config.beta1)
        self.assertEqual(config_dict["beta2"], quantization_config.beta2)
        self.assertEqual(config_dict["bits"], quantization_config.bits)
        self.assertEqual(config_dict["modules_to_not_convert"], quantization_config.modules_to_not_convert)
        self.assertEqual(config_dict["shapes"], quantization_config.shapes)
@slow
@require_torch_gpu
@require_spqr
@require_accelerate
class SpQRTest(unittest.TestCase):
    """Integration tests for a pre-quantized SpQR checkpoint (`model_name`).

    The tokenizer and quantized model are loaded once in `setUpClass` and shared by the tests.
    """

    model_name = "elvircrn/Llama-2-7b-SPQR-3Bit-16x16-red_pajama-hf"

    input_text = "Hello my name is"
    max_new_tokens = 32

    EXPECTED_OUTPUT = (
        "Hello my name is Jesse. (I'm also known as Jesse) I'm a 25 year old male from United States. I'm looking for"
    )
    EXPECTED_OUTPUT_COMPILE = "Hello my name is Jake and I am a 20 year old student at the University of North Texas. (Go Mean Green!) I am a huge fan of the Dallas"

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        """
        Setup quantized model
        """
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name,
            device_map=torch_device,
        )

    def tearDown(self):
        """Release cached accelerator memory between tests."""
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_quantized_model_conversion(self):
        """
        Simple test that checks if the quantized model has been converted properly
        """
        from spqr_quant import QuantizedLinear

        from transformers.integrations import replace_with_spqr_linear

        model_id = "meta-llama/Llama-2-7b-hf"
        config = AutoConfig.from_pretrained(model_id)
        quantization_config = AutoConfig.from_pretrained(self.model_name, return_dict=False).quantization_config
        quantization_config = SpQRConfig.from_dict(quantization_config)

        with torch.device("meta"):
            model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id, config=config)

        nb_linears = sum(isinstance(module, torch.nn.Linear) for module in model.modules())

        model, _ = replace_with_spqr_linear(
            model,
            quantization_config=quantization_config,
            modules_to_not_convert=quantization_config.modules_to_not_convert,
        )
        nb_spqr_linear = sum(isinstance(module, QuantizedLinear) for module in model.modules())

        # Exactly one linear layer is expected to stay unconverted.
        self.assertEqual(nb_linears - 1, nb_spqr_linear)

    def test_quantized_model(self):
        """
        Simple test that checks if the quantized model is working properly
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def test_raise_if_non_quantized(self):
        """Loading a non-quantized checkpoint with an `SpQRConfig` must raise `ValueError`."""
        model_id = "meta-llama/Llama-2-7b-hf"
        quantization_config = SpQRConfig()

        with self.assertRaises(ValueError):
            _ = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)

    @unittest.skip
    def test_save_pretrained(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)
            model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=torch_device)

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

            output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
            self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @require_torch_multi_gpu
    def test_quantized_model_multi_gpu(self):
        """
        Simple test that checks if the quantized model is working properly with multiple GPUs
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto")

        # assertEqual (rather than assertTrue on `==`) reports the actual device set on failure.
        self.assertEqual(set(quantized_model.hf_device_map.values()), {0, 1})

        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)

        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @pytest.mark.torch_compile_test
    def test_quantized_model_compile(self):
        """
        Simple test that checks if the quantized model is working properly
        when the single-token decode step is wrapped in `torch.compile`
        """

        # Sample tokens greedily
        def decode_one_tokens(model, cur_token, input_pos, past_key_values):
            logits = model(
                cur_token,
                position_ids=input_pos,
                past_key_values=past_key_values,
                return_dict=False,
                use_cache=True,
            )[0]
            new_token = torch.argmax(logits[:, [-1]], dim=-1).to(torch.int)

            return new_token

        # Tokenize the test input
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)["input_ids"]
        seq_length = input_ids.shape[1]

        # Setup static KV cache for generation
        past_key_values = StaticCache(
            config=self.quantized_model.config, max_cache_len=seq_length + self.max_new_tokens + 1
        )

        # Allocate token ids to be generated and copy prefix ids
        position = torch.arange(seq_length, device=torch_device)
        generated_ids = torch.zeros(1, seq_length + self.max_new_tokens, dtype=torch.int, device=torch_device)
        generated_ids[:, position] = input_ids.to(torch_device).to(torch.int)

        # Do a forward pass to fill the prefix cache and compile the kernels if necessary
        logits = self.quantized_model(
            input_ids,
            past_key_values=past_key_values,
            return_dict=False,
            use_cache=True,
        )[0]
        next_token = torch.argmax(logits[:, [-1]], dim=-1).to(torch.int)
        # The first generated token goes right after the prompt.
        generated_ids[:, [seq_length]] = next_token

        with torch.no_grad():
            # Compile the CUDA graph
            decode_one_tokens = torch.compile(decode_one_tokens, mode="default", backend="inductor", fullgraph=True)

            # Generate tokens one by one
            position = torch.tensor([seq_length + 1], device=torch_device)
            for _ in range(1, self.max_new_tokens):
                with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True):
                    next_token = decode_one_tokens(self.quantized_model, next_token.clone(), None, past_key_values)
                    generated_ids.index_copy_(1, position, next_token)
                position += 1

        # Check generated text
        self.assertEqual(
            self.tokenizer.decode(generated_ids[0], skip_special_tokens=True), self.EXPECTED_OUTPUT_COMPILE
        )

View File

@@ -0,0 +1,662 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest
from parameterized import parameterized
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
from transformers.testing_utils import (
Expectations,
backend_empty_cache,
require_cuda_capability_at_least,
require_torch_accelerator,
require_torch_multi_accelerator,
require_torchao,
slow,
torch_device,
)
from transformers.utils import is_torch_available, is_torchao_available
if is_torch_available():
import torch
if is_torchao_available():
from torchao.dtypes import (
AffineQuantizedTensor,
)
from torchao.prototype.mx_formats import NVFP4DynamicActivationNVFP4WeightConfig
from torchao.quantization import (
Float8DynamicActivationFloat8WeightConfig,
Float8Tensor,
Float8WeightOnlyConfig,
FqnToConfig,
Int4WeightOnlyConfig,
Int8DynamicActivationInt8WeightConfig,
Int8DynamicActivationIntxWeightConfig,
Int8WeightOnlyConfig,
IntxWeightOnlyConfig,
MappingType,
PerAxis,
)
@require_torchao
class TorchAoConfigTest(unittest.TestCase):
    """Tests for `TorchAoConfig` construction and serialization; no model is loaded."""

    def test_to_dict(self):
        """
        Makes sure the config format is properly set
        """
        quantization_config = TorchAoConfig(Int4WeightOnlyConfig(group_size=32))
        torchao_orig_config = quantization_config.to_dict()

        self.assertIn("quant_type", torchao_orig_config)
        self.assertIn("quant_method", torchao_orig_config)
        self.assertEqual(torchao_orig_config["quant_method"], "torchao")

    def test_repr(self):
        """
        Check that there is no error in the repr
        """
        config = Int4WeightOnlyConfig(group_size=8)
        quantization_config = TorchAoConfig(config, modules_to_not_convert=["conv"])
        repr(quantization_config)

    def test_json_serializable(self):
        """
        Check that the config dict can be JSON serialized.
        """
        config = Int4WeightOnlyConfig(group_size=32)
        quantization_config = TorchAoConfig(config)
        d = quantization_config.to_dict()
        # assertIn reports the container's contents on failure, unlike assertTrue(x in y).
        self.assertIn("group_size", d["quant_type"]["default"]["_data"])
        quantization_config.to_json_string(use_diff=False)
@require_torchao
@slow
class TorchAoTestBase:
    """Base mixin with all torchao test methods. Not a TestCase — subclass with unittest.TestCase to run.

    Subclasses must set ``device``. Most tests load ``model_name`` onto that device with a torchao
    quantization config, inspect the tensor type of selected weights, and compare a 10-token
    generation for ``input_text`` against a pinned string.
    """

    input_text = "What are we having for dinner?"
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    device = None  # must be set by subclass

    # Completion pinned by every test except the int4 one.
    _COMMON_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"

    def tearDown(self):
        """Release cached accelerator memory between tests."""
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def _load_quantized_model(self, quant_type, model_name=None, **config_kwargs):
        """Load `model_name` (default: `self.model_name`) in bfloat16 on `self.device`, quantized with `quant_type`.

        Extra keyword arguments are forwarded to `TorchAoConfig`.
        """
        quant_config = TorchAoConfig(quant_type=quant_type, **config_kwargs)
        return AutoModelForCausalLM.from_pretrained(
            model_name or self.model_name,
            dtype=torch.bfloat16,
            device_map=self.device,
            quantization_config=quant_config,
        )

    def _assert_generation(self, quantized_model, expected_output):
        """Generate 10 new tokens for `input_text` and compare the decoded text with `expected_output`."""
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
        output = quantized_model.generate(**input_ids, max_new_tokens=10)
        self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), expected_output)

    def _assert_fqn_weights(self, quantized_model, expectations):
        """Check the weight tensor type of attention projections.

        `expectations` is an iterable of `(layer_idx, proj_name, tensor_cls, should_match)` tuples.
        """
        for layer_idx, proj_name, tensor_cls, should_match in expectations:
            weight = getattr(quantized_model.model.layers[layer_idx].self_attn, proj_name).weight
            if should_match:
                self.assertIsInstance(weight, tensor_cls)
            else:
                self.assertNotIsInstance(weight, tensor_cls)

    def test_int4wo_quant(self):
        """
        Simple LLM model testing int4 weight only quantization
        """
        int4_packing_format = "plain_int32" if self.device == "xpu" else "tile_packed_to_4d"
        quantized_model = self._load_quantized_model(Int4WeightOnlyConfig(int4_packing_format=int4_packing_format))

        self.assertIn("Int4", type(quantized_model.model.layers[0].self_attn.v_proj.weight).__name__)

        # fmt: off
        EXPECTED_OUTPUT = Expectations(
            {
                ("cuda", None): "What are we having for dinner?\nRed, white, and green beans,",
                ("xpu", None): "What are we having for dinner?\n\nJessica: (smiling)",
                ("xpu", 5): "What are we having for dinner?\n\n[Scene 2]\n\n[",
            }
        )
        # fmt: on
        self._assert_generation(quantized_model, EXPECTED_OUTPUT.get_expectation())

    def test_int8_dynamic_activation_int8_weight_quant(self):
        """
        Simple LLM model testing int8_dynamic_activation_int8_weight
        """
        quantized_model = self._load_quantized_model(Int8DynamicActivationInt8WeightConfig())
        self._assert_generation(quantized_model, self._COMMON_OUTPUT)

    def test_include_input_output_embeddings(self):
        """Quantize only the input embedding and `lm_head` and check that both weights were replaced."""
        embedding_config = IntxWeightOnlyConfig(
            weight_dtype=torch.int8,
            granularity=PerAxis(0),
            mapping_type=MappingType.ASYMMETRIC,
        )
        config = FqnToConfig({"_default": None, "model.embed_tokens": embedding_config, "lm_head": embedding_config})
        # need set `include_input_output_embeddings` to True
        quantized_model = self._load_quantized_model(config, include_input_output_embeddings=True)

        # making sure embedding is quantized
        self.assertNotEqual(type(quantized_model.model.embed_tokens.weight).__name__, "Parameter")
        self.assertNotEqual(type(quantized_model.lm_head.weight).__name__, "Parameter")
        self._assert_generation(quantized_model, self._COMMON_OUTPUT)

    def test_per_module_config_skip(self):
        """A module mapped to `None` by its exact fqn is left unquantized."""
        config = FqnToConfig({"_default": Int8WeightOnlyConfig(), "model.layers.0.self_attn.q_proj": None})
        quantized_model = self._load_quantized_model(config)

        # making sure `model.layers.0.self_attn.q_proj` is skipped
        self._assert_fqn_weights(quantized_model, [(0, "q_proj", AffineQuantizedTensor, False)])
        self._assert_generation(quantized_model, self._COMMON_OUTPUT)

    def test_fqn_to_config_regex_basic(self):
        """A module mapped to `None` through a `re:` pattern is left unquantized."""
        config = FqnToConfig(
            {"_default": Int8WeightOnlyConfig(), r"re:model\.layers\..+\.self_attn\.q_proj": None}
        )
        quantized_model = self._load_quantized_model(config)

        # making sure `model.layers.0.self_attn.q_proj` is skipped
        self._assert_fqn_weights(quantized_model, [(0, "q_proj", AffineQuantizedTensor, False)])
        self._assert_generation(quantized_model, self._COMMON_OUTPUT)

    def test_fqn_to_config_regex_fullmatch(self):
        """Testing that we will only match the fqns that fully
        matches the regex
        """
        linear1_config = Int8WeightOnlyConfig()
        linear2_config = Float8WeightOnlyConfig()
        # intentionally removing `j` after `q_proj` so it's not a full match.
        # The layer index is matched with `\..+\.` (as in the sibling regex tests): the previous `\.+\.`
        # only matched runs of literal dots, so the pattern could never match any layer and the test
        # did not exercise full-match semantics at all.
        config = FqnToConfig(
            {
                r"re:model\.layers\..+\.self_attn\.q_pro": linear1_config,
                "model.layers.3.self_attn.q_proj": linear2_config,
            }
        )
        quantized_model = self._load_quantized_model(config)

        self._assert_fqn_weights(
            quantized_model,
            [
                # highest precedence is fully specified module fqn
                (3, "q_proj", Float8Tensor, True),
                # because regex `model\.layers\..+\.self_attn\.q_pro` didn't fully match
                # `model.layers.1.self_attn.q_proj` (missing last `j`),
                # this layer is not expected to be quantized to int8
                (1, "q_proj", AffineQuantizedTensor, False),
            ],
        )
        self._assert_generation(quantized_model, self._COMMON_OUTPUT)

    def test_fqn_to_config_module_regex_precedence(self):
        """Precedence for module keys: exact fqn, then regex, then `_default`."""
        linear1_config = Int8WeightOnlyConfig()
        linear2_config = Float8WeightOnlyConfig()
        config = FqnToConfig(
            {
                r"re:model\.layers\..+\.self_attn\.q_proj": None,
                "model.layers.3.self_attn.q_proj": linear2_config,
                "_default": linear1_config,
            }
        )
        quantized_model = self._load_quantized_model(config)

        self._assert_fqn_weights(
            quantized_model,
            [
                # highest precedence is fully specified module fqn
                (3, "q_proj", Float8Tensor, True),
                # second precedence: regex
                (1, "q_proj", AffineQuantizedTensor, False),
                # last precedence: _default
                (1, "k_proj", AffineQuantizedTensor, True),
            ],
        )
        self._assert_generation(quantized_model, self._COMMON_OUTPUT)

    def test_fqn_to_config_regex_precedence(self):
        """Same precedence check as the module variant, with keys that name the `.weight` parameter."""
        linear1_config = Int8WeightOnlyConfig()
        linear2_config = Float8WeightOnlyConfig()
        config = FqnToConfig(
            {
                r"re:model\.layers\..+\.self_attn\.q_proj.weight": None,
                "model.layers.3.self_attn.q_proj.weight": linear2_config,
                "_default": linear1_config,
            }
        )
        quantized_model = self._load_quantized_model(config)

        self._assert_fqn_weights(
            quantized_model,
            [
                (3, "q_proj", Float8Tensor, True),
                (1, "q_proj", AffineQuantizedTensor, False),
                (1, "k_proj", AffineQuantizedTensor, True),
            ],
        )
        self._assert_generation(quantized_model, self._COMMON_OUTPUT)

    def test_fqn_to_config_param_over_module_regex_precedence(self):
        """A parameter-level regex mapped to `None` wins over a module-level regex with a config."""
        linear1_config = Int8WeightOnlyConfig()
        linear2_config = Float8WeightOnlyConfig()
        config = FqnToConfig(
            {
                r"re:model\.layers\..+\.self_attn\.q_proj.weight": None,
                r"re:model\.layers\..+\.self_attn\.q_proj": linear2_config,
                "_default": linear1_config,
            }
        )
        quantized_model = self._load_quantized_model(config)

        self._assert_fqn_weights(
            quantized_model,
            [
                (1, "q_proj", AffineQuantizedTensor, False),
                (1, "k_proj", AffineQuantizedTensor, True),
            ],
        )
        self._assert_generation(quantized_model, self._COMMON_OUTPUT)

    def test_fqn_to_config_param_over_module_precedence(self):
        """An exact parameter fqn mapped to `None` wins over the exact module fqn with a config."""
        linear1_config = Int8WeightOnlyConfig()
        linear2_config = Float8WeightOnlyConfig()
        config = FqnToConfig(
            {
                "model.layers.3.self_attn.q_proj.weight": None,
                "model.layers.3.self_attn.q_proj": linear2_config,
                "_default": linear1_config,
            }
        )
        quantized_model = self._load_quantized_model(config)

        self._assert_fqn_weights(
            quantized_model,
            [
                (3, "q_proj", AffineQuantizedTensor, False),
                (3, "k_proj", AffineQuantizedTensor, True),
            ],
        )
        self._assert_generation(quantized_model, self._COMMON_OUTPUT)

    def test_fqn_to_config_exact_over_regex_precedence(self):
        """Exact fqn keys (parameter or module) win over a parameter-level regex."""
        linear1_config = Int8WeightOnlyConfig()
        linear2_config = Float8WeightOnlyConfig()
        config = FqnToConfig(
            {
                "model.layers.3.self_attn.q_proj.weight": None,
                "model.layers.1.self_attn.q_proj": linear1_config,
                r"re:model\.layers\..+\.self_attn\.q_proj.weight": linear2_config,
            }
        )
        quantized_model = self._load_quantized_model(config)

        self._assert_fqn_weights(
            quantized_model,
            [
                (3, "q_proj", AffineQuantizedTensor, False),
                (1, "q_proj", AffineQuantizedTensor, True),
                (2, "q_proj", Float8Tensor, True),
            ],
        )
        self._assert_generation(quantized_model, self._COMMON_OUTPUT)

    @require_cuda_capability_at_least(8, 9)
    def test_fqn_to_config_non_weight_param(self):
        """Fqn keys can also target parameters that are not named `weight` (here `experts.gate_up_proj`)."""
        linear1_config = Int8WeightOnlyConfig()
        linear2_config = Float8WeightOnlyConfig()
        config = FqnToConfig(
            {
                r"re:.*gate_up_proj": linear2_config,
                "model.layers.0.feed_forward.experts.gate_up_proj": None,
                "_default": linear1_config,
            }
        )
        quantized_model = self._load_quantized_model(
            config, model_name="jcaip/Llama-4-Scout-17B-two-layers-only-testing"
        )

        layers = quantized_model.model.layers
        self.assertIsInstance(layers[1].feed_forward.experts.gate_up_proj, Float8Tensor)
        self.assertNotIsInstance(layers[0].feed_forward.experts.gate_up_proj, Float8Tensor)
        self.assertIsInstance(layers[1].self_attn.q_proj.weight, AffineQuantizedTensor)

    def test_compute_module_sizes(self):
        r"""
        Test if we compute the right module sizes needed to generate the device map.
        Also test if we get the right values for `total_byte_count` in `caching_allocator_warmup`.
        """
        from transformers import AutoConfig
        from transformers.integrations.accelerate import compute_module_sizes
        from transformers.modeling_utils import expand_device_map, get_total_byte_count
        from transformers.quantizers import AutoHfQuantizer

        # we need to preprocess the model like that because device_map calculation happens before we load the weights inside the model.
        # For normal weights, it's fine but for quantized weights, the tensors dtype might change during loading.
        with torch.device("meta"):
            config = AutoConfig.from_pretrained(self.model_name)
            model = AutoModelForCausalLM.from_config(config, dtype=torch.bfloat16)
            model_size, _ = compute_module_sizes(model, only_modules=False)

            expected_keys = [name for name, _ in model.named_parameters()] + [
                name for name, _ in model.named_buffers()
            ]
            expanded_device_map = expand_device_map({"": torch_device}, expected_keys)
            total_byte_count = list(get_total_byte_count(model, expanded_device_map).values())[0]

            # testing prequantized = False should be enough, the shape should be the same whether it is pre-quantized or not
            hf_quantizer = AutoHfQuantizer.from_config(
                TorchAoConfig(quant_type=Int4WeightOnlyConfig()), pre_quantized=False
            )
            hf_quantizer.preprocess_model(model=model, config=model.config)
            quantized_model_size, _ = compute_module_sizes(model, hf_quantizer, only_modules=False)

            expected_keys = [name for name, _ in model.named_parameters()] + [
                name for name, _ in model.named_buffers()
            ]
            expanded_device_map = expand_device_map({"": torch_device}, expected_keys)
            quantized_total_byte_count = list(
                get_total_byte_count(model, expanded_device_map, hf_quantizer).values()
            )[0]

        for name, module in model.named_modules():
            # modules are not replaced when using torchao
            if isinstance(module, torch.nn.Linear) and "lm_head" not in name:
                # from 16 bits to 4 bits
                self.assertEqual(
                    int(model_size[f"{name}.weight"] // 4), int(quantized_model_size[f"{name}.weight"])
                )

        # check that we get the same value, as we use `compute_module_sizes` in `get_total_byte_count`
        self.assertEqual(total_byte_count, model_size[""])
        self.assertEqual(quantized_total_byte_count, quantized_model_size[""])

        # we should at least have 1.5 times memory reduction in total
        # (the threshold previously coded here was 2, contradicting this documented bound)
        self.assertGreater(model_size[""], quantized_model_size[""] * 1.5)
class TorchAoCPUTest(TorchAoTestBase, unittest.TestCase):
    """Runs the shared torchao test methods with `device` set to ``"cpu"``."""

    device = "cpu"

    @unittest.skip("Int4 does not support CPU")
    def test_int4wo_quant(self):
        """Overrides the inherited int4 test so it is reported as skipped on CPU."""
@require_torch_accelerator
class TorchAoAcceleratorTest(TorchAoTestBase, unittest.TestCase):
    """Runs the shared torchao test methods on `torch_device`, plus offload and multi-accelerator int4 tests."""

    device = torch_device

    def _int4_quantization_config(self):
        """Build the int4 weight-only `TorchAoConfig` with the packing format used for `self.device`."""
        int4_packing_format = "plain_int32" if self.device == "xpu" else "tile_packed_to_4d"
        return TorchAoConfig(Int4WeightOnlyConfig(int4_packing_format=int4_packing_format))

    def _assert_int4_generation(self, quantized_model, tokenizer):
        """Generate 10 new tokens for `input_text` and compare with the per-device int4 expectation."""
        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
        output = quantized_model.generate(**input_ids, max_new_tokens=10)
        # fmt: off
        EXPECTED_OUTPUT = Expectations(
            {
                ("cuda", None): "What are we having for dinner?\nRed, white, and green beans,",
                ("xpu", None): "What are we having for dinner?\n\nJessica: (smiling)",
                ("xpu", 5): "What are we having for dinner?\n\n[Scene 2]\n\n[",
            }
        )
        # fmt: on
        self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT.get_expectation())

    def test_int4wo_offload(self):
        """
        Test Int4 weight-only quantization with CPU offload.
        """
        # Decoder layers 0-18 go to accelerator 0 and layers 19-21 are offloaded to CPU;
        # every other listed module stays on accelerator 0.
        device_map_offload = {"model.embed_tokens": 0}
        device_map_offload.update({f"model.layers.{idx}": 0 if idx < 19 else "cpu" for idx in range(22)})
        device_map_offload.update({"model.norm": 0, "model.rotary_emb": 0, "lm_head": 0})

        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            dtype=torch.bfloat16,
            device_map=device_map_offload,
            quantization_config=self._int4_quantization_config(),
        )
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        self._assert_int4_generation(quantized_model, tokenizer)

    @require_torch_multi_accelerator
    def test_int4wo_quant_multi_accelerator(self):
        """
        Simple test that checks if the quantized model int4 weight only is working properly with multiple accelerators
        set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 CUDA GPUs
        set ZE_AFFINITY_MASK=0,1 if you have more than 2 Intel XPUs
        """
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            dtype=torch.bfloat16,
            device_map="auto",
            quantization_config=self._int4_quantization_config(),
        )
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # assertEqual (rather than assertTrue on `==`) reports the actual device set on failure.
        self.assertEqual(set(quantized_model.hf_device_map.values()), {0, 1})

        self._assert_int4_generation(quantized_model, tokenizer)
@slow
@require_torchao
class TorchAoSerializationTest(unittest.TestCase):
    """Parameterized serialization tests: quantize, save, reload, check output.

    Every entry of ``test_params`` is a ``(name, torchao config, Expectations)`` triple.
    For each one the model is quantized while loading, asked to continue ``input_text``,
    saved to a temporary directory, reloaded from there and asked again; both
    generations must equal the expected string for the device under test.
    """

    input_text = "What are we having for dinner?"
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    # Generation budget used for every output comparison in this class.
    max_new_tokens = 10

    # fmt: off
    COMMON_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
    ALL_DEVICES_COMMON = Expectations({("cpu", None): COMMON_OUTPUT, ("cuda", None): COMMON_OUTPUT, ("xpu", None): COMMON_OUTPUT})
    # The config classes are only referenced when torchao is available; otherwise the
    # parameter list is empty and `skip_on_empty=True` below takes care of the rest.
    test_params = (
        [
            ("Int8WeightOnlyConfig", Int8WeightOnlyConfig(version=2), ALL_DEVICES_COMMON),
            ("Int8DynamicActivationInt8WeightConfig", Int8DynamicActivationInt8WeightConfig(version=2), ALL_DEVICES_COMMON),
            ("Float8DynamicActivationFloat8WeightConfig", Float8DynamicActivationFloat8WeightConfig(), Expectations({("cuda", None): COMMON_OUTPUT, ("xpu", None): "What are we having for dinner?\n\nJess: (smiling) I", ("xpu", 5): COMMON_OUTPUT})),
            ("Float8WeightOnlyConfig", Float8WeightOnlyConfig(), Expectations({("cuda", None): COMMON_OUTPUT, ("xpu", None): COMMON_OUTPUT})),
            ("Int4WeightOnlyConfig", Int4WeightOnlyConfig(int4_packing_format="plain_int32" if torch_device == "xpu" else "tile_packed_to_4d"), Expectations({("cuda", None): "What are we having for dinner?\nRed, white, and green beans,", ("xpu", None): COMMON_OUTPUT, ("xpu", 5): "What are we having for dinner?\n\n[Scene 2]\n\n["})),
            ("Int8DynamicActivationIntxWeightConfig", Int8DynamicActivationIntxWeightConfig(), Expectations({("cpu", None): COMMON_OUTPUT, ("cuda", 9): COMMON_OUTPUT, ("cuda", 8): "What are we having for dinner?\n\nJEN: (smiling) I", ("xpu", None): COMMON_OUTPUT})),
            ("IntxWeightOnlyConfig", IntxWeightOnlyConfig(), ALL_DEVICES_COMMON),
            ("NVFP4DynamicActivationNVFP4WeightConfig", NVFP4DynamicActivationNVFP4WeightConfig(), Expectations({("cuda", None): "What are we having for dinner?\n\n10. Avoid using \"I"})),
        ]
        if is_torchao_available()
        else []
    )
    # fmt: on

    def tearDown(self):
        """Collect garbage and empty the backend cache after every test."""
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def _skip_if_cuda_capability_too_low(self, config):
        """Skip the running test when `config` needs a newer CUDA capability than the current GPU has.

        The check only applies when CUDA is available; on any other setup nothing is skipped here.
        """
        # (config types, minimum capability, suffix appended to the skip message)
        requirements = (
            ((Float8DynamicActivationFloat8WeightConfig, Float8WeightOnlyConfig), (8, 9), ""),
            ((NVFP4DynamicActivationNVFP4WeightConfig,), (10, 0), " (SM100)"),
        )
        for config_types, minimum, suffix in requirements:
            if not isinstance(config, config_types):
                continue
            if torch.cuda.is_available() and torch.cuda.get_device_capability() < minimum:
                self.skipTest(f"{type(config).__name__} requires CUDA capability >= {minimum}{suffix}")

    def _assert_generation(self, model, tokenizer, device, expected_output):
        """Generate a continuation of `input_text` with `model` and compare it with `expected_output`."""
        inputs = tokenizer(self.input_text, return_tensors="pt").to(device)
        output = model.generate(**inputs, max_new_tokens=self.max_new_tokens)
        self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), expected_output)

    def _check_serialization(self, device, config, expected_output):
        """Quantize, generate, save, reload and generate again, asserting the output both times.

        Args:
            device: Value forwarded as `device_map` and used to place the tokenized inputs.
            config: torchao config object wrapped into a `TorchAoConfig`.
            expected_output: Decoded text that both generations must produce.
        """
        self._skip_if_cuda_capability_too_low(config)

        quant_config = TorchAoConfig(config)
        # These two config families are loaded in bfloat16; every other one keeps the "auto" dtype.
        needs_bfloat16 = isinstance(config, (Int4WeightOnlyConfig, NVFP4DynamicActivationNVFP4WeightConfig))
        dtype = torch.bfloat16 if needs_bfloat16 else "auto"

        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            dtype=dtype,
            device_map=device,
            quantization_config=quant_config,
        )
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self._assert_generation(quantized_model, tokenizer, device, expected_output)

        with tempfile.TemporaryDirectory() as tmpdirname:
            quantized_model.save_pretrained(tmpdirname)
            # The original model is no longer needed once saved; drop the reference so
            # two copies are not held alive while the checkpoint is reloaded.
            del quantized_model
            loaded_model = AutoModelForCausalLM.from_pretrained(tmpdirname, dtype=dtype, device_map=device)
            self._assert_generation(loaded_model, tokenizer, device, expected_output)

    @parameterized.expand(test_params, skip_on_empty=True)
    def test_serialization_cpu(self, _name, config, expected_outputs):
        """Run the serialization round trip on CPU, skipping configs without a CPU expectation."""
        try:
            expected = expected_outputs.find_expectation(("cpu", None, None))
        except ValueError:
            # The lookup failing with ValueError is treated as "this config has no CPU expectation".
            self.skipTest(f"{type(config).__name__} does not support CPU")
        self._check_serialization("cpu", config, expected)

    @parameterized.expand(test_params, skip_on_empty=True)
    @require_torch_accelerator
    def test_serialization_accelerator(self, _name, config, expected_outputs):
        """Run the serialization round trip on `torch_device`, skipping configs without a matching expectation."""
        try:
            expected = expected_outputs.get_expectation()
        except ValueError:
            # The lookup failing with ValueError is treated as "no expectation for the current device".
            self.skipTest(f"{type(config).__name__} does not support {torch_device}")
        self._check_serialization(torch_device, config, expected)
# Script entry point: when this module is executed directly (rather than imported),
# hand control to unittest's command-line runner.
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,191 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, VptqConfig
from transformers.testing_utils import (
backend_empty_cache,
require_accelerate,
require_torch_gpu,
require_torch_multi_gpu,
require_vptq,
slow,
torch_device,
)
from transformers.utils import is_torch_available
if is_torch_available():
import torch
class VptqConfigTest(unittest.TestCase):
    """Checks on the dictionary form of ``VptqConfig``."""

    def test_to_dict(self):
        """
        Verify that the dict produced by `to_dict` reports the same `quant_method` as the config object
        """
        config = VptqConfig()
        serialized = config.to_dict()

        expected_method = config.quant_method
        self.assertEqual(serialized["quant_method"], expected_method)
@slow
@require_torch_gpu
@require_vptq
@require_accelerate
class VptqTest(unittest.TestCase):
    """Tests that load a VPTQ-quantized checkpoint and check generation, saving and layer conversion."""

    model_name = "VPTQ-community/Meta-Llama-3.1-8B-Instruct-v12-k65536-4096-woft"

    input_text = "Hello my name is"
    max_new_tokens = 32

    EXPECTED_OUTPUT = "Hello my name is Sarah and I am a 25 year old woman from the United States. I am a college graduate and I am currently working as a marketing specialist for a small"

    device_map = "cuda"

    # Called only once for all tests in this class.
    @classmethod
    def setUpClass(cls):
        """
        Setup quantized model
        """
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name,
            device_map=cls.device_map,
        )

    def tearDown(self):
        """Collect garbage and empty the backend cache after every test."""
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def _assert_expected_generation(self, model):
        """Generate greedily from `input_text` with `model` and compare against `EXPECTED_OUTPUT`."""
        inputs = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
        output = model.generate(**inputs, max_new_tokens=self.max_new_tokens, do_sample=False)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @staticmethod
    def _count_modules(model, module_type):
        """Return how many modules of `model` (as yielded by `model.modules()`) are instances of `module_type`."""
        return sum(isinstance(module, module_type) for module in model.modules())

    def test_quantized_model(self):
        """
        Simple test that checks if the quantized model is working properly
        """
        self._assert_expected_generation(self.quantized_model)

    def test_raise_if_non_quantized(self):
        """
        Checks that passing a `VptqConfig` when loading `facebook/opt-125m` raises a `ValueError`
        """
        model_id = "facebook/opt-125m"
        quantization_config = VptqConfig()

        with self.assertRaises(ValueError):
            _ = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)

    def test_save_pretrained(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)
            model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)
            self._assert_expected_generation(model)

    @require_torch_multi_gpu
    def test_quantized_model_multi_gpu(self):
        """
        Simple test that checks if the quantized model is working properly with multiple GPUs
        """
        quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto")

        # assertEqual (rather than assertTrue on a comparison) reports the actual device set on failure.
        self.assertEqual(set(quantized_model.hf_device_map.values()), {0, 1})

        self._assert_expected_generation(quantized_model)

    def test_quantized_model_conversion(self):
        """
        Simple test that checks if the quantized model has been converted properly
        """
        from vptq import VQuantLinear

        from transformers.integrations import replace_with_vptq_linear

        model_id = "facebook/opt-350m"
        config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5")

        # Number of `model.decoder.layers.{i}.fc1` modules excluded in the second half of the test.
        num_excluded_fc1 = 24
        modules_to_not_convert = ["lm_head"]
        modules_to_not_convert.extend(f"model.decoder.layers.{i}.fc1" for i in range(num_excluded_fc1))

        names = [
            "q_proj",
            "k_proj",
            "v_proj",
            "out_proj",
            "fc1",
            "fc2",
        ]
        value = {
            "enable_norm": True,
            "enable_perm": True,
            "group_num": 1,
            "group_size": 128,
            "indices_as_float": False,
            "num_centroids": [-1, 128],
            "num_res_centroids": [-1, 128],
            "outlier_size": 0,
            "vector_lens": [-1, 12],
        }
        # Every entry refers to the same `value` dict, exactly like a name-by-name assignment would.
        shared_layer_config = dict.fromkeys(names, value)
        layer_configs = {
            "model.decoder.project_out": value,
            "model.decoder.project_in": value,
        }

        quantization_config = VptqConfig(config_for_layers=layer_configs, shared_layer_config=shared_layer_config)

        with torch.device("meta"):
            model = AutoModelForCausalLM.from_config(config)

        nb_linears = self._count_modules(model, torch.nn.Linear)

        model, _ = replace_with_vptq_linear(model, quantization_config=quantization_config)
        nb_vptq_linear = self._count_modules(model, VQuantLinear)

        # Without an explicit exclusion list exactly one linear layer is expected to stay
        # unconverted (presumably `lm_head` -- the test only pins the count).
        self.assertEqual(nb_linears - 1, nb_vptq_linear)

        # Try with `modules_to_not_convert`
        with torch.device("meta"):
            model = AutoModelForCausalLM.from_config(config)
        quantization_config = VptqConfig(config_for_layers=layer_configs, shared_layer_config=shared_layer_config)
        model, _ = replace_with_vptq_linear(
            model, quantization_config=quantization_config, modules_to_not_convert=modules_to_not_convert
        )
        nb_vptq_linear = self._count_modules(model, VQuantLinear)

        # 25 comes from 24 decoder.layers.{layer_idx}.fc1
        # and the last lm_head
        self.assertEqual(nb_linears - (num_excluded_fc1 + 1), nb_vptq_linear)