first commit
Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled
Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled
This commit is contained in:
662
tests/quantization/torchao_integration/test_torchao.py
Normal file
662
tests/quantization/torchao_integration/test_torchao.py
Normal file
@@ -0,0 +1,662 @@
|
||||
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import gc
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
|
||||
from transformers.testing_utils import (
|
||||
Expectations,
|
||||
backend_empty_cache,
|
||||
require_cuda_capability_at_least,
|
||||
require_torch_accelerator,
|
||||
require_torch_multi_accelerator,
|
||||
require_torchao,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.utils import is_torch_available, is_torchao_available
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_torchao_available():
|
||||
from torchao.dtypes import (
|
||||
AffineQuantizedTensor,
|
||||
)
|
||||
from torchao.prototype.mx_formats import NVFP4DynamicActivationNVFP4WeightConfig
|
||||
from torchao.quantization import (
|
||||
Float8DynamicActivationFloat8WeightConfig,
|
||||
Float8Tensor,
|
||||
Float8WeightOnlyConfig,
|
||||
FqnToConfig,
|
||||
Int4WeightOnlyConfig,
|
||||
Int8DynamicActivationInt8WeightConfig,
|
||||
Int8DynamicActivationIntxWeightConfig,
|
||||
Int8WeightOnlyConfig,
|
||||
IntxWeightOnlyConfig,
|
||||
MappingType,
|
||||
PerAxis,
|
||||
)
|
||||
|
||||
|
||||
@require_torchao
class TorchAoConfigTest(unittest.TestCase):
    """Checks on `TorchAoConfig` itself (dict export, repr, JSON export); no model is loaded here."""

    def test_to_dict(self):
        """
        Makes sure the config format is properly set
        """
        quantization_config = TorchAoConfig(Int4WeightOnlyConfig(group_size=32))
        torchao_orig_config = quantization_config.to_dict()

        self.assertIn("quant_type", torchao_orig_config)
        self.assertIn("quant_method", torchao_orig_config)
        self.assertEqual(torchao_orig_config["quant_method"], "torchao")

    def test_repr(self):
        """
        Check that there is no error in the repr
        """
        config = Int4WeightOnlyConfig(group_size=8)
        quantization_config = TorchAoConfig(config, modules_to_not_convert=["conv"])
        # Only the absence of an exception matters; the returned string is not inspected.
        repr(quantization_config)

    def test_json_serializable(self):
        """
        Check that the config dict can be JSON serialized.
        """
        config = Int4WeightOnlyConfig(group_size=32)
        quantization_config = TorchAoConfig(config)
        d = quantization_config.to_dict()
        # `assertIn` reports the container content on failure, unlike `assertTrue(x in y)`.
        self.assertIn("group_size", d["quant_type"]["default"]["_data"])
        # Must not raise.
        quantization_config.to_json_string(use_diff=False)
|
||||
|
||||
|
||||
@require_torchao
@slow
class TorchAoTestBase:
    """Base mixin with all torchao test methods. Not a TestCase — subclass with unittest.TestCase to run."""

    input_text = "What are we having for dinner?"
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    device = None  # must be set by subclass

    # Decoded text (prompt + 10 new tokens) asserted by most of the tests below.
    _COMMON_EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"

    def tearDown(self):
        """Collect garbage and empty the accelerator cache after every test."""
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    # ------------------------------------------------------------------ helpers

    def _load_quantized_model(self, quant_config, model_name=None):
        """
        Load a causal LM in bfloat16 on `self.device` with `quant_config` applied.

        Args:
            quant_config: the `TorchAoConfig` passed as `quantization_config`.
            model_name: checkpoint to load; falls back to `self.model_name` when omitted.

        Returns:
            The model returned by `AutoModelForCausalLM.from_pretrained`.
        """
        return AutoModelForCausalLM.from_pretrained(
            model_name if model_name is not None else self.model_name,
            # `dtype` is used everywhere for consistency (the file previously mixed it with `torch_dtype`).
            dtype=torch.bfloat16,
            device_map=self.device,
            quantization_config=quant_config,
        )

    def _generate_text(self, quantized_model):
        """Tokenize `self.input_text`, generate 10 new tokens and return the decoded first sequence."""
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
        output = quantized_model.generate(**input_ids, max_new_tokens=10)
        return tokenizer.decode(output[0], skip_special_tokens=True)

    def _assert_common_generation(self, quantized_model):
        """Assert that `quantized_model` generates `_COMMON_EXPECTED_OUTPUT` for `self.input_text`."""
        self.assertEqual(self._generate_text(quantized_model), self._COMMON_EXPECTED_OUTPUT)

    # -------------------------------------------------------------------- tests

    def test_int4wo_quant(self):
        """
        Simple LLM model testing int4 weight only quantization
        """
        int4_packing_format = "plain_int32" if self.device == "xpu" else "tile_packed_to_4d"
        quant_config = TorchAoConfig(Int4WeightOnlyConfig(int4_packing_format=int4_packing_format))
        quantized_model = self._load_quantized_model(quant_config)

        self.assertIn("Int4", type(quantized_model.model.layers[0].self_attn.v_proj.weight).__name__)

        # fmt: off
        EXPECTED_OUTPUT = Expectations(
            {
                ("cuda", None): "What are we having for dinner?\nRed, white, and green beans,",
                ("xpu", None): "What are we having for dinner?\n\nJessica: (smiling)",
                ("xpu", 5): "What are we having for dinner?\n\n[Scene 2]\n\n[",
            }
        )
        # fmt: on
        self.assertEqual(self._generate_text(quantized_model), EXPECTED_OUTPUT.get_expectation())

    def test_int8_dynamic_activation_int8_weight_quant(self):
        """
        Simple LLM model testing int8_dynamic_activation_int8_weight
        """
        quant_config = TorchAoConfig(Int8DynamicActivationInt8WeightConfig())
        quantized_model = self._load_quantized_model(quant_config)
        self._assert_common_generation(quantized_model)

    def test_include_input_output_embeddings(self):
        """
        Quantize `model.embed_tokens` and `lm_head` with an int8 weight-only config (`_default` is None)
        and check that both weights are no longer plain `Parameter`s.
        """
        embedding_config = IntxWeightOnlyConfig(
            weight_dtype=torch.int8,
            granularity=PerAxis(0),
            mapping_type=MappingType.ASYMMETRIC,
        )
        config = FqnToConfig({"_default": None, "model.embed_tokens": embedding_config, "lm_head": embedding_config})
        # need set `include_input_output_embeddings` to True
        quant_config = TorchAoConfig(quant_type=config, include_input_output_embeddings=True)
        quantized_model = self._load_quantized_model(quant_config)

        # making sure embedding is quantized
        self.assertNotEqual(type(quantized_model.model.embed_tokens.weight).__name__, "Parameter")
        self.assertNotEqual(type(quantized_model.lm_head.weight).__name__, "Parameter")
        self._assert_common_generation(quantized_model)

    def test_per_module_config_skip(self):
        """Mapping an exact module fqn to None must leave that module unquantized."""
        config = FqnToConfig({"_default": Int8WeightOnlyConfig(), "model.layers.0.self_attn.q_proj": None})
        quantized_model = self._load_quantized_model(TorchAoConfig(quant_type=config))

        # making sure `model.layers.0.self_attn.q_proj` is skipped
        self.assertNotIsInstance(quantized_model.model.layers[0].self_attn.q_proj.weight, AffineQuantizedTensor)
        self._assert_common_generation(quantized_model)

    def test_fqn_to_config_regex_basic(self):
        """Mapping a `re:` pattern to None must leave the matching modules unquantized."""
        config = FqnToConfig(
            {"_default": Int8WeightOnlyConfig(), r"re:model\.layers\..+\.self_attn\.q_proj": None}
        )
        quantized_model = self._load_quantized_model(TorchAoConfig(quant_type=config))

        # making sure `model.layers.0.self_attn.q_proj` is skipped
        self.assertNotIsInstance(quantized_model.model.layers[0].self_attn.q_proj.weight, AffineQuantizedTensor)
        self._assert_common_generation(quantized_model)

    def test_fqn_to_config_regex_fullmatch(self):
        """Testing that we will only match the fqns that fully
        matches the regex
        """
        linear1_config = Int8WeightOnlyConfig()
        linear2_config = Float8WeightOnlyConfig()
        # intentionally removing `j` after `q_proj` so it's not a full match.
        # The layer index is matched with `\..+\.` like in the sibling tests: the previous `\.+\.`
        # (one or more literal dots) could never match a layer index, so the missing `j` was not
        # what kept the pattern from matching.
        config = FqnToConfig(
            {
                r"re:model\.layers\..+\.self_attn\.q_pro": linear1_config,
                "model.layers.3.self_attn.q_proj": linear2_config,
            }
        )
        quantized_model = self._load_quantized_model(TorchAoConfig(quant_type=config))

        # highest precedence is fully specified module fqn
        self.assertIsInstance(quantized_model.model.layers[3].self_attn.q_proj.weight, Float8Tensor)
        # because regex `model\.layers\..+\.self_attn\.q_pro` didn't fully match
        # `model.layers.1.self_attn.q_proj` (missing last `j`)
        # this layer is not expected to be quantized to int8
        self.assertNotIsInstance(quantized_model.model.layers[1].self_attn.q_proj.weight, AffineQuantizedTensor)
        self._assert_common_generation(quantized_model)

    def test_fqn_to_config_module_regex_precedence(self):
        """Precedence between an exact module fqn, a module regex and `_default`."""
        linear1_config = Int8WeightOnlyConfig()
        linear2_config = Float8WeightOnlyConfig()
        config = FqnToConfig(
            {
                r"re:model\.layers\..+\.self_attn\.q_proj": None,
                "model.layers.3.self_attn.q_proj": linear2_config,
                "_default": linear1_config,
            }
        )
        quantized_model = self._load_quantized_model(TorchAoConfig(quant_type=config))

        # highest precedence is fully specified module fqn
        self.assertIsInstance(quantized_model.model.layers[3].self_attn.q_proj.weight, Float8Tensor)
        # second precedence: regex
        self.assertNotIsInstance(quantized_model.model.layers[1].self_attn.q_proj.weight, AffineQuantizedTensor)
        # last precedence: _default
        self.assertIsInstance(quantized_model.model.layers[1].self_attn.k_proj.weight, AffineQuantizedTensor)
        self._assert_common_generation(quantized_model)

    def test_fqn_to_config_regex_precedence(self):
        """Same precedence check as the module variant, but keyed on parameter fqns (`...q_proj.weight`)."""
        linear1_config = Int8WeightOnlyConfig()
        linear2_config = Float8WeightOnlyConfig()
        config = FqnToConfig(
            {
                r"re:model\.layers\..+\.self_attn\.q_proj.weight": None,
                "model.layers.3.self_attn.q_proj.weight": linear2_config,
                "_default": linear1_config,
            }
        )
        quantized_model = self._load_quantized_model(TorchAoConfig(quant_type=config))

        self.assertIsInstance(quantized_model.model.layers[3].self_attn.q_proj.weight, Float8Tensor)
        self.assertNotIsInstance(quantized_model.model.layers[1].self_attn.q_proj.weight, AffineQuantizedTensor)
        self.assertIsInstance(quantized_model.model.layers[1].self_attn.k_proj.weight, AffineQuantizedTensor)
        self._assert_common_generation(quantized_model)

    def test_fqn_to_config_param_over_module_regex_precedence(self):
        """A parameter-level regex mapped to None must win over a module-level regex carrying a config."""
        linear1_config = Int8WeightOnlyConfig()
        linear2_config = Float8WeightOnlyConfig()
        config = FqnToConfig(
            {
                r"re:model\.layers\..+\.self_attn\.q_proj.weight": None,
                r"re:model\.layers\..+\.self_attn\.q_proj": linear2_config,
                "_default": linear1_config,
            }
        )
        quantized_model = self._load_quantized_model(TorchAoConfig(quant_type=config))

        self.assertNotIsInstance(quantized_model.model.layers[1].self_attn.q_proj.weight, AffineQuantizedTensor)
        self.assertIsInstance(quantized_model.model.layers[1].self_attn.k_proj.weight, AffineQuantizedTensor)
        self._assert_common_generation(quantized_model)

    def test_fqn_to_config_param_over_module_precedence(self):
        """An exact parameter fqn mapped to None must win over the exact module fqn carrying a config."""
        linear1_config = Int8WeightOnlyConfig()
        linear2_config = Float8WeightOnlyConfig()
        config = FqnToConfig(
            {
                "model.layers.3.self_attn.q_proj.weight": None,
                "model.layers.3.self_attn.q_proj": linear2_config,
                "_default": linear1_config,
            }
        )
        quantized_model = self._load_quantized_model(TorchAoConfig(quant_type=config))

        self.assertNotIsInstance(quantized_model.model.layers[3].self_attn.q_proj.weight, AffineQuantizedTensor)
        self.assertIsInstance(quantized_model.model.layers[3].self_attn.k_proj.weight, AffineQuantizedTensor)
        self._assert_common_generation(quantized_model)

    def test_fqn_to_config_exact_over_regex_precedence(self):
        """Exact fqns (parameter or module) must win over a parameter regex; other layers fall back to the regex."""
        linear1_config = Int8WeightOnlyConfig()
        linear2_config = Float8WeightOnlyConfig()
        config = FqnToConfig(
            {
                "model.layers.3.self_attn.q_proj.weight": None,
                "model.layers.1.self_attn.q_proj": linear1_config,
                r"re:model\.layers\..+\.self_attn\.q_proj.weight": linear2_config,
            }
        )
        quantized_model = self._load_quantized_model(TorchAoConfig(quant_type=config))

        self.assertNotIsInstance(quantized_model.model.layers[3].self_attn.q_proj.weight, AffineQuantizedTensor)
        self.assertIsInstance(quantized_model.model.layers[1].self_attn.q_proj.weight, AffineQuantizedTensor)
        self.assertIsInstance(quantized_model.model.layers[2].self_attn.q_proj.weight, Float8Tensor)
        self._assert_common_generation(quantized_model)

    @require_cuda_capability_at_least(8, 9)
    def test_fqn_to_config_non_weight_param(self):
        """Check fqn/regex configs on `feed_forward.experts.gate_up_proj`, a parameter not named `weight`."""
        linear1_config = Int8WeightOnlyConfig()
        linear2_config = Float8WeightOnlyConfig()
        config = FqnToConfig(
            {
                r"re:.*gate_up_proj": linear2_config,
                "model.layers.0.feed_forward.experts.gate_up_proj": None,
                "_default": linear1_config,
            }
        )
        quantized_model = self._load_quantized_model(
            TorchAoConfig(quant_type=config),
            model_name="jcaip/Llama-4-Scout-17B-two-layers-only-testing",
        )

        self.assertIsInstance(quantized_model.model.layers[1].feed_forward.experts.gate_up_proj, Float8Tensor)
        self.assertNotIsInstance(quantized_model.model.layers[0].feed_forward.experts.gate_up_proj, Float8Tensor)
        self.assertIsInstance(quantized_model.model.layers[1].self_attn.q_proj.weight, AffineQuantizedTensor)

    def test_compute_module_sizes(self):
        r"""
        Test if we compute the right module sizes needed to generate the device map.
        Also test if we get the right values for `total_byte_count` in `caching_allocator_warmup`.
        """
        from transformers import AutoConfig
        from transformers.integrations.accelerate import compute_module_sizes
        from transformers.modeling_utils import expand_device_map, get_total_byte_count
        from transformers.quantizers import AutoHfQuantizer

        def tensor_names(module):
            # Names of all parameters followed by all buffers, used as keys of the expanded device map.
            return [name for name, _ in module.named_parameters()] + [name for name, _ in module.named_buffers()]

        # we need to preprocess the model like that because device_map calculation happens before we load the weights inside the model.
        # For normal weights, it's fine but for quantized weights, the tensors dtype might change during loading.
        with torch.device("meta"):
            config = AutoConfig.from_pretrained(self.model_name)
            model = AutoModelForCausalLM.from_config(config, dtype=torch.bfloat16)
            model_size, _ = compute_module_sizes(model, only_modules=False)

            expanded_device_map = expand_device_map({"": torch_device}, tensor_names(model))
            total_byte_count = list(get_total_byte_count(model, expanded_device_map).values())[0]

            # testing prequantized = False should be enough, the shape should be the same whether it is pre-quantized or not
            hf_quantizer = AutoHfQuantizer.from_config(
                TorchAoConfig(quant_type=Int4WeightOnlyConfig()), pre_quantized=False
            )
            hf_quantizer.preprocess_model(model=model, config=model.config)
            quantized_model_size, _ = compute_module_sizes(model, hf_quantizer, only_modules=False)

            # Names are recomputed after `preprocess_model`, as in the original test.
            expanded_device_map = expand_device_map({"": torch_device}, tensor_names(model))
            quantized_total_byte_count = list(
                get_total_byte_count(model, expanded_device_map, hf_quantizer).values()
            )[0]

        for name, module in model.named_modules():
            # modules are not replaced when using torchao
            if isinstance(module, torch.nn.Linear) and "lm_head" not in name:
                # from 16 bits to 4 bits
                self.assertEqual(
                    int(model_size[f"{name}.weight"] // 4), int(quantized_model_size[f"{name}.weight"])
                )

        # check that we get the same value, as we use `compute_module_sizes` in `get_total_byte_count`
        self.assertEqual(total_byte_count, model_size[""])
        self.assertEqual(quantized_total_byte_count, quantized_model_size[""])

        # the quantized model must be more than 2 times smaller in total
        self.assertGreater(model_size[""], quantized_model_size[""] * 2)
|
||||
|
||||
|
||||
class TorchAoCPUTest(TorchAoTestBase, unittest.TestCase):
    """Runs the tests inherited from `TorchAoTestBase` with `device` set to "cpu"."""

    device = "cpu"

    @unittest.skip("Int4 does not support CPU")
    def test_int4wo_quant(self):
        """Override of the inherited Int4 weight-only test; always skipped for this class."""
|
||||
|
||||
|
||||
@require_torch_accelerator
class TorchAoAcceleratorTest(TorchAoTestBase, unittest.TestCase):
    """Runs the inherited torchao tests on `torch_device` and adds Int4 offload / multi-accelerator tests."""

    device = torch_device

    def _int4_quant_config(self):
        """Return a `TorchAoConfig` for Int4 weight-only quantization with the packing format used for `self.device`."""
        int4_packing_format = "plain_int32" if self.device == "xpu" else "tile_packed_to_4d"
        return TorchAoConfig(Int4WeightOnlyConfig(int4_packing_format=int4_packing_format))

    def _assert_int4_generation(self, quantized_model):
        """Generate 10 new tokens from `self.input_text` and compare with the per-device Int4 expectation."""
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
        output = quantized_model.generate(**input_ids, max_new_tokens=10)
        # fmt: off
        EXPECTED_OUTPUT = Expectations(
            {
                ("cuda", None): "What are we having for dinner?\nRed, white, and green beans,",
                ("xpu", None): "What are we having for dinner?\n\nJessica: (smiling)",
                ("xpu", 5): "What are we having for dinner?\n\n[Scene 2]\n\n[",
            }
        )
        # fmt: on
        self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT.get_expectation())

    def test_int4wo_offload(self):
        """
        Test Int4 weight-only quantization with CPU offload.
        """
        # Layers 0-18 stay on accelerator 0, layers 19-21 are offloaded to the CPU.
        # Key order is the same as in the previously hand-written mapping.
        device_map_offload = {"model.embed_tokens": 0}
        device_map_offload.update({f"model.layers.{idx}": 0 for idx in range(19)})
        device_map_offload.update({f"model.layers.{idx}": "cpu" for idx in range(19, 22)})
        device_map_offload.update({"model.norm": 0, "model.rotary_emb": 0, "lm_head": 0})

        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            # `dtype` matches the keyword already used elsewhere in this file (instead of `torch_dtype`).
            dtype=torch.bfloat16,
            device_map=device_map_offload,
            quantization_config=self._int4_quant_config(),
        )
        self._assert_int4_generation(quantized_model)

    @require_torch_multi_accelerator
    def test_int4wo_quant_multi_accelerator(self):
        """
        Simple test that checks if the quantized model int4 weight only is working properly with multiple accelerators
        set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 CUDA GPUs
        set ZE_AFFINITY_MASK=0,1 if you have more than 2 Intel XPUs
        """
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            dtype=torch.bfloat16,
            device_map="auto",
            quantization_config=self._int4_quant_config(),
        )

        # `assertEqual` shows the actual set of devices on failure.
        self.assertEqual(set(quantized_model.hf_device_map.values()), {0, 1})
        self._assert_int4_generation(quantized_model)
|
||||
|
||||
|
||||
@slow
@require_torchao
class TorchAoSerializationTest(unittest.TestCase):
    """
    Parameterized save/reload tests for torchao-quantized models.

    Each case quantizes the model while loading it, checks the generated text, saves the model with
    `save_pretrained`, loads it back and checks the generated text against the same expectation.
    """

    input_text = "What are we having for dinner?"
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

    # fmt: off
    COMMON_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
    ALL_DEVICES_COMMON = Expectations(
        {
            ("cpu", None): COMMON_OUTPUT,
            ("cuda", None): COMMON_OUTPUT,
            ("xpu", None): COMMON_OUTPUT,
        }
    )

    # Each entry is (case name, torchao config instance, expectations keyed by device).
    # Without torchao the list stays empty, so none of the config classes are instantiated.
    if is_torchao_available():
        test_params = [
            (
                "Int8WeightOnlyConfig",
                Int8WeightOnlyConfig(version=2),
                ALL_DEVICES_COMMON,
            ),
            (
                "Int8DynamicActivationInt8WeightConfig",
                Int8DynamicActivationInt8WeightConfig(version=2),
                ALL_DEVICES_COMMON,
            ),
            (
                "Float8DynamicActivationFloat8WeightConfig",
                Float8DynamicActivationFloat8WeightConfig(),
                Expectations(
                    {
                        ("cuda", None): COMMON_OUTPUT,
                        ("xpu", None): "What are we having for dinner?\n\nJess: (smiling) I",
                        ("xpu", 5): COMMON_OUTPUT,
                    }
                ),
            ),
            (
                "Float8WeightOnlyConfig",
                Float8WeightOnlyConfig(),
                Expectations(
                    {
                        ("cuda", None): COMMON_OUTPUT,
                        ("xpu", None): COMMON_OUTPUT,
                    }
                ),
            ),
            (
                "Int4WeightOnlyConfig",
                Int4WeightOnlyConfig(
                    int4_packing_format="plain_int32" if torch_device == "xpu" else "tile_packed_to_4d"
                ),
                Expectations(
                    {
                        ("cuda", None): "What are we having for dinner?\nRed, white, and green beans,",
                        ("xpu", None): COMMON_OUTPUT,
                        ("xpu", 5): "What are we having for dinner?\n\n[Scene 2]\n\n[",
                    }
                ),
            ),
            (
                "Int8DynamicActivationIntxWeightConfig",
                Int8DynamicActivationIntxWeightConfig(),
                Expectations(
                    {
                        ("cpu", None): COMMON_OUTPUT,
                        ("cuda", 9): COMMON_OUTPUT,
                        ("cuda", 8): "What are we having for dinner?\n\nJEN: (smiling) I",
                        ("xpu", None): COMMON_OUTPUT,
                    }
                ),
            ),
            (
                "IntxWeightOnlyConfig",
                IntxWeightOnlyConfig(),
                ALL_DEVICES_COMMON,
            ),
            (
                "NVFP4DynamicActivationNVFP4WeightConfig",
                NVFP4DynamicActivationNVFP4WeightConfig(),
                Expectations(
                    {
                        ("cuda", None): "What are we having for dinner?\n\n10. Avoid using \"I",
                    }
                ),
            ),
        ]
    else:
        test_params = []
    # fmt: on

    def tearDown(self):
        """Collect garbage and empty the backend cache of `torch_device` after every test."""
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def _check_serialization(self, device, config, expected_output):
        """
        Quantize `self.model_name` with `config`, then compare generated text before and after a save/reload.

        Args:
            device: Passed as `device_map` to `from_pretrained` and used as the target of the tokenized inputs.
            config: torchao config instance; it is wrapped in a `TorchAoConfig` for loading.
            expected_output: Exact decoded text expected from both the quantized and the reloaded model.
        """
        float8_configs = (Float8DynamicActivationFloat8WeightConfig, Float8WeightOnlyConfig)
        if (
            isinstance(config, float8_configs)
            and torch.cuda.is_available()
            and torch.cuda.get_device_capability() < (8, 9)
        ):
            self.skipTest(f"{type(config).__name__} requires CUDA capability >= (8, 9)")
        if (
            isinstance(config, NVFP4DynamicActivationNVFP4WeightConfig)
            and torch.cuda.is_available()
            and torch.cuda.get_device_capability() < (10, 0)
        ):
            self.skipTest(f"{type(config).__name__} requires CUDA capability >= (10, 0) (SM100)")

        # Int4 and NVFP4 configs are loaded in bfloat16; every other config uses dtype="auto".
        bfloat16_configs = (Int4WeightOnlyConfig, NVFP4DynamicActivationNVFP4WeightConfig)
        dtype = "auto"
        if isinstance(config, bfloat16_configs):
            dtype = torch.bfloat16

        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            dtype=dtype,
            device_map=device,
            quantization_config=TorchAoConfig(config),
        )
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        def generated_text(model):
            # Tokenize the shared prompt, generate 10 new tokens and decode the first sequence.
            prompt_inputs = tokenizer(self.input_text, return_tensors="pt").to(device)
            generated_ids = model.generate(**prompt_inputs, max_new_tokens=10)
            return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

        self.assertEqual(generated_text(quantized_model), expected_output)

        # Save to a temporary directory and verify that the reloaded model produces the same text.
        with tempfile.TemporaryDirectory() as checkpoint_dir:
            quantized_model.save_pretrained(checkpoint_dir)
            reloaded_model = AutoModelForCausalLM.from_pretrained(checkpoint_dir, dtype=dtype, device_map=device)
            self.assertEqual(generated_text(reloaded_model), expected_output)

    @parameterized.expand(test_params, skip_on_empty=True)
    def test_serialization_cpu(self, _name, config, expected_outputs):
        """Run the save/reload check on CPU; skip configs that have no CPU expectation."""
        try:
            cpu_expected = expected_outputs.find_expectation(("cpu", None, None))
        except ValueError:
            self.skipTest(f"{type(config).__name__} does not support CPU")
        else:
            self._check_serialization("cpu", config, cpu_expected)

    @parameterized.expand(test_params, skip_on_empty=True)
    @require_torch_accelerator
    def test_serialization_accelerator(self, _name, config, expected_outputs):
        """Run the save/reload check on `torch_device`; skip configs that have no expectation for it."""
        try:
            accelerator_expected = expected_outputs.get_expectation()
        except ValueError:
            self.skipTest(f"{type(config).__name__} does not support {torch_device}")
        else:
            self._check_serialization(torch_device, config, accelerator_expected)
|
||||
|
||||
|
||||
# Run every test in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user