# Copyright 2023 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import gc import tempfile import unittest from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AwqConfig, OPTForCausalLM from transformers.testing_utils import ( backend_empty_cache, require_accelerate, require_gptqmodel, require_torch_accelerator, require_torch_gpu, require_torch_multi_accelerator, slow, torch_device, ) from transformers.utils import is_torch_available from transformers.utils.quantization_config import AwqBackend if is_torch_available(): import torch @require_torch_accelerator class AwqConfigTest(unittest.TestCase): def test_wrong_backend(self): """ Simple test that checks if a user passes a wrong backend an error is raised """ # This should work fine _ = AwqConfig(bits=4) with self.assertRaises(ValueError): AwqConfig(bits=4, backend="") # These should work fine _ = AwqConfig(bits=4, version="GEMM") _ = AwqConfig(bits=4, version="gemm") with self.assertRaises(ValueError): AwqConfig(bits=4, backend="unexisting-backend") def test_to_dict(self): """ Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object """ quantization_config = AwqConfig(bits=4) config_to_dict = quantization_config.to_dict() for key in config_to_dict: if key == "version": # "version" is legacy filed. # It will be written in to_dict() for compatibility, but AwqConfig will not have this field. self.assertFalse(hasattr(quantization_config, key)) else: self.assertEqual(getattr(quantization_config, key), config_to_dict[key]) def test_from_dict(self): """ Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict """ dict = {"bits": 2, "zero_point": False, "backend": "auto"} quantization_config = AwqConfig.from_dict(dict) self.assertEqual(dict["bits"], quantization_config.bits) self.assertEqual(dict["zero_point"], quantization_config.zero_point) self.assertEqual(dict["backend"], quantization_config.backend) @slow @require_torch_accelerator @require_gptqmodel @require_accelerate class AwqTest(unittest.TestCase): model_name = "TheBloke/Mistral-7B-v0.1-AWQ" dummy_transformers_model_name = "bigscience/bloom-560m" model_with_no_k_proj_quantized = "hf-internal-testing/opt-125m-awq-no-k-proj" input_text = "Hello my name is" EXPECTED_OUTPUT = set() EXPECTED_OUTPUT.add( "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish" ) EXPECTED_OUTPUT.add( "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish. I am" ) EXPECTED_OUTPUT.add( "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Exercise and Sport Science with a" ) EXPECTED_OUTPUT.add( "Hello my name is Katie and I am a 20 year old student from the UK. I am currently studying for a degree in English Literature and History at the University of York. I am a very out" ) EXPECTED_OUTPUT.add( "Hello my name is Katie and I am a 20 year old student from the UK. I am currently studying for a degree in English Literature and History at the University of York. I am a very creative" ) EXPECTED_OUTPUT_BF16 = [ "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish", "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Exercise and Sport Science with a", ] EXPECTED_OUTPUT_EXLLAMA = [ "Hello my name is Katie and I am a 20 year old student from the UK. I am currently studying for a degree in English Literature and History at the University of York. I am a very out", "Hello my name is Katie and I am a 20 year old student from the UK. I am currently studying for a degree in English Literature and History at the University of York. I am a very creative", "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish", ] device_map = torch_device # called only once for all test in this class @classmethod def setUpClass(cls): """ Setup quantized model """ cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name) cls.quantized_model = AutoModelForCausalLM.from_pretrained(cls.model_name, device_map=cls.device_map) def tearDown(self): gc.collect() backend_empty_cache(torch_device) gc.collect() def test_quantized_model_conversion(self): """ Simple test that checks if the quantized model has been converted properly """ from gptqmodel.nn_modules.qlinear import BaseQuantLinear from transformers.integrations.awq import replace_with_awq_linear model_id = "facebook/opt-350m" config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5") quantization_config = AwqConfig(bits=4) with torch.device("meta"): model = OPTForCausalLM(config) nb_linears = 0 for module in model.modules(): if isinstance(module, torch.nn.Linear): nb_linears += 1 model = replace_with_awq_linear(model, quantization_config=quantization_config) nb_awq_linear = 0 for module in model.modules(): if isinstance(module, BaseQuantLinear): nb_awq_linear += 1 self.assertEqual(nb_linears, nb_awq_linear) # Try with `modules_not_to_convert` with torch.device("meta"): model = OPTForCausalLM(config) model = replace_with_awq_linear( model, quantization_config=quantization_config, modules_to_not_convert=["lm_head"] ) nb_awq_linear = 0 for module in model.modules(): if isinstance(module, BaseQuantLinear): nb_awq_linear += 1 self.assertEqual(nb_linears - 1, nb_awq_linear) def test_quantized_model(self): """ Simple test that checks if the quantized model is working properly """ input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) output = self.quantized_model.generate(**input_ids, max_new_tokens=40) self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT) def test_raise_if_non_quantized(self): model_id = "facebook/opt-125m" quantization_config = AwqConfig(bits=4) with self.assertRaises(ValueError): _ = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config) def test_quantized_model_bf16(self): """ Simple test that checks if the quantized model is working properly with bf16 """ input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype=torch.bfloat16).to(torch_device) output = quantized_model.generate(**input_ids, max_new_tokens=40) self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT_BF16) @require_torch_gpu def test_quantized_model_exllama(self): """ Simple test that checks if the quantized model is working properly with exllama backend """ input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) quantization_config = AwqConfig(backend=AwqBackend.EXLLAMA_V2) quantized_model = AutoModelForCausalLM.from_pretrained( self.model_name, quantization_config=quantization_config, device_map=torch_device ) output = quantized_model.generate(**input_ids, max_new_tokens=40) self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT_EXLLAMA) def test_quantized_model_no_device_map(self): """ Simple test that checks if the quantized model is working properly """ input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name).to(torch_device) output = quantized_model.generate(**input_ids, max_new_tokens=40) self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT) def test_save_pretrained(self): """ Simple test that checks if the quantized model is working properly after being saved and loaded """ # Load a fresh model for saving — the shared self.quantized_model may have # already been in-place transformed by a prior generate() call, and saving # those transformed buffers then re-transforming on reload would corrupt data. fresh_model = AutoModelForCausalLM.from_pretrained(self.model_name) with tempfile.TemporaryDirectory() as tmpdirname: fresh_model.save_pretrained(tmpdirname) model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map) input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) output = model.generate(**input_ids, max_new_tokens=40) self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT) @require_torch_multi_accelerator def test_quantized_model_multi_accelerator(self): """ Simple test that checks if the quantized model is working properly with multiple GPUs """ input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto") self.assertTrue(len(set(quantized_model.hf_device_map.values())) >= 2) output = quantized_model.generate(**input_ids, max_new_tokens=40) self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT) def test_quantized_model_no_k_proj_quantized(self): """ Simple test that checks if the quantized model is working properly with multiple GPUs """ dummy_input = torch.LongTensor([[0, 1, 0]]).to(torch_device) quantized_model = AutoModelForCausalLM.from_pretrained(self.model_with_no_k_proj_quantized).to(torch_device) self.assertTrue(isinstance(quantized_model.model.decoder.layers[0].self_attn.k_proj, torch.nn.Linear)) self.assertFalse(isinstance(quantized_model.model.decoder.layers[0].self_attn.v_proj, torch.nn.Linear)) EXPECTED_OUTPUT = torch.LongTensor([[0, 1, 0, 50118, 50118, 133, 248, 12, 134, 16, 10, 372, 2031]]).to( torch_device ) output = quantized_model.generate(dummy_input, max_new_tokens=10) self.assertTrue((output == EXPECTED_OUTPUT).all()) @slow @require_torch_accelerator @require_gptqmodel @require_accelerate class AwqScaleTest(unittest.TestCase): model_name = "TechxGenus/starcoder2-3b-AWQ" def test_load_quantized_model(self): from gptqmodel.quantization.awq.modules.act import ScaledActivation """ Simple test that checks if the scales have been replaced in the quantized model """ quantized_model = AutoModelForCausalLM.from_pretrained( "TechxGenus/starcoder2-3b-AWQ", dtype=torch.float16, device_map=torch_device ) self.assertTrue(isinstance(quantized_model.model.layers[0].mlp.act, ScaledActivation)) @slow @require_gptqmodel @require_accelerate class AwqTorchFusedTest(unittest.TestCase): def test_quantized_model_torch_fused(self): """ Simple test that checks if the quantized model is working properly with torch_fused backend """ quantization_config = AwqConfig(backend=AwqBackend.TORCH_FUSED_AWQ) model = AutoModelForCausalLM.from_pretrained( "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", quantization_config=quantization_config, device_map="cpu", ) tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ") input_ids = tokenizer.encode("How to make a cake", return_tensors="pt") pad_token_id = tokenizer.eos_token_id output = model.generate(input_ids, do_sample=False, max_length=20, pad_token_id=pad_token_id) print(tokenizer.decode(output[0], skip_special_tokens=True)) expected_output = ( "How to make a cake with a round tin?\nHow to make a cake with a round tin?\n1. Preheat the oven to 180°" ) self.assertIn(tokenizer.decode(output[0], skip_special_tokens=True), expected_output)