# Copyright 2025 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Testing suite for the PyTorch SmolLM3 model.""" import gc import unittest import pytest from parameterized import parameterized from transformers import AutoTokenizer, BitsAndBytesConfig, SmolLM3Config, is_torch_available from transformers.generation.configuration_utils import GenerationConfig from transformers.testing_utils import ( backend_empty_cache, is_flaky, require_bitsandbytes, require_flash_attn, require_torch, slow, torch_device, ) from transformers.utils.import_utils import is_torch_greater_or_equal if is_torch_available(): import torch from transformers import ( SmolLM3ForCausalLM, SmolLM3ForQuestionAnswering, SmolLM3ForSequenceClassification, SmolLM3ForTokenClassification, SmolLM3Model, ) from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester from ...test_modeling_common import ( TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION, ModelTesterMixin, ) class SmolLM3ModelTester(CausalLMModelTester): config_class = SmolLM3Config if is_torch_available(): base_model_class = SmolLM3Model causal_lm_class = SmolLM3ForCausalLM question_answering_class = SmolLM3ForQuestionAnswering sequence_classification_class = SmolLM3ForSequenceClassification token_classification_class = SmolLM3ForTokenClassification @require_torch class SmolLM3ModelTest(CausalLMModelTest, unittest.TestCase): model_tester_class = SmolLM3ModelTester @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION) @is_flaky() def test_eager_matches_sdpa_inference(self, *args): # flaky test_eager_matches_sdpa_inference_24_fp32_pad_left_output_attentions return getattr(ModelTesterMixin, self._testMethodName)(self) @require_torch class SmolLM3IntegrationTest(unittest.TestCase): model_id = "HuggingFaceTB/SmolLM3-3B" @slow def test_model_3b_logits(self): input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338] model = SmolLM3ForCausalLM.from_pretrained(self.model_id, device_map="auto") input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device) with torch.no_grad(): out = model(input_ids).logits.float().cpu() # Expected mean on dim = -1 EXPECTED_MEAN = torch.tensor([[9.3306, 8.1721, 6.4764, 7.6011, 11.1218, 7.5343, 7.1195, 8.0956]]) torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, rtol=1e-2, atol=1e-2) # slicing logits[0, 0, 0:30] EXPECTED_SLICE = torch.tensor( [15.7759, 17.6274, 16.3404, 14.5543, 13.1366, 14.2475, 15.8710, 15.6753, 12.3856, 13.0386, 14.0792, 12.7253, 13.9634, 12.1271, 12.4320, 16.0329, 17.3975, 17.1396, 17.8666, 17.0103, 17.2962, 16.8777, 16.7144, 16.3023, 16.6084, 12.4649, 12.0723, 14.1148, 14.8239, 15.2733]) # fmt: skip torch.testing.assert_close(out[0, 0, :30], EXPECTED_SLICE, rtol=1e-4, atol=1e-4) del model backend_empty_cache(torch_device) gc.collect() @slow def test_model_3b_generation(self): EXPECTED_TEXT_COMPLETION = """Gravity is the force that pulls objects toward each other. It is the force that keeps your feet on the ground and makes""" prompt = "Gravity is the force" tokenizer = AutoTokenizer.from_pretrained(self.model_id) model = SmolLM3ForCausalLM.from_pretrained(self.model_id, device_map="auto") input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.model.embed_tokens.weight.device) # greedy generation outputs generated_ids = model.generate(input_ids, max_new_tokens=20, do_sample=False) text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) self.assertEqual(EXPECTED_TEXT_COMPLETION, text) del model backend_empty_cache(torch_device) gc.collect() @require_bitsandbytes @slow @require_flash_attn @pytest.mark.flash_attn_test def test_model_3b_long_prompt(self): EXPECTED_OUTPUT_TOKEN_IDS = [306, 338] # An input with 4097 tokens that is above the size of the sliding window input_ids = [1] + [306, 338] * 2048 model = SmolLM3ForCausalLM.from_pretrained( self.model_id, device_map="auto", quantization_config=BitsAndBytesConfig(load_in_4bit=True), attn_implementation="flash_attention_2", ) input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device) generated_ids = model.generate(input_ids, max_new_tokens=4, do_sample=False) self.assertEqual(EXPECTED_OUTPUT_TOKEN_IDS, generated_ids[0][-2:].tolist()) # Assisted generation assistant_model = model assistant_model.generation_config.num_assistant_tokens = 2 assistant_model.generation_config.num_assistant_tokens_schedule = "constant" generated_ids = model.generate(input_ids, max_new_tokens=4, do_sample=False) self.assertEqual(EXPECTED_OUTPUT_TOKEN_IDS, generated_ids[0][-2:].tolist()) del assistant_model del model backend_empty_cache(torch_device) gc.collect() @pytest.mark.torch_export_test @slow def test_export_static_cache(self): from transformers.integrations.executorch import ( TorchExportableModuleWithStaticCache, convert_and_export_with_cache, ) tokenizer = AutoTokenizer.from_pretrained( self.model_id, pad_token="<|finetune_right_pad_id|>", padding_side="right" ) EXPECTED_TEXT_COMPLETION = "Gravity is the force that pulls objects toward the center of the Earth. It is a force that is always present, and" max_generation_length = tokenizer(EXPECTED_TEXT_COMPLETION, return_tensors="pt", padding=True)[ "input_ids" ].shape[-1] # Load model device = "cpu" # TODO (joao / export experts): should be on `torch_device`, but causes GPU OOM dtype = torch.bfloat16 cache_implementation = "static" attn_implementation = "sdpa" batch_size = 1 model = SmolLM3ForCausalLM.from_pretrained( self.model_id, device_map=device, dtype=dtype, attn_implementation=attn_implementation, generation_config=GenerationConfig( use_cache=True, cache_implementation=cache_implementation, max_length=max_generation_length, cache_config={ "batch_size": batch_size, "max_cache_len": max_generation_length, }, ), ) prompt = ["Gravity is the force"] prompt_tokens = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device) prompt_token_ids = prompt_tokens["input_ids"] max_new_tokens = max_generation_length - prompt_token_ids.shape[-1] # Static Cache + export strict = is_torch_greater_or_equal("2.7.0") # Due to https://github.com/pytorch/pytorch/issues/150994 exported_program = convert_and_export_with_cache(model, strict=strict) ep_generated_ids = TorchExportableModuleWithStaticCache.generate( exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens ) ep_generated_text = tokenizer.batch_decode(ep_generated_ids, skip_special_tokens=True) self.assertEqual(EXPECTED_TEXT_COMPLETION, ep_generated_text)