# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Florence2 model."""

import unittest

import requests

from transformers import (
    AutoProcessor,
    Florence2Config,
    Florence2ForConditionalGeneration,
    Florence2Model,
    is_torch_available,
    is_vision_available,
)
from transformers.testing_utils import (
    Expectations,
    cleanup,
    require_deterministic_for_xpu,
    require_torch,
    slow,
    torch_device,
)

from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin


if is_torch_available():
    import torch

if is_vision_available():
    from PIL import Image


class Florence2VisionText2TextModelTester:
    """
    Builds a tiny `Florence2Config` and matching random inputs for the common model tests.

    This is a plain helper (not a `unittest.TestCase`): it only stores hyper-parameters and
    produces configs / input dictionaries from them.
    """

    def __init__(
        self,
        parent,
        batch_size=13,
        num_channels=3,
        image_size=8,
        seq_length=13,
        encoder_seq_length=18,
        is_training=True,
        vocab_size=99,
        max_position_embeddings=64,
        encoder_layers=1,
        encoder_ffn_dim=16,
        decoder_layers=1,
        decoder_ffn_dim=16,
        num_attention_heads=1,
        d_model=16,
        activation_function="gelu",
        dropout=0.1,
        eos_token_id=2,
        bos_token_id=0,
        pad_token_id=1,
        image_token_id=4,
        depths=[1],
        patch_size=[7],
        patch_stride=[4],
        patch_padding=[3],
        patch_prenorm=[False],
        embed_dim=[16],
        num_heads=[1],
        num_groups=[1],
        window_size=12,
        drop_path_rate=0.1,
        projection_dim=16,
    ):
        self.parent = parent
        self.batch_size = batch_size
        # Honour the caller-supplied channel count (it used to be overwritten with a literal 3
        # further down, which made the argument a no-op; the default is still 3).
        self.num_channels = num_channels
        self.image_size = image_size
        self.is_training = is_training
        self.num_hidden_layers = decoder_layers
        self.hidden_size = d_model

        # Language model configs
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.encoder_layers = encoder_layers
        self.encoder_ffn_dim = encoder_ffn_dim
        self.decoder_layers = decoder_layers
        self.decoder_ffn_dim = decoder_ffn_dim
        self.num_attention_heads = num_attention_heads
        self.d_model = d_model
        self.activation_function = activation_function
        self.dropout = dropout
        self.eos_token_id = eos_token_id
        self.bos_token_id = bos_token_id
        self.pad_token_id = pad_token_id
        self.image_token_id = image_token_id

        # Vision model configs
        # The list-valued arguments are copied: their defaults are shared list objects, and
        # storing them by reference would let a mutation in one test leak into every other
        # tester instance / config built from the same default.
        self.drop_path_rate = drop_path_rate
        self.patch_size = list(patch_size)
        self.depths = list(depths)
        self.patch_stride = list(patch_stride)
        self.patch_padding = list(patch_padding)
        self.patch_prenorm = list(patch_prenorm)
        self.embed_dim = list(embed_dim)
        self.num_heads = list(num_heads)
        self.num_groups = list(num_groups)
        self.window_size = window_size
        self.projection_dim = projection_dim

        self.num_image_tokens = 5
        # The decoder sequence length is the requested length plus the image-token count.
        self.seq_length = seq_length + self.num_image_tokens
        self.encoder_seq_length = encoder_seq_length

    def get_config(self):
        """Return a `Florence2Config` assembled from the stored text and vision hyper-parameters."""
        text_config = {
            "model_type": "bart",
            "vocab_size": self.vocab_size,
            "max_position_embeddings": self.max_position_embeddings,
            "encoder_layers": self.encoder_layers,
            "encoder_ffn_dim": self.encoder_ffn_dim,
            "encoder_attention_heads": self.num_attention_heads,
            "decoder_layers": self.decoder_layers,
            "decoder_ffn_dim": self.decoder_ffn_dim,
            "decoder_attention_heads": self.num_attention_heads,
            "d_model": self.d_model,
            "activation_function": self.activation_function,
            "dropout": self.dropout,
            "attention_dropout": self.dropout,
            "activation_dropout": self.dropout,
            "eos_token_id": self.eos_token_id,
            "bos_token_id": self.bos_token_id,
            "pad_token_id": self.pad_token_id,
        }

        vision_config = {
            "drop_path_rate": self.drop_path_rate,
            "patch_size": self.patch_size,
            "depths": self.depths,
            "patch_stride": self.patch_stride,
            "patch_padding": self.patch_padding,
            "patch_prenorm": self.patch_prenorm,
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "num_groups": self.num_groups,
            "window_size": self.window_size,
            "activation_function": self.activation_function,
            "projection_dim": self.projection_dim,
        }

        return Florence2Config(
            text_config=text_config,
            vision_config=vision_config,
            image_token_id=self.image_token_id,
            initializer_range=0.02,
        )

    def prepare_config_and_inputs(self):
        """Return `(config, inputs_dict)` with random pixel values and encoder/decoder token ids."""
        pixel_values = floats_tensor(
            [
                self.batch_size,
                self.num_channels,
                self.image_size,
                self.image_size,
            ]
        )
        # Shift the random ids by one, then make sure the image token only appears in the
        # leading `num_image_tokens` positions and every row ends with EOS.
        input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size - 1) + 1
        input_ids[input_ids == self.image_token_id] = self.pad_token_id
        input_ids[:, : self.num_image_tokens] = self.image_token_id
        input_ids[:, -1] = self.eos_token_id
        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
        decoder_attention_mask = decoder_input_ids.ne(self.pad_token_id)

        inputs_dict = {
            "input_ids": input_ids,
            "pixel_values": pixel_values,
            "decoder_input_ids": decoder_input_ids,
            "decoder_attention_mask": decoder_attention_mask,
        }

        config = self.get_config()
        return config, inputs_dict

    def prepare_config_and_inputs_for_common(self):
        """Return the same `(config, inputs_dict)` pair as `prepare_config_and_inputs`."""
        config, inputs_dict = self.prepare_config_and_inputs()
        return config, inputs_dict

    def create_and_check_florence2_model_fp16_forward(self, config, input_ids, pixel_values, attention_mask):
        """Run a forward pass under CUDA float16 autocast and assert the logits contain no NaN."""
        model = Florence2ForConditionalGeneration(config=config)
        model.to(torch_device)
        model.eval()
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                pixel_values=pixel_values.to(torch.float16),
                return_dict=True,
            )["logits"]
        self.parent.assertFalse(torch.isnan(logits).any().item())


@require_torch
class Florence2ForConditionalGenerationModelTest(
    ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase
):
    """
    Model tester for `Florence2ForConditionalGeneration`.
    """

    all_model_classes = (Florence2Model, Florence2ForConditionalGeneration) if is_torch_available() else ()
    pipeline_model_mapping = (
        {
            "image-text-to-text": Florence2ForConditionalGeneration,
            "any-to-any": Florence2ForConditionalGeneration,
        }
        if is_torch_available()
        else {}
    )
    skip_test_image_features_output_shape = True  # Florence2 uses index -3 for hidden_size instead of -1
    has_attentions = False
    _is_composite = True

    def setUp(self):
        self.model_tester = Florence2VisionText2TextModelTester(self)
        self.config_tester = ConfigTester(self, config_class=Florence2Config, has_text_modality=False)

    def test_config(self):
        self.config_tester.run_common_tests()

    @unittest.skip(
        reason="Backbone architecture (BART) has tied weights by default and there is no way to remove it, check: https://github.com/huggingface/transformers/pull/31771#issuecomment-2210915245"
    )
    def test_load_save_without_tied_weights(self):
        pass

    # This skip previously lived on the model-tester helper, which is not a `TestCase`, so it
    # never applied to the test collected from the mixins. It belongs on the test class.
    @unittest.skip(reason="SDPA can't dispatch on flash due to unsupported qkv stride")
    def test_sdpa_can_dispatch_on_flash(self):
        pass


def prepare_img():
    """Download and open the `australia.jpg` documentation image."""
    url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg?download=true"
    image = Image.open(requests.get(url, stream=True).raw)
    return image


@slow
@require_torch
class Florence2ForConditionalGenerationIntegrationTest(unittest.TestCase):
    """
    Slow integration tests comparing tokenization, generation and post-processing of the
    `florence-community/Florence-2-base` and `-large` checkpoints against recorded values.

    NOTE(review): several prompts, `task=` arguments and expected decoded strings below are
    empty even though the matching expected token ids are not. That looks like markup-style
    task/location tokens were stripped from this copy of the file — confirm against the
    upstream test before trusting these expectations.
    """

    def setUp(self):
        # `image1` is the same URL `prepare_img` fetches, so reuse the helper.
        self.image1 = prepare_img()
        self.image2 = Image.open(
            requests.get(
                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true",
                stream=True,
            ).raw
        )

    def tearDown(self):
        cleanup(torch_device, gc_collect=True)

    def test_base_model_inference_eager(self):
        """Base checkpoint, eager attention, single image: check input ids, generated ids and text."""
        model_name = "florence-community/Florence-2-base"
        processor = AutoProcessor.from_pretrained(model_name)
        model = Florence2ForConditionalGeneration.from_pretrained(model_name, attn_implementation="eager").to(
            torch_device
        )

        prompt = ""
        inputs = processor(images=self.image1, text=prompt, return_tensors="pt")
        inputs.to(device=torch_device)

        EXPECTED_INPUT_IDS = [[processor.image_token_id] * processor.num_image_tokens + [0, 47066, 21700, 11, 4617, 99, 16, 2343, 11, 5, 2274, 4, 2]]  # fmt: skip
        self.assertEqual(inputs["input_ids"].tolist(), EXPECTED_INPUT_IDS)

        predictions = model.generate(**inputs, max_new_tokens=100)

        EXPECTED_PREDICTION_IDS = [[2, 0, 133, 2274, 924, 10, 912, 1203, 2828, 15, 5, 526, 9, 10, 2014, 11, 35910, 6, 188, 469, 412, 4, 20, 2014, 16, 9321, 19, 3413, 6, 3980, 6, 8, 19638, 6, 8, 89, 32, 82, 3051, 15, 5, 2767, 22609, 4, 20, 6360, 16, 7097, 11, 5, 3618, 4, 2]]  # fmt: skip
        self.assertEqual(predictions.tolist(), EXPECTED_PREDICTION_IDS)

        generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0]

        EXPECTED_GENERATED_TEXT = "The image shows a stop sign sitting on the side of a street in Chinatown, New York City. The street is lined with buildings, trees, and statues, and there are people walking on the footpath. The sky is visible in the background."  # fmt: skip
        self.assertEqual(generated_text, EXPECTED_GENERATED_TEXT)

    @require_deterministic_for_xpu
    def test_base_model_batching_inference_eager(self):
        """Base checkpoint, eager attention, batch of two: check ids, decoded text and parsed boxes."""
        model_name = "florence-community/Florence-2-base"
        processor = AutoProcessor.from_pretrained(model_name)
        model = Florence2ForConditionalGeneration.from_pretrained(model_name, attn_implementation="eager").to(
            torch_device
        )

        images = [self.image1, self.image2]
        prompts = ["", "wheels"]
        inputs = processor(images=images, text=prompts, padding="longest", return_tensors="pt")

        EXPECTED_INPUT_IDS = [
            [processor.image_token_id] * processor.num_image_tokens + [0, 574, 22486, 5, 976, 5327, 11, 5, 2274, 4, 2],
            [processor.image_token_id] * processor.num_image_tokens + [0, 574, 22486, 10562, 11, 5, 2274, 4, 2, 1, 1],
        ]
        self.assertEqual(inputs["input_ids"].tolist(), EXPECTED_INPUT_IDS)

        inputs.to(device=torch_device)
        predictions = model.generate(**inputs, do_sample=False, max_new_tokens=100)

        EXPECTED_PREDICTION_IDS = [
            [2, 0, 50269, 50269, 51267, 50980, 50269, 50269, 50688, 50942, 50269, 50333, 50633, 50941, 51033, 50269, 51267, 50934, 50794, 50814, 51190, 51032, 50432, 50402, 50634, 50692, 50269, 50334, 50340, 50927, 51224, 50417, 51267, 50930, 51075, 50944, 51159, 51028, 50836, 50947, 50915, 51030, 2],
            [2, 0, 28884, 2507, 50413, 50839, 51139, 51047, 28884, 2507, 50980, 50842, 51135, 51043, 28884, 2507, 50417, 50848, 50573, 51043, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        ]  # fmt: skip
        self.assertEqual(predictions.tolist(), EXPECTED_PREDICTION_IDS)

        generated_texts = processor.batch_decode(predictions, skip_special_tokens=False)

        EXPECTED_GENERATED_TEXTS = [
            "",
            "wheelswheelswheels",
        ]
        self.assertEqual(generated_texts, EXPECTED_GENERATED_TEXTS)

        parsed_answer_0 = processor.post_process_generation(
            generated_texts[0], task="", image_size=(images[0].width, images[0].height)
        )
        EXPECTED_PARSED_ANSWER_0 = {
            "": {
                "bboxes": [
                    [0, 0, 1298, 623],
                    [0, 0, 545, 589],
                    [0, 56, 473, 589],
                    [993, 0, 1298, 582],
                    [683, 477, 1197, 668],
                    [212, 116, 475, 370],
                    [0, 57, 92, 576],
                    [1242, 130, 1298, 579],
                    [1048, 591, 1157, 665],
                    [737, 594, 840, 667],
                ],
                "labels": ["", "", "", "", "", "", "", "", "", ""],
            }
        }
        self.assertEqual(parsed_answer_0, EXPECTED_PARSED_ANSWER_0)

        parsed_answer_1 = processor.post_process_generation(
            generated_texts[1], task="", image_size=(images[1].width, images[1].height)
        )
        EXPECTED_PARSED_ANSWER_1 = {"": {"bboxes": [[92, 273, 557, 373], [455, 275, 554, 371], [95, 278, 194, 371]], "bboxes_labels": ["wheels", "wheels", "wheels"], "polygons": [], "polygons_labels": []}}  # fmt: skip
        self.assertEqual(parsed_answer_1, EXPECTED_PARSED_ANSWER_1)

    def test_base_model_inference_sdpa(self):
        """Base checkpoint, SDPA attention, single image: check ids, decoded text and parsed polygon."""
        model_name = "florence-community/Florence-2-base"
        processor = AutoProcessor.from_pretrained(model_name)
        model = Florence2ForConditionalGeneration.from_pretrained(model_name, attn_implementation="sdpa").to(
            torch_device
        )

        prompt = "a car"
        inputs = processor(images=self.image2, text=prompt, return_tensors="pt")
        inputs.to(device=torch_device)

        EXPECTED_INPUT_IDS = [[processor.image_token_id] * processor.num_image_tokens + [0, 574, 22486, 10, 512, 11, 5, 2274, 19, 11445, 2]]  # fmt: skip
        self.assertEqual(inputs["input_ids"].tolist(), EXPECTED_INPUT_IDS)

        predictions = model.generate(**inputs, do_sample=False, max_new_tokens=100)

        EXPECTED_PREDICTION_IDS = [[2, 0, 50548, 50648, 50551, 50648, 50559, 50641, 50562, 50641, 50567, 50637, 50570, 50637, 50575, 50633, 50579, 50631, 50584, 50629, 50589, 50627, 50593, 50624, 50600, 50622, 50606, 50620, 50612, 50618, 50618, 50616, 50625, 50614, 50634, 50612, 50645, 50610, 50659, 50608, 50678, 50606, 50758, 50606, 50783, 50608, 50797, 50610, 50808, 50612, 50816, 50614, 50822, 50616, 50828, 50618, 50835, 50620, 50841, 50622, 50847, 50624, 50853, 50629, 50858, 50635, 50861, 50641, 50864, 50648, 50867, 50654, 50870, 50660, 50872, 50666, 50875, 50670, 50877, 50677, 50880, 50683, 50883, 50689, 50886, 50695, 50889, 50702, 50895, 50710, 50900, 50714, 50905, 50716, 50908, 50720, 50908, 50725, 50911, 50729, 2]]  # fmt: skip
        self.assertEqual(predictions.tolist(), EXPECTED_PREDICTION_IDS)

        generated_text = processor.batch_decode(predictions, skip_special_tokens=False)[0]

        EXPECTED_GENERATED_TEXT = ""  # fmt: skip
        self.assertEqual(generated_text, EXPECTED_GENERATED_TEXT)

        parsed_answer = processor.post_process_generation(
            generated_text,
            task="",
            image_size=(self.image2.width, self.image2.height),
        )
        EXPECTED_PARSED_ANSWER = {'': {'polygons': [[[178, 182, 180, 182, 185, 178, 187, 178, 191, 176, 192, 176, 196, 174, 198, 174, 201, 173, 205, 172, 207, 170, 212, 169, 216, 168, 219, 167, 223, 166, 228, 165, 233, 164, 240, 163, 249, 162, 262, 162, 313, 162, 329, 162, 338, 163, 345, 164, 350, 165, 354, 166, 358, 167, 362, 168, 366, 169, 370, 170, 374, 173, 377, 175, 379, 178, 381, 182, 383, 185, 384, 187, 386, 190, 388, 192, 389, 196, 391, 198, 393, 201, 395, 204, 397, 208, 400, 211, 404, 213, 407, 214, 409, 216, 409, 219, 411, 221]]], 'labels': ['']}}  # fmt: skip
        self.assertEqual(parsed_answer, EXPECTED_PARSED_ANSWER)

    def test_base_model_batching_inference_sdpa(self):
        """Base checkpoint, SDPA attention, batch of two: check ids, decoded text and parsed boxes."""
        model_name = "florence-community/Florence-2-base"
        processor = AutoProcessor.from_pretrained(model_name)
        model = Florence2ForConditionalGeneration.from_pretrained(model_name, attn_implementation="sdpa").to(
            torch_device
        )

        images = [self.image1, self.image2]
        prompts = ["", ""]
        inputs = processor(images=images, text=prompts, padding="longest", return_tensors="pt")

        EXPECTED_INPUT_IDS = [
            [processor.image_token_id] * processor.num_image_tokens + [0, 2264, 16, 5, 2788, 11, 5, 2274, 116, 2, 1, 1, 1],
            [processor.image_token_id] * processor.num_image_tokens + [0, 574, 22486, 5, 8720, 19, 4120, 766, 11, 5, 2274, 4, 2],
        ]  # fmt: skip
        self.assertEqual(inputs["input_ids"].tolist(), EXPECTED_INPUT_IDS)

        inputs.to(device=torch_device)
        predictions = model.generate(**inputs, do_sample=False, max_new_tokens=100)

        EXPECTED_PREDICTION_IDS = [
            [2, 0, 47643, 47240, 6382, 47643, 7405, 495, 211, 2571, 4014, 5733, 36714, 11582, 11582, 36714, 18164, 9357, 36714, 6248, 3602, 37127, 27969, 7471, 44636, 23171, 41907, 27, 16948, 45895, 11582, 45262, 18537, 530, 791, 384, 229, 791, 5733, 565, 3048, 673, 10932, 5733, 565, 11120, 673, 2],
            [2, 0, 5901, 50322, 50602, 51202, 51043, 11219, 3679, 50694, 50772, 50743, 50784, 13630, 50978, 50845, 51134, 51041, 50419, 50853, 50578, 51042, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        ]  # fmt: skip
        self.assertEqual(predictions.tolist(), EXPECTED_PREDICTION_IDS)

        generated_texts = processor.batch_decode(predictions, skip_special_tokens=False)

        EXPECTED_GENERATED_TEXTS = [
            "中文中BBD DATSTOP第福科技有限公司KU O KUOPTUSOyesOPTUSTO",
            "cardoor handlewheel",
        ]  # fmt: skip
        self.assertEqual(generated_texts, EXPECTED_GENERATED_TEXTS)

        parsed_answer = processor.post_process_generation(
            generated_texts[1], task="", image_size=(images[1].width, images[1].height)
        )
        EXPECTED_PARSED_ANSWER = {'': {'bboxes': [[34, 160, 597, 371], [272, 241, 303, 247], [454, 276, 553, 370], [96, 280, 198, 371]], 'labels': ['car', 'door handle', 'wheel', 'wheel']}}  # fmt: skip
        self.assertEqual(parsed_answer, EXPECTED_PARSED_ANSWER)

    @require_deterministic_for_xpu
    def test_large_model_inference_eager(self):
        """Large checkpoint, eager attention, single image: check ids and text per device expectation."""
        model_name = "florence-community/Florence-2-large"
        processor = AutoProcessor.from_pretrained(model_name)
        model = Florence2ForConditionalGeneration.from_pretrained(model_name, attn_implementation="eager").to(
            torch_device
        )

        prompt = ""
        inputs = processor(images=self.image1, text=prompt, return_tensors="pt")
        inputs.to(device=torch_device)

        EXPECTED_INPUT_IDS = [[processor.image_token_id] * processor.num_image_tokens + [0, 47066, 21700, 11, 4617, 99, 16, 2343, 11, 5, 2274, 4, 2]]  # fmt: skip
        self.assertEqual(inputs["input_ids"].tolist(), EXPECTED_INPUT_IDS)

        predictions = model.generate(**inputs, do_sample=False, max_new_tokens=100)

        EXPECTED_PREDICTION_IDS = Expectations(
            {
                (None, None): [[2, 0, 133, 2274, 924, 10, 909, 512, 1428, 159, 10, 2014, 9321, 19, 6764, 3413, 4, 96, 5, 39299, 6, 89, 16, 10, 1275, 912, 1203, 2828, 15, 5, 526, 9, 5, 921, 6, 8, 11, 5, 3618, 6, 89, 32, 1104, 19638, 6, 3980, 6, 8, 10, 699, 2440, 6360, 4, 2]],
                ("xpu", 5): [[2, 0, 133, 2274, 924, 10, 909, 512, 1428, 159, 10, 2014, 9321, 19, 6764, 3413, 4, 96, 5, 39299, 6, 89, 16, 10, 1275, 912, 1203, 2828, 15, 5, 526, 9, 5, 921, 6, 8, 11, 5, 3618, 6, 89, 32, 3980, 6, 82, 3051, 15, 5, 2767, 22609, 6, 8, 41, 9599, 19, 766, 6904, 4, 2]],
            }
        ).get_expectation()  # fmt: skip
        self.assertEqual(predictions.tolist(), EXPECTED_PREDICTION_IDS)

        generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0]

        EXPECTED_GENERATED_TEXT = Expectations(
            {
                (None, None): "The image shows a black car driving down a street lined with tall buildings. In the foreground, there is a red stop sign sitting on the side of the road, and in the background, there are white statues, trees, and a clear blue sky.",
                ("xpu", 5): "The image shows a black car driving down a street lined with tall buildings. In the foreground, there is a red stop sign sitting on the side of the road, and in the background, there are trees, people walking on the footpath, and an arch with name boards.",
            }
        ).get_expectation()  # fmt: skip
        self.assertEqual(generated_text, EXPECTED_GENERATED_TEXT)

    @require_deterministic_for_xpu
    def test_large_model_batching_inference_eager(self):
        """Large checkpoint, eager attention, batch of two: check ids, text and parsed boxes per device."""
        model_name = "florence-community/Florence-2-large"
        processor = AutoProcessor.from_pretrained(model_name)
        model = Florence2ForConditionalGeneration.from_pretrained(model_name, attn_implementation="eager").to(
            torch_device
        )

        images = [self.image1, self.image2]
        prompts = ["", "car"]
        inputs = processor(images=images, text=prompts, padding="longest", return_tensors="pt")

        EXPECTED_INPUT_IDS = [
            [processor.image_token_id] * processor.num_image_tokens + [0, 574, 22486, 5, 976, 5327, 11, 5, 2274, 4, 2],
            [processor.image_token_id] * processor.num_image_tokens + [0, 574, 22486, 512, 11, 5, 2274, 4, 2, 1, 1],
        ]  # fmt: skip
        self.assertEqual(inputs["input_ids"].tolist(), EXPECTED_INPUT_IDS)

        inputs.to(device=torch_device)
        predictions = model.generate(**inputs, max_new_tokens=100)

        EXPECTED_PREDICTION_IDS = Expectations(
            {
                (None, None): [
                    [2, 0, 0, 0, 50269, 50269, 51268, 50944, 50269, 50269, 50631, 50940, 50269, 50269, 50575, 50940, 51032, 50269, 51268, 50932, 50793, 50813, 51190, 51031, 50432, 50401, 50632, 50691, 51071, 50943, 51159, 51027, 50835, 50946, 50915, 51029, 2],
                    [2, 0, 5901, 50321, 50603, 51201, 51043, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                ],
                ("xpu", 5): [
                    [2, 0, 0, 0, 50269, 50269, 51268, 50944, 50269, 50269, 50579, 50940, 51032, 50269, 51268, 50932, 50793, 50813, 51190, 51031, 50432, 50401, 50632, 50691, 51071, 50943, 51159, 51027, 50835, 50946, 50915, 51029, 2],
                    [2, 0, 5901, 50321, 50603, 51201, 51043, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                ],
            }
        ).get_expectation()  # fmt: skip
        self.assertEqual(predictions.tolist(), EXPECTED_PREDICTION_IDS)

        generated_texts = processor.batch_decode(predictions, skip_special_tokens=False)

        EXPECTED_GENERATED_TEXTS = Expectations(
            {
                (None, None): [
                    "",
                    "car",
                ],
                ("xpu", 5): [
                    "",
                    "car",
                ],
            }
        ).get_expectation()  # fmt: skip
        self.assertEqual(generated_texts, EXPECTED_GENERATED_TEXTS)

        parsed_answer_0 = processor.post_process_generation(
            generated_texts[0], task="", image_size=(images[0].width, images[0].height)
        )
        EXPECTED_PARSED_ANSWER_0 = Expectations(
            {
                (None, None): {
                    "": {
                        "bboxes": [
                            [0, 0, 1299, 591],
                            [0, 0, 471, 588],
                            [0, 0, 398, 588],
                            [992, 0, 1299, 581],
                            [681, 476, 1197, 667],
                            [212, 116, 472, 370],
                            [1043, 590, 1157, 664],
                            [736, 593, 840, 666],
                        ],
                        "labels": ["", "", "", "", "", "", "", ""],
                    }
                },
                ("xpu", 5): {
                    "": {
                        "bboxes": [
                            [0, 0, 1299, 591],
                            [0, 0, 403, 588],
                            [992, 0, 1299, 581],
                            [681, 476, 1197, 667],
                            [212, 116, 472, 370],
                            [1043, 590, 1157, 664],
                            [736, 593, 840, 666],
                        ],
                        "labels": ["", "", "", "", "", "", ""],
                    }
                },
            }
        ).get_expectation()
        self.assertEqual(parsed_answer_0, EXPECTED_PARSED_ANSWER_0)

        parsed_answer_1 = processor.post_process_generation(
            generated_texts[1], task="", image_size=(images[1].width, images[1].height)
        )
        EXPECTED_PARSED_ANSWER_1 = {'': {'bboxes': [[33, 160, 596, 371]], 'bboxes_labels': ['car'], 'polygons': [], 'polygons_labels': []}}  # fmt: skip
        self.assertEqual(parsed_answer_1, EXPECTED_PARSED_ANSWER_1)

    @require_deterministic_for_xpu
    def test_large_model_inference_sdpa(self):
        """Large checkpoint, SDPA attention, single image: check ids, decoded text and parsed polygon."""
        model_name = "florence-community/Florence-2-large"
        processor = AutoProcessor.from_pretrained(model_name)
        model = Florence2ForConditionalGeneration.from_pretrained(model_name, attn_implementation="sdpa").to(
            torch_device
        )

        prompt = "a car"
        inputs = processor(images=self.image2, text=prompt, return_tensors="pt")
        inputs.to(device=torch_device)

        EXPECTED_INPUT_IDS = [[processor.image_token_id] * processor.num_image_tokens + [0, 574, 22486, 10, 512, 11, 5, 2274, 19, 11445, 2]]  # fmt: skip
        self.assertEqual(inputs["input_ids"].tolist(), EXPECTED_INPUT_IDS)

        predictions = model.generate(**inputs, max_new_tokens=100)

        EXPECTED_PREDICTION_IDS = [[2, 0, 0, 0, 50548, 50646, 50551, 50644, 50554, 50644, 50562, 50637, 50565, 50637, 50570, 50633, 50573, 50633, 50578, 50629, 50582, 50627, 50587, 50625, 50592, 50623, 50597, 50621, 50603, 50619, 50609, 50616, 50615, 50614, 50622, 50612, 50629, 50610, 50639, 50608, 50651, 50606, 50667, 50604, 50695, 50602, 50750, 50602, 50778, 50604, 50793, 50606, 50805, 50608, 50812, 50610, 50818, 50612, 50825, 50614, 50831, 50616, 50837, 50619, 50844, 50621, 50848, 50623, 50854, 50627, 50857, 50631, 50861, 50637, 50864, 50644, 50867, 50650, 50870, 50656, 50873, 50662, 50875, 50668, 50878, 50673, 50879, 50679, 50883, 50685, 50886, 50691, 50889, 50698, 50892, 50704, 50898, 50712, 50903, 50714, 2]]  # fmt: skip
        self.assertEqual(predictions.tolist(), EXPECTED_PREDICTION_IDS)

        generated_text = processor.batch_decode(predictions, skip_special_tokens=False)[0]

        EXPECTED_GENERATED_TEXT = ""  # fmt: skip
        self.assertEqual(generated_text, EXPECTED_GENERATED_TEXT)

        parsed_answer = processor.post_process_generation(
            generated_text,
            task="",
            image_size=(self.image2.width, self.image2.height),
        )
        EXPECTED_PARSED_ANSWER = {'': {'polygons': [[[178, 181, 180, 180, 182, 180, 187, 176, 189, 176, 192, 174, 194, 174, 198, 173, 200, 172, 203, 171, 207, 170, 210, 169, 214, 168, 217, 166, 221, 165, 226, 164, 230, 163, 237, 162, 244, 162, 255, 161, 272, 160, 308, 160, 326, 161, 335, 162, 343, 162, 347, 163, 351, 164, 356, 165, 360, 166, 363, 168, 368, 169, 370, 170, 374, 172, 376, 174, 379, 176, 381, 180, 383, 183, 384, 186, 386, 188, 388, 191, 390, 194, 390, 197, 393, 199, 395, 202, 397, 206, 399, 209, 402, 212, 406, 213]]], 'labels': ['']}}  # fmt: skip
        self.assertEqual(parsed_answer, EXPECTED_PARSED_ANSWER)

    def test_large_model_batching_inference_sdpa(self):
        """Large checkpoint, SDPA attention, batch of two: check ids, decoded text and parsed quad boxes."""
        model_name = "florence-community/Florence-2-large"
        processor = AutoProcessor.from_pretrained(model_name)
        model = Florence2ForConditionalGeneration.from_pretrained(model_name, attn_implementation="sdpa").to(
            torch_device
        )

        images = [self.image1, self.image2]
        prompts = ["", ""]
        inputs = processor(images=images, text=prompts, padding="longest", return_tensors="pt")

        EXPECTED_INPUT_IDS = [
            [processor.image_token_id] * processor.num_image_tokens + [0, 2264, 16, 5, 2788, 11, 5, 2274, 6, 19, 3806, 116, 2],
            [processor.image_token_id] * processor.num_image_tokens + [0, 2264, 473, 5, 2274, 6190, 116, 2, 1, 1, 1, 1, 1],
        ]  # fmt: skip
        self.assertEqual(inputs["input_ids"].tolist(), EXPECTED_INPUT_IDS)

        inputs.to(device=torch_device)
        predictions = model.generate(**inputs, max_new_tokens=100)

        EXPECTED_PREDICTION_IDS = [
            [2, 0, 0, 0, 47643, 47240, 7487, 47643, 50802, 50337, 50922, 50337, 50922, 50397, 50802, 50397, 4652, 50270, 50372, 50288, 50372, 50288, 50394, 50270, 50394, 495, 2571, 50401, 50455, 50446, 50457, 50446, 50483, 50401, 50482, 4014, 5733, 50446, 50495, 50614, 50493, 50614, 50596, 50446, 50600, 530, 791, 673, 51230, 50640, 51261, 50640, 51261, 50666, 51230, 50666, 5733, 565, 3048, 50389, 50683, 50461, 50684, 50461, 50719, 50389, 50717, 7111, 230, 5061, 33893, 50707, 50668, 50755, 50668, 50755, 50682, 50707, 50682, 10932, 50290, 50708, 50333, 50706, 50334, 50751, 50290, 50753, 4652, 51128, 50704, 51149, 50704, 51149, 50729, 51128, 50729, 2],
            [2, 0, 102, 2272, 512, 9181, 11, 760, 9, 10, 5718, 745, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        ]  # fmt: skip
        self.assertEqual(predictions.tolist(), EXPECTED_PREDICTION_IDS)

        generated_texts = processor.batch_decode(predictions, skip_special_tokens=False)

        EXPECTED_GENERATED_TEXTS = [
            "中新中88DATSTOPKUOOPTUSOD COUKTyes88",
            "a green car parked in front of a yellow building",
        ]  # fmt: skip
        self.assertEqual(generated_texts, EXPECTED_GENERATED_TEXTS)

        parsed_answer = processor.post_process_generation(
            generated_texts[0], task="", image_size=(images[0].width, images[0].height)
        )
        EXPECTED_PARSED_ANSWER = {'': {'quad_boxes': [[693, 60, 849, 60, 849, 112, 693, 112], [1, 90, 25, 90, 25, 109, 1, 109], [172, 163, 230, 165, 230, 187, 172, 187], [230, 198, 449, 196, 449, 286, 230, 290], [1249, 325, 1290, 325, 1290, 348, 1249, 348], [156, 363, 250, 363, 250, 394, 156, 392], [570, 349, 632, 349, 632, 362, 570, 362], [27, 385, 83, 383, 85, 422, 27, 424], [1117, 381, 1144, 381, 1144, 403, 1117, 403]], 'labels': ['中新中', '88', 'DAT', 'STOP', 'KUO', 'OPTUS', 'OD COUKT', 'yes', '88']}}  # fmt: skip
        self.assertEqual(parsed_answer, EXPECTED_PARSED_ANSWER)