# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

from transformers import Florence2Processor
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available

from ...test_processing_common import ProcessorTesterMixin


if is_torch_available():
    import torch


@require_torch
@require_vision
class Florence2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    """Unit tests for `Florence2Processor`: prompt construction, coordinate
    quantization and the post-processing parsers.

    NOTE(review): several string literals below are empty or look truncated
    (`image_token`, the task strings given to `_construct_prompts` and
    `post_process_generation`, the OCR `pattern`, and the generated texts fed
    to the parsers). They may have lost angle-bracket tokens somewhere along
    the way -- confirm against the upstream file before relying on them.
    """

    processor_class = Florence2Processor

    @classmethod
    def _setup_image_processor(cls):
        """Load the image processor from the Florence-2 base checkpoint and set its `image_seq_length` to 0."""
        component_cls = cls._get_component_class_from_processor("image_processor")
        loaded = component_cls.from_pretrained("florence-community/Florence-2-base")
        loaded.image_seq_length = 0
        return loaded

    @classmethod
    def _setup_tokenizer(cls):
        """Load the tokenizer from the Florence-2 base checkpoint and attach `image_token` / `image_token_id`."""
        component_cls = cls._get_component_class_from_processor("tokenizer")
        loaded = component_cls.from_pretrained("florence-community/Florence-2-base")
        loaded.image_token = ""
        # The id is the first token produced when encoding the image token on its own.
        token_ids = loaded.encode(loaded.image_token, add_special_tokens=False)
        loaded.image_token_id = token_ids[0]
        return loaded

    @unittest.skip("Florence2Processor adds prefix and suffix tokens to the text")
    def test_tokenizer_defaults(self):
        # Intentionally empty: the test is skipped for the reason given in the decorator.
        pass

    @staticmethod
    def prepare_processor_dict():
        """Return the keyword arguments used to build the processor under test."""
        task_configs = {
            "ocr": {
                "pattern": r"(.+?)",
                "area_threshold": 0.0,
            },
            "phrase_grounding": {"banned_grounding_tokens": ["the image"]},
            "pure_text": {},
            "description_with_bboxes": {},
            "description_with_polygons": {},
            "polygons": {},
            "bboxes": {},
            "description_with_bboxes_or_polygons": {},
        }
        return {"post_processor_config": task_configs}

    def test_construct_prompts(self):
        # Exercises `_construct_prompts` on a plain string, a list of task strings,
        # a task string carrying extra input, and an input expected to be rejected.
        processor = self.processor_class.from_pretrained(self.tmpdirname)
        build_prompts = processor._construct_prompts

        # A single free-form text comes back unchanged, wrapped in a list.
        plain_text = "This is a simple text."
        self.assertEqual(build_prompts(plain_text), [plain_text])

        # A list of task-only inputs is mapped to the corresponding questions.
        expected_without_input = ["What is the text in the image?", "What does the image describe?"]
        self.assertEqual(build_prompts(["", ""]), expected_without_input)

        # A task followed by its input is expanded into a full instruction.
        expected_with_input = ["Locate the phrases in the caption: a red car"]
        self.assertEqual(build_prompts([" a red car"]), expected_with_input)

        # A task token that is not allowed to carry extra text must raise.
        with self.assertRaises(ValueError):
            processor._construct_prompts(" extra text")

    def test_quantizer_quantize_dequantize(self):
        # Round-trips boxes and points through quantize/dequantize and checks the
        # resulting coordinates, then checks that a 3-column tensor is rejected.
        processor = self.processor_class.from_pretrained(self.tmpdirname)
        post_processor = processor.post_processor
        size = (800, 1200)

        # Bounding boxes: the last box holds the largest coordinates and is the
        # only one whose expected round-trip values differ from the input.
        input_boxes = torch.tensor(
            [[0, 0, 30, 40], [500, 550, 600, 690], [750, 1121, 851, 1239]], dtype=torch.int32
        )
        roundtrip_boxes = post_processor.dequantize(post_processor.quantize(input_boxes, size), size)
        expected_boxes = torch.tensor(
            [[0, 0, 30, 40], [500, 550, 600, 690], [750, 1121, 799, 1199]], dtype=torch.int32
        )
        self.assertTrue(torch.allclose(roundtrip_boxes, expected_boxes))

        # Points: same round trip with two coordinates per row.
        input_points = torch.tensor([[0, 0], [300, 400], [850, 1250]], dtype=torch.int32)
        roundtrip_points = post_processor.dequantize(post_processor.quantize(input_points, size), size)
        expected_points = torch.tensor([[0, 0], [300, 400], [799, 1199]], dtype=torch.int32)
        self.assertTrue(torch.allclose(roundtrip_points, expected_points))

        # A last dimension of 3 is neither a point nor a box.
        with self.assertRaises(ValueError):
            processor.post_processor.quantize(torch.tensor([[1, 2, 3]]), size)

    def test_post_process_parse_description_with_bboxes_from_text_and_spans(self):
        # Checks the bbox-description parser with and without phrases in the text.
        processor = self.processor_class.from_pretrained(self.tmpdirname)
        parse = processor.post_processor.parse_description_with_bboxes_from_text_and_spans

        # Boxes without any phrase: empty category names are allowed explicitly.
        image_size = (1000, 1000)
        boxes_only = parse("", image_size=image_size, allow_empty_phrase=True)
        expected_boxes_only = [
            {"bbox": [53, 334, 933, 775], "cat_name": ""},
            {"bbox": [711, 203, 906, 546], "cat_name": ""},
            {"bbox": [585, 309, 774, 709], "cat_name": ""},
        ]
        self.assertEqual(boxes_only, expected_boxes_only)

        # Boxes preceded by a phrase: each box is labelled with its phrase.
        image_size = (1000, 1000)
        labelled = parse("cardoor handle", image_size=image_size, allow_empty_phrase=False)
        expected_labelled = [
            {"bbox": [53, 334, 933, 775], "cat_name": "car"},
            {"bbox": [425, 504, 474, 516], "cat_name": "door handle"},
        ]
        self.assertEqual(labelled, expected_labelled)

    def test_post_process_parse_description_with_polygons_from_text_and_spans(self):
        # Checks the polygon-description parser with and without phrases in the text.
        processor = self.processor_class.from_pretrained(self.tmpdirname)
        parse = processor.post_processor.parse_description_with_polygons_from_text_and_spans

        # Polygon without any phrase: empty category names are allowed explicitly.
        image_size = (1000, 1000)
        polygons_only = parse("", image_size=image_size, allow_empty_phrase=True)
        expected_polygons_only = [
            {
                "cat_name": "",
                "polygons": [[279, 379, 282, 379, 290, 373, 293, 373, 298, 369, 301, 369]],
            }
        ]
        self.assertEqual(polygons_only, expected_polygons_only)

        # Polygon preceded by a phrase.
        image_size = (1000, 1000)
        labelled = parse("Hello", image_size=image_size, allow_empty_phrase=False)
        expected_labelled = [
            {
                "cat_name": "Hello",
                "polygons": [[769, 248, 771, 234, 773, 206, 773, 198, 771, 193]],
            }
        ]
        self.assertEqual(labelled, expected_labelled)

    def test_post_process_parse_ocr_from_text_and_spans(self):
        # Checks the OCR parser, including the area-threshold filter.
        processor = self.processor_class.from_pretrained(self.tmpdirname)
        parse_ocr = processor.post_processor.parse_ocr_from_text_and_spans
        image_size = (1000, 1000)

        # Two text regions, no area filtering.
        regions = parse_ocr("HelloWorld", pattern=None, image_size=image_size, area_threshold=0.0)
        expected_regions = [
            {"quad_box": [100, 100, 200, 100, 200, 200, 100, 200], "text": "Hello"},
            {"quad_box": [300, 300, 400, 300, 400, 400, 300, 400], "text": "World"},
        ]
        self.assertEqual(regions, expected_regions)

        # With a non-zero area threshold the small region is expected to be dropped.
        filtered = parse_ocr("Small", pattern=None, image_size=image_size, area_threshold=0.01)
        expected_filtered = []
        self.assertEqual(filtered, expected_filtered)

    def test_post_process_parse_phrase_grounding_from_text_and_spans(self):
        # Checks the phrase-grounding parser, including banned phrases.
        processor = self.processor_class.from_pretrained(self.tmpdirname)
        parse_grounding = processor.post_processor.parse_phrase_grounding_from_text_and_spans
        image_size = (1000, 1000)

        # Each phrase collects every box that follows it.
        grounded = parse_grounding("red carsky", image_size=image_size)
        expected_grounded = [
            {"bbox": [[53, 334, 933, 775], [711, 203, 906, 546]], "cat_name": "red car"},
            {"bbox": [[0, 0, 1000, 300]], "cat_name": "sky"},
        ]
        self.assertEqual(grounded, expected_grounded)

        # "the image" is listed in `banned_grounding_tokens` in the processor config,
        # so nothing is expected back for it.
        banned = parse_grounding("the image", image_size=image_size)
        expected_banned = []
        self.assertEqual(banned, expected_banned)

    def test_post_process_generation(self):
        # Checks `post_process_generation` for pure text, bounding boxes and OCR output.
        processor = self.processor_class.from_pretrained(self.tmpdirname)

        # Pure-text task: the text is returned as-is under the task key.
        caption = processor.post_process_generation(text="Hello world", task="", image_size=None)
        expected_caption = {"": "Hello world"}
        self.assertEqual(caption, expected_caption)

        # Description-with-bboxes task: boxes and labels are returned as parallel lists.
        detections = processor.post_process_generation(text="car", task="", image_size=(1000, 1000))
        expected_detections = {"": {"bboxes": [[53, 334, 933, 775]], "labels": ["car"]}}
        self.assertEqual(detections, expected_detections)

        # OCR task: quad boxes and labels are returned as parallel lists.
        ocr = processor.post_process_generation(text="Hello", task="", image_size=(1000, 1000))
        expected_ocr = {
            "": {"quad_boxes": [[100, 100, 200, 100, 200, 200, 100, 200]], "labels": ["Hello"]}
        }
        self.assertEqual(ocr, expected_ocr)