Files
transformers/tests/models/florence2/test_processing_florence2.py
陈赣 06f1fd69a6
Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled
first commit
2026-06-05 16:53:03 +08:00

236 lines
11 KiB
Python

# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers import Florence2Processor
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available
from ...test_processing_common import ProcessorTesterMixin
if is_torch_available():
import torch
@require_torch
@require_vision
class Florence2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    """Tests for `Florence2Processor`: prompt construction, coordinate quantization and output parsing."""

    processor_class = Florence2Processor

    @classmethod
    def _setup_image_processor(cls):
        """Load the image processor from the reference checkpoint and set its `image_seq_length` to 0."""
        component_cls = cls._get_component_class_from_processor("image_processor")
        image_processor = component_cls.from_pretrained("florence-community/Florence-2-base")
        image_processor.image_seq_length = 0
        return image_processor

    @classmethod
    def _setup_tokenizer(cls):
        """Load the tokenizer from the reference checkpoint and attach the `<image>` token and its id."""
        component_cls = cls._get_component_class_from_processor("tokenizer")
        tokenizer = component_cls.from_pretrained("florence-community/Florence-2-base")
        tokenizer.image_token = "<image>"
        # The id is taken as the first id produced when encoding the token without special tokens.
        image_token_ids = tokenizer.encode(tokenizer.image_token, add_special_tokens=False)
        tokenizer.image_token_id = image_token_ids[0]
        return tokenizer

    @unittest.skip("Florence2Processor adds prefix and suffix tokens to the text")
    def test_tokenizer_defaults(self):
        pass

    @staticmethod
    def prepare_processor_dict():
        """Return the processor kwargs used by these tests: a `post_processor_config` with one entry per parser."""
        ocr_config = {
            "pattern": r"(.+?)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>",
            "area_threshold": 0.0,
        }
        phrase_grounding_config = {"banned_grounding_tokens": ["the image"]}
        post_processor_config = {
            "ocr": ocr_config,
            "phrase_grounding": phrase_grounding_config,
            "pure_text": {},
            "description_with_bboxes": {},
            "description_with_polygons": {},
            "polygons": {},
            "bboxes": {},
            "description_with_bboxes_or_polygons": {},
        }
        return {"post_processor_config": post_processor_config}

    def test_construct_prompts(self):
        """Check how `_construct_prompts` handles plain text, bare task tokens, task + input, and invalid input."""
        processor = self.processor_class.from_pretrained(self.tmpdirname)

        # A single string without any task token comes back unchanged, wrapped in a list.
        plain_text = "This is a simple text."
        plain_prompts = processor._construct_prompts(plain_text)
        self.assertEqual(plain_prompts, [plain_text])

        # Task tokens that take no extra input are replaced by their full prompt.
        task_only_texts = ["<OCR>", "<CAPTION>"]
        task_only_prompts = processor._construct_prompts(task_only_texts)
        expected_task_only_prompts = ["What is the text in the image?", "What does the image describe?"]
        self.assertEqual(task_only_prompts, expected_task_only_prompts)

        # A task token followed by its input is expanded with the input embedded in the prompt.
        task_with_input_texts = ["<CAPTION_TO_PHRASE_GROUNDING> a red car"]
        task_with_input_prompts = processor._construct_prompts(task_with_input_texts)
        expected_task_with_input_prompts = ["Locate the phrases in the caption: a red car"]
        self.assertEqual(task_with_input_prompts, expected_task_with_input_prompts)

        # "<OCR>" followed by extra text must be rejected.
        with self.assertRaises(ValueError):
            processor._construct_prompts("<OCR> extra text")

    def test_quantizer_quantize_dequantize(self):
        """Round-trip boxes and points through `quantize`/`dequantize` and check the invalid-shape error."""
        processor = self.processor_class.from_pretrained(self.tmpdirname)
        post_processor = processor.post_processor
        size = (800, 1200)

        # Boxes: the last box exceeds `size`; the expected round-trip pins it to 799 / 1199.
        input_boxes = torch.tensor(
            [[0, 0, 30, 40], [500, 550, 600, 690], [750, 1121, 851, 1239]], dtype=torch.int32
        )
        roundtrip_boxes = post_processor.dequantize(post_processor.quantize(input_boxes, size), size)
        expected_boxes = torch.tensor(
            [[0, 0, 30, 40], [500, 550, 600, 690], [750, 1121, 799, 1199]], dtype=torch.int32
        )
        self.assertTrue(torch.allclose(roundtrip_boxes, expected_boxes))

        # Points: same round-trip, with the last point lying outside `size`.
        input_points = torch.tensor([[0, 0], [300, 400], [850, 1250]], dtype=torch.int32)
        roundtrip_points = post_processor.dequantize(post_processor.quantize(input_points, size), size)
        expected_points = torch.tensor([[0, 0], [300, 400], [799, 1199]], dtype=torch.int32)
        self.assertTrue(torch.allclose(roundtrip_points, expected_points))

        # A tensor whose last dimension is 3 must be rejected.
        with self.assertRaises(ValueError):
            post_processor.quantize(torch.tensor([[1, 2, 3]]), size)

    def test_post_process_parse_description_with_bboxes_from_text_and_spans(self):
        """Parse `<loc_*>` box sequences with and without a leading phrase."""
        processor = self.processor_class.from_pretrained(self.tmpdirname)
        post_processor = processor.post_processor
        image_size = (1000, 1000)

        # 13 location tokens and no phrases: three boxes are expected, the trailing <loc_577> yields nothing.
        unlabeled_text = "</s><s><loc_53><loc_334><loc_933><loc_775><loc_711><loc_203><loc_906><loc_546><loc_585><loc_309><loc_774><loc_709><loc_577></s><pad>"
        unlabeled_result = post_processor.parse_description_with_bboxes_from_text_and_spans(
            unlabeled_text, image_size=image_size, allow_empty_phrase=True
        )
        expected_unlabeled = [
            {"bbox": [53, 334, 933, 775], "cat_name": ""},
            {"bbox": [711, 203, 906, 546], "cat_name": ""},
            {"bbox": [585, 309, 774, 709], "cat_name": ""},
        ]
        self.assertEqual(unlabeled_result, expected_unlabeled)

        # Each phrase is followed by exactly one box.
        labeled_text = (
            "</s><s>car<loc_53><loc_334><loc_933><loc_775>door handle<loc_425><loc_504><loc_474><loc_516></s><pad>"
        )
        labeled_result = post_processor.parse_description_with_bboxes_from_text_and_spans(
            labeled_text, image_size=image_size, allow_empty_phrase=False
        )
        expected_labeled = [
            {"bbox": [53, 334, 933, 775], "cat_name": "car"},
            {"bbox": [425, 504, 474, 516], "cat_name": "door handle"},
        ]
        self.assertEqual(labeled_result, expected_labeled)

    def test_post_process_parse_description_with_polygons_from_text_and_spans(self):
        """Parse `<loc_*>` polygon sequences with and without a leading phrase."""
        processor = self.processor_class.from_pretrained(self.tmpdirname)
        post_processor = processor.post_processor
        image_size = (1000, 1000)

        # Polygon without any phrase: reported with an empty category name.
        unlabeled_text = "<loc_279><loc_379><loc_282><loc_379><loc_290><loc_373><loc_293><loc_373><loc_298><loc_369><loc_301><loc_369>"
        unlabeled_result = post_processor.parse_description_with_polygons_from_text_and_spans(
            unlabeled_text, image_size=image_size, allow_empty_phrase=True
        )
        expected_unlabeled = [
            {
                "cat_name": "",
                "polygons": [[279, 379, 282, 379, 290, 373, 293, 373, 298, 369, 301, 369]],
            }
        ]
        self.assertEqual(unlabeled_result, expected_unlabeled)

        # Polygon preceded by the phrase "Hello".
        labeled_text = (
            "Hello<loc_769><loc_248><loc_771><loc_234><loc_773><loc_206><loc_773><loc_198><loc_771><loc_193>"
        )
        labeled_result = post_processor.parse_description_with_polygons_from_text_and_spans(
            labeled_text, image_size=image_size, allow_empty_phrase=False
        )
        expected_labeled = [
            {
                "cat_name": "Hello",
                "polygons": [[769, 248, 771, 234, 773, 206, 773, 198, 771, 193]],
            }
        ]
        self.assertEqual(labeled_result, expected_labeled)

    def test_post_process_parse_ocr_from_text_and_spans(self):
        """Parse OCR output into text + quad boxes, and check that `area_threshold` filters a tiny region."""
        processor = self.processor_class.from_pretrained(self.tmpdirname)
        post_processor = processor.post_processor
        image_size = (1000, 1000)

        # Two words, each followed by eight location tokens (one quad box).
        ocr_text = "</s><s>Hello<loc_100><loc_100><loc_200><loc_100><loc_200><loc_200><loc_100><loc_200>World<loc_300><loc_300><loc_400><loc_300><loc_400><loc_400><loc_300><loc_400></s>"
        ocr_result = post_processor.parse_ocr_from_text_and_spans(
            ocr_text, pattern=None, image_size=image_size, area_threshold=0.0
        )
        expected_ocr = [
            {"quad_box": [100, 100, 200, 100, 200, 200, 100, 200], "text": "Hello"},
            {"quad_box": [300, 300, 400, 300, 400, 400, 300, 400], "text": "World"},
        ]
        self.assertEqual(ocr_result, expected_ocr)

        # With a non-zero area threshold, the tiny region is expected to be dropped entirely.
        tiny_region_text = "Small<loc_1><loc_1><loc_2><loc_2><loc_2><loc_2><loc_1><loc_1>"
        tiny_region_result = post_processor.parse_ocr_from_text_and_spans(
            tiny_region_text, pattern=None, image_size=image_size, area_threshold=0.01
        )
        expected_tiny_region = []
        self.assertEqual(tiny_region_result, expected_tiny_region)

    def test_post_process_parse_phrase_grounding_from_text_and_spans(self):
        """Parse phrase-grounding output, including a phrase listed in `banned_grounding_tokens`."""
        processor = self.processor_class.from_pretrained(self.tmpdirname)
        post_processor = processor.post_processor
        image_size = (1000, 1000)

        # "red car" is followed by two boxes and "sky" by one; boxes are grouped per phrase.
        grounding_text = "</s><s>red car<loc_53><loc_334><loc_933><loc_775><loc_711><loc_203><loc_906><loc_546>sky<loc_0><loc_0><loc_1000><loc_300></s>"
        grounding_result = post_processor.parse_phrase_grounding_from_text_and_spans(
            grounding_text, image_size=image_size
        )
        expected_grounding = [
            {"bbox": [[53, 334, 933, 775], [711, 203, 906, 546]], "cat_name": "red car"},
            {"bbox": [[0, 0, 1000, 300]], "cat_name": "sky"},
        ]
        self.assertEqual(grounding_result, expected_grounding)

        # "the image" is configured as a banned grounding token, so no instance is expected.
        banned_text = "the image<loc_100><loc_100><loc_200><loc_200>"
        banned_result = post_processor.parse_phrase_grounding_from_text_and_spans(banned_text, image_size=image_size)
        expected_banned = []
        self.assertEqual(banned_result, expected_banned)

    def test_post_process_generation(self):
        """Check `post_process_generation` end to end for a caption, a detection and an OCR-with-region task."""
        processor = self.processor_class.from_pretrained(self.tmpdirname)

        # <CAPTION>: the expected result is the text without the <s> / </s> markers.
        caption_text = "<s>Hello world</s>"
        caption_result = processor.post_process_generation(text=caption_text, task="<CAPTION>", image_size=None)
        expected_caption = {"<CAPTION>": "Hello world"}
        self.assertEqual(caption_result, expected_caption)

        # <OD>: the result holds parallel "bboxes" and "labels" lists.
        detection_text = "car<loc_53><loc_334><loc_933><loc_775>"
        detection_result = processor.post_process_generation(
            text=detection_text, task="<OD>", image_size=(1000, 1000)
        )
        expected_detection = {"<OD>": {"bboxes": [[53, 334, 933, 775]], "labels": ["car"]}}
        self.assertEqual(detection_result, expected_detection)

        # <OCR_WITH_REGION>: the result holds parallel "quad_boxes" and "labels" lists.
        region_text = "Hello<loc_100><loc_100><loc_200><loc_100><loc_200><loc_200><loc_100><loc_200>"
        region_result = processor.post_process_generation(
            text=region_text, task="<OCR_WITH_REGION>", image_size=(1000, 1000)
        )
        expected_region = {
            "<OCR_WITH_REGION>": {"quad_boxes": [[100, 100, 200, 100, 200, 200, 100, 200]], "labels": ["Hello"]}
        }
        self.assertEqual(region_result, expected_region)