Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled
236 lines
11 KiB
Python
236 lines
11 KiB
Python
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
|
|
|
|
from transformers import Florence2Processor
|
|
from transformers.testing_utils import require_torch, require_vision
|
|
from transformers.utils import is_torch_available
|
|
|
|
from ...test_processing_common import ProcessorTesterMixin
|
|
|
|
|
|
if is_torch_available():
|
|
import torch
|
|
|
|
|
|
@require_torch
@require_vision
class Florence2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    """Tests for `Florence2Processor`.

    Covers prompt construction from task tokens, the quantize/dequantize
    round trip of the post-processor, and parsing of generated text into
    boxes, polygons, OCR quads and grounded phrases.
    """

    processor_class = Florence2Processor

    @classmethod
    def _setup_image_processor(cls):
        """Load the pretrained image processor and set its `image_seq_length` to 0."""
        component_cls = cls._get_component_class_from_processor("image_processor")
        img_proc = component_cls.from_pretrained("florence-community/Florence-2-base")
        img_proc.image_seq_length = 0
        return img_proc

    @classmethod
    def _setup_tokenizer(cls):
        """Load the pretrained tokenizer and attach `image_token` / `image_token_id` to it."""
        component_cls = cls._get_component_class_from_processor("tokenizer")
        tok = component_cls.from_pretrained("florence-community/Florence-2-base")
        tok.image_token = "<image>"
        # The id is taken from the first piece produced when encoding the token on its own.
        image_token_pieces = tok.encode(tok.image_token, add_special_tokens=False)
        tok.image_token_id = image_token_pieces[0]
        return tok

    @unittest.skip("Florence2Processor adds prefix and suffix tokens to the text")
    def test_tokenizer_defaults(self):
        """Skipped: see the reason given in the `unittest.skip` decorator."""

    @staticmethod
    def prepare_processor_dict():
        """Return a dict holding the `post_processor_config` used for these tests.

        The config has one entry per post-processing task; only `ocr` and
        `phrase_grounding` carry options, the remaining tasks map to empty dicts.
        """
        ocr_options = {
            # A text span followed by eight `<loc_N>` tokens.
            "pattern": r"(.+?)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>",
            "area_threshold": 0.0,
        }
        task_configs = {
            "ocr": ocr_options,
            "phrase_grounding": {"banned_grounding_tokens": ["the image"]},
            "pure_text": {},
            "description_with_bboxes": {},
            "description_with_polygons": {},
            "polygons": {},
            "bboxes": {},
            "description_with_bboxes_or_polygons": {},
        }
        return {"post_processor_config": task_configs}

    def test_construct_prompts(self):
        """Check `_construct_prompts` on plain text, bare task tokens, a task with input, and an invalid prompt."""
        proc = self.processor_class.from_pretrained(self.tmpdirname)

        # A single text without any task token comes back unchanged, wrapped in a list.
        plain_text = "This is a simple text."
        self.assertEqual(proc._construct_prompts(plain_text), [plain_text])

        # A list of task tokens that carry no additional input.
        self.assertEqual(
            proc._construct_prompts(["<OCR>", "<CAPTION>"]),
            ["What is the text in the image?", "What does the image describe?"],
        )

        # A task token followed by its input text.
        self.assertEqual(
            proc._construct_prompts(["<CAPTION_TO_PHRASE_GROUNDING> a red car"]),
            ["Locate the phrases in the caption: a red car"],
        )

        # `<OCR>` followed by extra text is expected to be rejected.
        self.assertRaises(ValueError, proc._construct_prompts, "<OCR> extra text")

    def test_quantizer_quantize_dequantize(self):
        """Round-trip boxes and points through `quantize` / `dequantize` and check an invalid shape raises."""
        proc = self.processor_class.from_pretrained(self.tmpdirname)
        post = proc.post_processor
        size = (800, 1200)

        # Bounding boxes: the last box exceeds `size` and is expected back as 799 / 1199.
        input_boxes = torch.tensor(
            [[0, 0, 30, 40], [500, 550, 600, 690], [750, 1121, 851, 1239]], dtype=torch.int32
        )
        roundtrip_boxes = post.dequantize(post.quantize(input_boxes, size), size)
        expected_boxes = torch.tensor(
            [[0, 0, 30, 40], [500, 550, 600, 690], [750, 1121, 799, 1199]], dtype=torch.int32
        )
        self.assertTrue(torch.allclose(roundtrip_boxes, expected_boxes))

        # Points: same round trip, with the last point lying outside `size`.
        input_points = torch.tensor([[0, 0], [300, 400], [850, 1250]], dtype=torch.int32)
        roundtrip_points = post.dequantize(post.quantize(input_points, size), size)
        expected_points = torch.tensor([[0, 0], [300, 400], [799, 1199]], dtype=torch.int32)
        self.assertTrue(torch.allclose(roundtrip_points, expected_points))

        # A last dimension of 3 is neither a point nor a box.
        with self.assertRaises(ValueError):
            post.quantize(torch.tensor([[1, 2, 3]]), size)

    def test_post_process_parse_description_with_bboxes_from_text_and_spans(self):
        """Parse box descriptions both without phrases (empty `cat_name`) and with phrases."""
        proc = self.processor_class.from_pretrained(self.tmpdirname)
        parse = proc.post_processor.parse_description_with_bboxes_from_text_and_spans
        image_size = (1000, 1000)

        # Only location tokens: thirteen of them, while the expected output holds three boxes.
        locs_only = "</s><s><loc_53><loc_334><loc_933><loc_775><loc_711><loc_203><loc_906><loc_546><loc_585><loc_309><loc_774><loc_709><loc_577></s><pad>"
        self.assertEqual(
            parse(locs_only, image_size=image_size, allow_empty_phrase=True),
            [
                {"bbox": [53, 334, 933, 775], "cat_name": ""},
                {"bbox": [711, 203, 906, 546], "cat_name": ""},
                {"bbox": [585, 309, 774, 709], "cat_name": ""},
            ],
        )

        # Each phrase is followed by the four location tokens of its box.
        phrases_and_locs = (
            "</s><s>car<loc_53><loc_334><loc_933><loc_775>door handle<loc_425><loc_504><loc_474><loc_516></s><pad>"
        )
        self.assertEqual(
            parse(phrases_and_locs, image_size=image_size, allow_empty_phrase=False),
            [
                {"bbox": [53, 334, 933, 775], "cat_name": "car"},
                {"bbox": [425, 504, 474, 516], "cat_name": "door handle"},
            ],
        )

    def test_post_process_parse_description_with_polygons_from_text_and_spans(self):
        """Parse polygon descriptions both without a phrase and with a leading phrase."""
        proc = self.processor_class.from_pretrained(self.tmpdirname)
        parse = proc.post_processor.parse_description_with_polygons_from_text_and_spans
        image_size = (1000, 1000)

        # Location tokens only: one polygon with an empty category name.
        locs_only = "<loc_279><loc_379><loc_282><loc_379><loc_290><loc_373><loc_293><loc_373><loc_298><loc_369><loc_301><loc_369>"
        self.assertEqual(
            parse(locs_only, image_size=image_size, allow_empty_phrase=True),
            [
                {
                    "cat_name": "",
                    "polygons": [[279, 379, 282, 379, 290, 373, 293, 373, 298, 369, 301, 369]],
                }
            ],
        )

        # A phrase followed by the polygon's location tokens.
        phrase_and_locs = (
            "Hello<loc_769><loc_248><loc_771><loc_234><loc_773><loc_206><loc_773><loc_198><loc_771><loc_193>"
        )
        self.assertEqual(
            parse(phrase_and_locs, image_size=image_size, allow_empty_phrase=False),
            [
                {
                    "cat_name": "Hello",
                    "polygons": [[769, 248, 771, 234, 773, 206, 773, 198, 771, 193]],
                }
            ],
        )

    def test_post_process_parse_ocr_from_text_and_spans(self):
        """Parse OCR quad boxes and check that a tiny region is dropped by the area threshold."""
        proc = self.processor_class.from_pretrained(self.tmpdirname)
        parse = proc.post_processor.parse_ocr_from_text_and_spans
        image_size = (1000, 1000)

        # Two words, each followed by the eight location tokens of its quad box.
        two_words = "</s><s>Hello<loc_100><loc_100><loc_200><loc_100><loc_200><loc_200><loc_100><loc_200>World<loc_300><loc_300><loc_400><loc_300><loc_400><loc_400><loc_300><loc_400></s>"
        self.assertEqual(
            parse(two_words, pattern=None, image_size=image_size, area_threshold=0.0),
            [
                {"quad_box": [100, 100, 200, 100, 200, 200, 100, 200], "text": "Hello"},
                {"quad_box": [300, 300, 400, 300, 400, 400, 300, 400], "text": "World"},
            ],
        )

        # With a non-zero area threshold the tiny quad is expected to be filtered out.
        tiny_region = "Small<loc_1><loc_1><loc_2><loc_2><loc_2><loc_2><loc_1><loc_1>"
        self.assertEqual(
            parse(tiny_region, pattern=None, image_size=image_size, area_threshold=0.01),
            [],
        )

    def test_post_process_parse_phrase_grounding_from_text_and_spans(self):
        """Parse grounded phrases (one or more boxes per phrase) and check a banned phrase yields nothing."""
        proc = self.processor_class.from_pretrained(self.tmpdirname)
        parse = proc.post_processor.parse_phrase_grounding_from_text_and_spans
        image_size = (1000, 1000)

        # "red car" is followed by two boxes, "sky" by one.
        grounded = "</s><s>red car<loc_53><loc_334><loc_933><loc_775><loc_711><loc_203><loc_906><loc_546>sky<loc_0><loc_0><loc_1000><loc_300></s>"
        self.assertEqual(
            parse(grounded, image_size=image_size),
            [
                {"bbox": [[53, 334, 933, 775], [711, 203, 906, 546]], "cat_name": "red car"},
                {"bbox": [[0, 0, 1000, 300]], "cat_name": "sky"},
            ],
        )

        # "the image" is presumably dropped via the `banned_grounding_tokens` config entry.
        banned = "the image<loc_100><loc_100><loc_200><loc_200>"
        self.assertEqual(parse(banned, image_size=image_size), [])

    def test_post_process_generation(self):
        """Check `post_process_generation` for the `<CAPTION>`, `<OD>` and `<OCR_WITH_REGION>` tasks."""
        proc = self.processor_class.from_pretrained(self.tmpdirname)

        # <CAPTION>: plain text, no image size supplied.
        caption_out = proc.post_process_generation(text="<s>Hello world</s>", task="<CAPTION>", image_size=None)
        self.assertEqual(caption_out, {"<CAPTION>": "Hello world"})

        # <OD>: a label followed by one bounding box.
        detection_out = proc.post_process_generation(
            text="car<loc_53><loc_334><loc_933><loc_775>", task="<OD>", image_size=(1000, 1000)
        )
        self.assertEqual(detection_out, {"<OD>": {"bboxes": [[53, 334, 933, 775]], "labels": ["car"]}})

        # <OCR_WITH_REGION>: a word followed by one quad box.
        ocr_out = proc.post_process_generation(
            text="Hello<loc_100><loc_100><loc_200><loc_100><loc_200><loc_200><loc_100><loc_200>",
            task="<OCR_WITH_REGION>",
            image_size=(1000, 1000),
        )
        self.assertEqual(
            ocr_out,
            {"<OCR_WITH_REGION>": {"quad_boxes": [[100, 100, 200, 100, 200, 200, 100, 200]], "labels": ["Hello"]}},
        )