# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

from transformers import OmDetTurboProcessor
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available

from ...test_processing_common import ProcessorTesterMixin


# NOTE(review): these two constants are not referenced by any code in this
# module as shown; kept because other modules may import them.
IMAGE_MEAN = [123.675, 116.28, 103.53]
IMAGE_STD = [58.395, 57.12, 57.375]

if is_torch_available():
    import torch

    from transformers.models.omdet_turbo.modeling_omdet_turbo import OmDetTurboObjectDetectionOutput


@require_torch
@require_vision
class OmDetTurboProcessorTest(ProcessorTesterMixin, unittest.TestCase):
    """Processor tests for `OmDetTurboProcessor`, including its detection post-processing."""

    processor_class = OmDetTurboProcessor
    text_input_name = "classes_input_ids"
    input_keys = [
        "tasks_input_ids",
        "tasks_attention_mask",
        "classes_input_ids",
        "classes_attention_mask",
        "classes_structure",
        "pixel_values",
        "pixel_mask",
    ]

    # Dimensions of the fake model output built by `get_fake_omdet_turbo_output`.
    batch_size = 5
    num_queries = 5
    embed_dim = 3

    @classmethod
    def _setup_tokenizer(cls):
        """Load the processor's tokenizer class from the `openai/clip-vit-base-patch32` checkpoint."""
        tokenizer_cls = cls._get_component_class_from_processor("tokenizer")
        return tokenizer_cls.from_pretrained("openai/clip-vit-base-patch32")

    def get_fake_omdet_turbo_output(self):
        """Build a seeded, random `OmDetTurboObjectDetectionOutput`.

        The per-sample class counts are taken from `get_fake_omdet_turbo_classes`.
        """
        class_names = self.get_fake_omdet_turbo_classes()
        class_counts = torch.tensor(list(map(len, class_names)))

        # The expected values asserted in the test depend on this seed and on the
        # order of the two `torch.rand` draws below (coords first, then classes).
        torch.manual_seed(42)
        coord_logits = torch.rand(self.batch_size, self.num_queries, 4)
        class_logits = torch.rand(self.batch_size, self.num_queries, self.embed_dim)

        return OmDetTurboObjectDetectionOutput(
            decoder_coord_logits=coord_logits,
            decoder_class_logits=class_logits,
            classes_structure=class_counts,
        )

    def get_fake_omdet_turbo_classes(self):
        """Return `batch_size` lists, each holding `num_queries` names of the form ``class{i}_{j}``."""
        per_sample_names = []
        for sample_idx in range(self.batch_size):
            names = [f"class{query_idx}_{sample_idx}" for query_idx in range(self.num_queries)]
            per_sample_names.append(names)
        return per_sample_names

    def test_post_process_grounded_object_detection(self):
        """Check structure, shapes and reference values of the post-processed detections."""
        processor = self.get_processor()
        fake_output = self.get_fake_omdet_turbo_output()
        fake_classes = self.get_fake_omdet_turbo_classes()
        sizes = [(400, 30)] * self.batch_size

        results = processor.post_process_grounded_object_detection(
            fake_output,
            fake_classes,
            target_sizes=sizes,
        )

        self.assertEqual(len(results), self.batch_size)

        first = results[0]
        self.assertEqual(list(first.keys()), ["boxes", "scores", "labels", "text_labels"])
        self.assertEqual(first["boxes"].shape, (self.num_queries, 4))
        self.assertEqual(first["scores"].shape, (self.num_queries,))

        tolerance = {"rtol": 1e-4, "atol": 1e-4}

        expected_scores = torch.tensor([0.7310, 0.6579, 0.6513, 0.6444, 0.6252])
        torch.testing.assert_close(first["scores"], expected_scores, **tolerance)

        expected_box_slice = torch.tensor([14.9657, 141.2052, 30.0000, 312.9670])
        torch.testing.assert_close(first["boxes"][0], expected_box_slice, **tolerance)