first commit
Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled

This commit is contained in:
陈赣
2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions

View File

View File

@@ -0,0 +1,626 @@
# Copyright 2024 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import inspect
import unittest
from functools import cached_property
import pytest
from datasets import load_dataset
from transformers import UdopConfig, is_torch_available
from transformers.testing_utils import (
require_sentencepiece,
require_tokenizers,
require_torch,
require_vision,
slow,
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
import torch.nn.functional as F
from transformers import UdopEncoderModel, UdopForConditionalGeneration, UdopModel, UdopProcessor
class UdopModelTester:
def __init__(
self,
parent,
vocab_size=99,
batch_size=13,
encoder_seq_length=7,
decoder_seq_length=9,
# For common tests
is_training=True,
use_attention_mask=True,
use_labels=True,
hidden_size=32,
num_hidden_layers=2,
num_attention_heads=4,
d_ff=37,
relative_attention_num_buckets=32,
dropout_rate=0.1,
initializer_factor=0.002,
eos_token_id=1,
pad_token_id=0,
scope=None,
decoder_layers=None,
range_bbox=1000,
decoder_start_token_id=0,
):
self.parent = parent
self.batch_size = batch_size
self.encoder_seq_length = encoder_seq_length
self.decoder_seq_length = decoder_seq_length
# For common tests
self.seq_length = self.decoder_seq_length
self.is_training = is_training
self.use_attention_mask = use_attention_mask
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.d_ff = d_ff
self.relative_attention_num_buckets = relative_attention_num_buckets
self.dropout_rate = dropout_rate
self.initializer_factor = initializer_factor
self.eos_token_id = eos_token_id
self.pad_token_id = pad_token_id
self.scope = None
self.decoder_layers = decoder_layers
self.range_bbox = range_bbox
self.decoder_start_token_id = decoder_start_token_id
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
bbox = ids_tensor([self.batch_size, self.encoder_seq_length, 4], self.range_bbox).float()
# Ensure that bbox is legal
for i in range(bbox.shape[0]):
for j in range(bbox.shape[1]):
if bbox[i, j, 3] < bbox[i, j, 1]:
t = bbox[i, j, 3]
bbox[i, j, 3] = bbox[i, j, 1]
bbox[i, j, 1] = t
if bbox[i, j, 2] < bbox[i, j, 0]:
t = bbox[i, j, 2]
bbox[i, j, 2] = bbox[i, j, 0]
bbox[i, j, 0] = t
decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
attention_mask = None
decoder_attention_mask = None
if self.use_attention_mask:
attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
lm_labels = None
if self.use_labels:
lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
config = self.get_config()
return (
config,
input_ids,
bbox,
decoder_input_ids,
attention_mask,
decoder_attention_mask,
lm_labels,
)
def get_config(self):
return UdopConfig(
vocab_size=self.vocab_size,
d_model=self.hidden_size,
d_ff=self.d_ff,
d_kv=self.hidden_size // self.num_attention_heads,
num_layers=self.num_hidden_layers,
num_decoder_layers=self.decoder_layers,
num_heads=self.num_attention_heads,
relative_attention_num_buckets=self.relative_attention_num_buckets,
dropout_rate=self.dropout_rate,
initializer_factor=self.initializer_factor,
eos_token_id=self.eos_token_id,
bos_token_id=self.pad_token_id,
pad_token_id=self.pad_token_id,
decoder_start_token_id=self.decoder_start_token_id,
)
def create_and_check_model(
self,
config,
input_ids,
bbox,
decoder_input_ids,
attention_mask,
decoder_attention_mask,
lm_labels,
):
model = UdopModel(config=config)
model.to(torch_device)
model.eval()
result = model(
input_ids=input_ids,
bbox=bbox,
decoder_input_ids=decoder_input_ids,
attention_mask=attention_mask,
decoder_attention_mask=decoder_attention_mask,
)
result = model(input_ids=input_ids, bbox=bbox, decoder_input_ids=decoder_input_ids)
decoder_output = result.last_hidden_state
decoder_past = result.past_key_values
encoder_output = result.encoder_last_hidden_state
self.parent.assertEqual(encoder_output.size(), (self.batch_size, self.encoder_seq_length, self.hidden_size))
self.parent.assertEqual(decoder_output.size(), (self.batch_size, self.decoder_seq_length, self.hidden_size))
# There should be `num_layers` key value embeddings stored in decoder_past
self.parent.assertEqual(len(decoder_past), config.num_layers)
def create_and_check_with_lm_head(
self,
config,
input_ids,
bbox,
decoder_input_ids,
attention_mask,
decoder_attention_mask,
lm_labels,
):
model = UdopForConditionalGeneration(config=config).to(torch_device).eval()
outputs = model(
input_ids=input_ids,
bbox=bbox,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
labels=lm_labels,
)
self.parent.assertEqual(len(outputs), 4)
self.parent.assertEqual(outputs["logits"].size(), (self.batch_size, self.decoder_seq_length, self.vocab_size))
self.parent.assertEqual(outputs["loss"].size(), ())
def create_and_check_generate_with_past_key_values(
self,
config,
input_ids,
bbox,
decoder_input_ids,
attention_mask,
decoder_attention_mask,
lm_labels,
):
model = UdopForConditionalGeneration(config=config).to(torch_device).eval()
torch.manual_seed(0)
output_without_past_cache = model.generate(
input_ids[:1], bbox=bbox[:1, :, :], num_beams=2, max_length=5, do_sample=True, use_cache=False
)
torch.manual_seed(0)
output_with_past_cache = model.generate(
input_ids[:1], bbox=bbox[:1, :, :], num_beams=2, max_length=5, do_sample=True
)
self.parent.assertTrue(torch.all(output_with_past_cache == output_without_past_cache))
def create_and_check_model_fp16_forward(
self,
config,
input_ids,
bbox,
decoder_input_ids,
attention_mask,
decoder_attention_mask,
lm_labels,
):
model = UdopForConditionalGeneration(config=config).to(torch_device).half().eval()
output = model(input_ids, bbox=bbox, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids).logits
self.parent.assertFalse(torch.isnan(output).any().item())
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
input_ids,
bbox,
decoder_input_ids,
attention_mask,
decoder_attention_mask,
lm_labels,
) = config_and_inputs
inputs_dict = {
"input_ids": input_ids,
"attention_mask": attention_mask,
"bbox": bbox,
"decoder_input_ids": decoder_input_ids,
"decoder_attention_mask": decoder_attention_mask,
"use_cache": False,
}
return config, inputs_dict
@require_torch
class UdopModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (
(
UdopModel,
UdopForConditionalGeneration,
)
if is_torch_available()
else ()
)
pipeline_model_mapping = (
{"feature-extraction": UdopModel, "image-text-to-text": UdopForConditionalGeneration}
if is_torch_available()
else {}
)
test_resize_embeddings = True
is_encoder_decoder = True
test_cpu_offload = False
# The small UDOP model needs higher percentages for CPU/MP tests
model_split_percents = [0.8, 0.9]
def setUp(self):
self.model_tester = UdopModelTester(self)
self.config_tester = ConfigTester(self, config_class=UdopConfig, d_model=32)
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
inputs_dict = copy.deepcopy(inputs_dict)
if model_class.__name__ == "UdopForConditionalGeneration":
if return_labels:
inputs_dict["labels"] = torch.zeros(
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
)
return inputs_dict
def test_config(self):
self.config_tester.run_common_tests()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_with_lm_head(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_with_lm_head(*config_and_inputs)
def test_generate_with_past_key_values(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_generate_with_past_key_values(*config_and_inputs)
@unittest.skipIf(torch_device == "cpu", "Can't do half precision")
def test_model_fp16_forward(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs)
@pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
def test_training_gradient_checkpointing(self):
super().test_training_gradient_checkpointing()
@pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
def test_training_gradient_checkpointing_use_reentrant_false(self):
super().test_training_gradient_checkpointing_use_reentrant_false()
@pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
def test_training_gradient_checkpointing_use_reentrant_true(self):
super().test_training_gradient_checkpointing_use_reentrant_true()
@unittest.skip(reason="Udop has no separate base model without a head.")
def test_model_base_model_prefix(self):
pass
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = sorted([*signature.parameters.keys()])
expected_arg_names = [
"attention_mask",
"bbox",
"decoder_attention_mask",
"decoder_input_ids",
"decoder_inputs_embeds",
"encoder_outputs",
"input_ids",
"inputs_embeds",
"kwargs",
]
if model_class in self.all_generative_model_classes:
expected_arg_names.append(
"labels",
)
expected_arg_names = sorted(expected_arg_names)
self.assertListEqual(sorted(arg_names[: len(expected_arg_names)]), expected_arg_names)
# overwrite because T5 doesn't accept position ids as input and expects `decoder_input_ids`
def test_custom_4d_attention_mask(self):
for model_class in self.all_generative_model_classes:
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
model = model_class(config).to(device=torch_device, dtype=torch.float32)
(
input_ids,
_,
input_ids_shared_prefix,
mask_shared_prefix,
_,
) = self._get_custom_4d_mask_test_data()
logits = model.forward(
decoder_input_ids=input_ids,
input_ids=input_dict["input_ids"][:3],
bbox=input_dict["bbox"][:3],
).logits
# logits.shape == torch.Size([3, 4, ...])
logits_shared_prefix = model(
input_ids=input_dict["input_ids"][:1],
bbox=input_dict["bbox"][:1],
decoder_input_ids=input_ids_shared_prefix,
decoder_attention_mask=mask_shared_prefix,
)[0]
# logits_shared_prefix.shape == torch.Size([1, 6, ...])
out_last_tokens = logits[:, -1, :] # last tokens in each batch line
out_shared_prefix_last_tokens = logits_shared_prefix[0, -3:, :] # last three tokens
# comparing softmax-normalized logits:
normalized_0 = F.softmax(out_last_tokens)
normalized_1 = F.softmax(out_shared_prefix_last_tokens)
torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4)
@slow
def test_model_from_pretrained(self):
model_name = "microsoft/udop-large"
model = UdopForConditionalGeneration.from_pretrained(model_name)
self.assertIsNotNone(model)
@unittest.skip(reason="TODO: Fix me @joao")
def test_generate_without_input_ids(self):
pass
class UdopEncoderOnlyModelTester:
def __init__(
self,
parent,
vocab_size=99,
batch_size=13,
seq_length=7,
# For common tests
is_training=False,
use_attention_mask=True,
hidden_size=32,
num_hidden_layers=2,
decoder_layers=2,
num_attention_heads=4,
d_ff=37,
relative_attention_num_buckets=32,
dropout_rate=0.1,
initializer_factor=0.002,
eos_token_id=1,
pad_token_id=0,
scope=None,
range_bbox=1000,
):
self.parent = parent
self.batch_size = batch_size
# For common tests
self.seq_length = seq_length
self.is_training = is_training
self.use_attention_mask = use_attention_mask
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.decoder_layers = decoder_layers
self.num_attention_heads = num_attention_heads
self.d_ff = d_ff
self.relative_attention_num_buckets = relative_attention_num_buckets
self.dropout_rate = dropout_rate
self.initializer_factor = initializer_factor
self.eos_token_id = eos_token_id
self.pad_token_id = pad_token_id
self.scope = None
self.range_bbox = range_bbox
def get_config(self):
return UdopConfig(
vocab_size=self.vocab_size,
d_model=self.hidden_size,
d_ff=self.d_ff,
d_kv=self.hidden_size // self.num_attention_heads,
num_layers=self.num_hidden_layers,
num_decoder_layers=self.decoder_layers,
num_heads=self.num_attention_heads,
relative_attention_num_buckets=self.relative_attention_num_buckets,
dropout_rate=self.dropout_rate,
initializer_factor=self.initializer_factor,
eos_token_id=self.eos_token_id,
bos_token_id=self.pad_token_id,
pad_token_id=self.pad_token_id,
is_encoder_decoder=False,
)
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
bbox = ids_tensor([self.batch_size, self.seq_length, 4], self.range_bbox).float()
# Ensure that bbox is legal
for i in range(bbox.shape[0]):
for j in range(bbox.shape[1]):
if bbox[i, j, 3] < bbox[i, j, 1]:
t = bbox[i, j, 3]
bbox[i, j, 3] = bbox[i, j, 1]
bbox[i, j, 1] = t
if bbox[i, j, 2] < bbox[i, j, 0]:
t = bbox[i, j, 2]
bbox[i, j, 2] = bbox[i, j, 0]
bbox[i, j, 0] = t
attention_mask = None
if self.use_attention_mask:
attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
config = self.get_config()
return (
config,
input_ids,
bbox,
attention_mask,
)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
input_ids,
bbox,
attention_mask,
) = config_and_inputs
inputs_dict = {
"input_ids": input_ids,
"bbox": bbox,
"attention_mask": attention_mask,
}
return config, inputs_dict
def create_and_check_model(
self,
config,
input_ids,
bbox,
attention_mask,
):
model = UdopEncoderModel(config=config)
model.to(torch_device)
model.eval()
result = model(
input_ids=input_ids,
bbox=bbox,
attention_mask=attention_mask,
)
encoder_output = result.last_hidden_state
self.parent.assertEqual(encoder_output.size(), (self.batch_size, self.seq_length, self.hidden_size))
def create_and_check_model_fp16_forward(
self,
config,
input_ids,
bbox,
attention_mask,
):
model = UdopEncoderModel(config=config).to(torch_device).half().eval()
output = model(input_ids, bbox=bbox, attention_mask=attention_mask)["last_hidden_state"]
self.parent.assertFalse(torch.isnan(output).any().item())
class UdopEncoderOnlyModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (UdopEncoderModel,) if is_torch_available() else ()
test_resize_embeddings = False
def setUp(self):
self.model_tester = UdopEncoderOnlyModelTester(self)
self.config_tester = ConfigTester(self, config_class=UdopConfig, d_model=32)
def test_config(self):
self.config_tester.run_common_tests()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
# overwrite because T5 doesn't accept position ids as input and expects `decoder_input_ids`
def test_custom_4d_attention_mask(self):
for model_class in self.all_generative_model_classes:
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
model = model_class(config).to(device=torch_device, dtype=torch.float32)
(
input_ids,
_,
input_ids_shared_prefix,
mask_shared_prefix,
_,
) = self._get_custom_4d_mask_test_data()
logits = model.forward(
decoder_input_ids=input_ids,
input_ids=input_dict["input_ids"][:3],
).logits
# logits.shape == torch.Size([3, 4, ...])
logits_shared_prefix = model(
input_ids=input_dict["input_ids"][:1],
decoder_input_ids=input_ids_shared_prefix,
decoder_attention_mask=mask_shared_prefix,
)[0]
# logits_shared_prefix.shape == torch.Size([1, 6, ...])
out_last_tokens = logits[:, -1, :] # last tokens in each batch line
out_shared_prefix_last_tokens = logits_shared_prefix[0, -3:, :] # last three tokens
# comparing softmax-normalized logits:
normalized_0 = F.softmax(out_last_tokens)
normalized_1 = F.softmax(out_shared_prefix_last_tokens)
torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4)
@require_torch
@require_sentencepiece
@require_tokenizers
@require_vision
@slow
class UdopModelIntegrationTests(unittest.TestCase):
@cached_property
def image(self):
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
return ds[1]["image"]
@cached_property
def processor(self):
return UdopProcessor.from_pretrained("microsoft/udop-large")
@cached_property
def model(self):
return UdopForConditionalGeneration.from_pretrained("microsoft/udop-large").to(torch_device)
def test_conditional_generation(self):
processor = self.processor
model = self.model
prompt = "Question answering. In which year is the report made?"
encoding = processor(images=self.image, text=prompt, return_tensors="pt").to(torch_device)
predicted_ids = model.generate(**encoding)
predicted_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
self.assertEqual(predicted_text, "2013")

View File

@@ -0,0 +1,421 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from functools import cached_property
from transformers import (
UdopProcessor,
UdopTokenizer,
UdopTokenizerFast,
)
from transformers.testing_utils import (
require_pytesseract,
require_sentencepiece,
require_tokenizers,
require_torch,
slow,
)
from transformers.utils import is_pytesseract_available, is_torch_available
from ...test_processing_common import ProcessorTesterMixin
if is_torch_available():
import torch
if is_pytesseract_available():
from transformers import LayoutLMv3ImageProcessor
@require_pytesseract
@require_sentencepiece
@require_tokenizers
class UdopProcessorTest(ProcessorTesterMixin, unittest.TestCase):
tokenizer_class = UdopTokenizer
rust_tokenizer_class = UdopTokenizerFast
processor_class = UdopProcessor
maxDiff = None
@classmethod
def _setup_image_processor(cls):
image_processor_class = cls._get_component_class_from_processor("image_processor")
return image_processor_class(
do_resize=True,
size=224,
apply_ocr=True,
)
@classmethod
def _setup_tokenizer(cls):
tokenizer_class = cls._get_component_class_from_processor("tokenizer")
return tokenizer_class.from_pretrained("microsoft/udop-large")
@unittest.skip("UdopProcessor doesn't return pixel_values tensors")
def test_image_processor_defaults(self):
pass
def test_text_target(self):
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = UdopProcessor(tokenizer=tokenizer, image_processor=image_processor)
text = "hello world"
expected_decoding = "hello world</s>"
encoding_processor = processor(text_target=text)
encoding_tokenizer = tokenizer(text_target=text)
self.assertListEqual(encoding_processor["input_ids"], [21820, 296, 1])
self.assertListEqual(encoding_processor["attention_mask"], [1, 1, 1])
self.assertDictEqual(dict(encoding_processor), dict(encoding_tokenizer))
self.assertEqual(tokenizer.decode(encoding_processor["input_ids"]), expected_decoding)
@slow
def test_overflowing_tokens(self):
# In the case of overflowing tokens, test that we still have 1-to-1 mapping between the images and input_ids (sequences that are too long are broken down into multiple sequences).
from datasets import load_dataset
# set up
datasets = load_dataset("nielsr/funsd")
processor = UdopProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False)
def preprocess_data(examples):
images = [image.convert("RGB") for image in examples["image"]]
words = list(examples["words"])
boxes = list(examples["bboxes"])
word_labels = list(examples["ner_tags"])
encoded_inputs = processor(
images,
words,
boxes=boxes,
word_labels=word_labels,
max_length=512,
padding="max_length",
truncation=True,
return_overflowing_tokens=True,
stride=50,
return_offsets_mapping=True,
return_tensors="pt",
)
return encoded_inputs
train_data = preprocess_data(datasets["train"])
self.assertEqual(len(train_data["pixel_values"]), len(train_data["input_ids"]))
@unittest.skip("We will not support batch input with and without images for UDOP!")
def test_processor_text_has_no_visual(self):
pass
# different use cases tests
@require_sentencepiece
@require_torch
@require_pytesseract
class UdopProcessorIntegrationTests(unittest.TestCase):
@cached_property
def get_images(self):
# we verify our implementation on 2 document images from the DocVQA dataset
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/fixtures_docvqa", split="test")
return ds[0]["image"].convert("RGB"), ds[1]["image"].convert("RGB")
@cached_property
def get_tokenizers(self):
slow_tokenizer = UdopTokenizer.from_pretrained("microsoft/udop-large")
fast_tokenizer = UdopTokenizerFast.from_pretrained("microsoft/udop-large")
return [slow_tokenizer, fast_tokenizer]
@slow
def test_processor_case_1(self):
# case 1: document image classification (training, inference) + token classification (inference), apply_ocr = True
image_processor = LayoutLMv3ImageProcessor()
tokenizers = self.get_tokenizers
images = self.get_images
for tokenizer in tokenizers:
processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer)
# not batched
input_image_processor = image_processor(images[0], return_tensors="pt")
input_processor = processor(images[0], return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"]
actual_keys = sorted(input_processor.keys())
self.assertListEqual(actual_keys, expected_keys)
# verify pixel_values
self.assertTrue(
torch.allclose(input_image_processor["pixel_values"], input_processor["pixel_values"], atol=1e-2)
)
# verify input_ids
# this was obtained with Tesseract 4.1.1
# fmt: off
expected_decoding = "11:14 to 11:39 a.m 11:39 to 11:44 a.m. 11:44 a.m. to 12:25 p.m. 12:25 to 12:58 p.m. 12:58 to 4:00 p.m. 2:00 to 5:00 p.m. Coffee Break Coffee will be served for men and women in the lobby adjacent to exhibit area. Please move into exhibit area. (Exhibits Open) TRRF GENERAL SESSION (PART |) Presiding: Lee A. Waller TRRF Vice President “Introductory Remarks” Lee A. Waller, TRRF Vice Presi- dent Individual Interviews with TRRF Public Board Members and Sci- entific Advisory Council Mem- bers Conducted by TRRF Treasurer Philip G. Kuehn to get answers which the public refrigerated warehousing industry is looking for. Plus questions from the floor. Dr. Emil M. Mrak, University of Cal- ifornia, Chairman, TRRF Board; Sam R. Cecil, University of Georgia College of Agriculture; Dr. Stanley Charm, Tufts University School of Medicine; Dr. Robert H. Cotton, ITT Continental Baking Company; Dr. Owen Fennema, University of Wis- consin; Dr. Robert E. Hardenburg, USDA. Questions and Answers Exhibits Open Capt. Jack Stoney Room TRRF Scientific Advisory Council Meeting Ballroom Foyer</s>" # noqa: E231
# fmt: on
decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# batched
input_image_processor = image_processor(images, return_tensors="pt")
input_processor = processor(images, padding=True, return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"]
actual_keys = sorted(input_processor.keys())
self.assertListEqual(actual_keys, expected_keys)
# verify pixel_values
self.assertTrue(
torch.allclose(input_image_processor["pixel_values"], input_processor["pixel_values"], atol=1e-2)
)
# verify input_ids
# this was obtained with Tesseract 4.1.1
# fmt: off
expected_decoding = "7 ITC Limited REPORT AND ACCOUNTS 2013 ITCs Brands: An Asset for the Nation The consumer needs and aspirations they fulfil, the benefit they generate for millions across ITCs value chains, the future-ready capabilities that support them, and the value that they create for the country, have made ITCs brands national assets, adding to Indias competitiveness. It is ITCs aspiration to be the No 1 FMCG player in the country, driven by its new FMCG businesses. A recent Nielsen report has highlighted that ITC's new FMCG businesses are the fastest growing among the top consumer goods companies operating in India. ITC takes justifiable pride that, along with generating economic value, these celebrated Indian brands also drive the creation of larger societal capital through the virtuous cycle of sustainable and inclusive growth. DI WILLS * ; LOVE DELIGHTFULLY SOFT SKIN? aia Ans Source: https://www.industrydocuments.ucsf.edu/docs/snbx0223</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>" # noqa: E231
# fmt: on
decoding = processor.decode(input_processor.input_ids[1].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
@slow
def test_processor_case_2(self):
# case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False
image_processor = LayoutLMv3ImageProcessor(apply_ocr=False)
tokenizers = self.get_tokenizers
images = self.get_images
for tokenizer in tokenizers:
processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer)
# not batched
words = ["hello", "world"]
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
input_processor = processor(images[0], words, boxes=boxes, return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"]
actual_keys = list(input_processor.keys())
for key in expected_keys:
self.assertIn(key, actual_keys)
# verify input_ids
expected_decoding = "hello world</s>"
decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# batched
words = [["hello", "world"], ["my", "name", "is", "niels"]]
boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
input_processor = processor(images, words, boxes=boxes, padding=True, return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"]
actual_keys = sorted(input_processor.keys())
self.assertListEqual(actual_keys, expected_keys)
# verify input_ids
expected_decoding = "hello world</s><pad><pad><pad><pad>"
decoding = processor.decode(input_processor.input_ids[0].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# verify bbox
expected_bbox = [
[3, 2, 5, 1],
[6, 7, 4, 2],
[3, 9, 2, 4],
[1, 1, 2, 3],
[1, 1, 2, 3],
[1, 1, 2, 3],
[1000, 1000, 1000, 1000],
]
self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)
@slow
def test_processor_case_3(self):
# case 3: token classification (training), apply_ocr=False
image_processor = LayoutLMv3ImageProcessor(apply_ocr=False)
tokenizers = self.get_tokenizers
images = self.get_images
for tokenizer in tokenizers:
processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer)
# not batched
words = ["weirdly", "world"]
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
word_labels = [1, 2]
input_processor = processor(images[0], words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "input_ids", "labels", "pixel_values"]
actual_keys = sorted(input_processor.keys())
self.assertListEqual(actual_keys, expected_keys)
# verify input_ids
expected_decoding = "weirdly world</s>"
decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# verify labels
expected_labels = [1, -100, 2, -100]
self.assertListEqual(input_processor.labels.squeeze().tolist(), expected_labels)
# batched
words = [["hello", "world"], ["my", "name", "is", "niels"]]
boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
word_labels = [[1, 2], [6, 3, 10, 2]]
input_processor = processor(
images, words, boxes=boxes, word_labels=word_labels, padding=True, return_tensors="pt"
)
# verify keys
expected_keys = ["attention_mask", "bbox", "input_ids", "labels", "pixel_values"]
actual_keys = sorted(input_processor.keys())
self.assertListEqual(actual_keys, expected_keys)
# verify input_ids
expected_decoding = "my name is niels</s>"
decoding = processor.decode(input_processor.input_ids[1].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# verify bbox
expected_bbox = [
[3, 2, 5, 1],
[6, 7, 4, 2],
[3, 9, 2, 4],
[1, 1, 2, 3],
[1, 1, 2, 3],
[1, 1, 2, 3],
[1000, 1000, 1000, 1000],
]
self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)
# verify labels
expected_labels = [6, 3, 10, 2, -100, -100, -100]
self.assertListEqual(input_processor.labels[1].tolist(), expected_labels)
@slow
def test_processor_case_4(self):
# case 4: visual question answering (inference), apply_ocr=True
image_processor = LayoutLMv3ImageProcessor()
tokenizers = self.get_tokenizers
images = self.get_images
for tokenizer in tokenizers:
processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer)
# not batched
question = "What's his name?"
input_processor = processor(images[0], question, return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"]
actual_keys = sorted(input_processor.keys())
self.assertListEqual(actual_keys, expected_keys)
# verify input_ids
# this was obtained with Tesseract 4.1.1
# fmt: off
expected_decoding = "What's his name?</s> 11:14 to 11:39 a.m 11:39 to 11:44 a.m. 11:44 a.m. to 12:25 p.m. 12:25 to 12:58 p.m. 12:58 to 4:00 p.m. 2:00 to 5:00 p.m. Coffee Break Coffee will be served for men and women in the lobby adjacent to exhibit area. Please move into exhibit area. (Exhibits Open) TRRF GENERAL SESSION (PART |) Presiding: Lee A. Waller TRRF Vice President “Introductory Remarks” Lee A. Waller, TRRF Vice Presi- dent Individual Interviews with TRRF Public Board Members and Sci- entific Advisory Council Mem- bers Conducted by TRRF Treasurer Philip G. Kuehn to get answers which the public refrigerated warehousing industry is looking for. Plus questions from the floor. Dr. Emil M. Mrak, University of Cal- ifornia, Chairman, TRRF Board; Sam R. Cecil, University of Georgia College of Agriculture; Dr. Stanley Charm, Tufts University School of Medicine; Dr. Robert H. Cotton, ITT Continental Baking Company; Dr. Owen Fennema, University of Wis- consin; Dr. Robert E. Hardenburg, USDA. Questions and Answers Exhibits Open Capt. Jack Stoney Room TRRF Scientific Advisory Council Meeting Ballroom Foyer</s>" # noqa: E231
# fmt: on
decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# batched
questions = ["How old is he?", "what's the time"]
input_processor = processor(
images, questions, padding="max_length", max_length=20, truncation=True, return_tensors="pt"
)
# verify keys
expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"]
actual_keys = sorted(input_processor.keys())
self.assertListEqual(actual_keys, expected_keys)
# verify input_ids
# this was obtained with Tesseract 4.1.1
expected_decoding = "what's the time</s> 7 ITC Limited REPORT AND ACCOUNTS 2013 I</s>"
decoding = processor.decode(input_processor.input_ids[1].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# verify bbox
# fmt: off
expected_bbox = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [0, 45, 67, 80], [72, 56, 109, 67], [72, 56, 109, 67], [116, 56, 189, 67], [198, 59, 253, 66], [198, 59, 253, 66], [257, 59, 285, 66], [289, 59, 365, 66], [289, 59, 365, 66], [289, 59, 365, 66], [289, 59, 365, 66], [372, 59, 407, 66], [74, 136, 161, 158], [1000, 1000, 1000, 1000]] # noqa: E231
# fmt: on
self.assertListEqual(input_processor.bbox[1].tolist(), expected_bbox)
@slow
def test_processor_case_5(self):
# case 5: visual question answering (inference), apply_ocr=False
image_processor = LayoutLMv3ImageProcessor(apply_ocr=False)
tokenizers = self.get_tokenizers
images = self.get_images
for tokenizer in tokenizers:
processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer)
# not batched
question = "What's his name?"
words = ["hello", "world"]
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
input_processor = processor(images[0], question, text_pair=words, boxes=boxes, return_tensors="pt")
# verify keys
expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"]
actual_keys = sorted(input_processor.keys())
self.assertListEqual(actual_keys, expected_keys)
# verify input_ids
expected_decoding = "What's his name?</s> hello world</s>"
decoding = processor.decode(input_processor.input_ids.squeeze().tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# batched
questions = ["How old is he?", "what's the time"]
words = [["hello", "world"], ["my", "name", "is", "niels"]]
boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]]
input_processor = processor(
images, questions, text_pair=words, boxes=boxes, padding=True, return_tensors="pt"
)
# verify keys
expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"]
actual_keys = sorted(input_processor.keys())
self.assertListEqual(actual_keys, expected_keys)
# verify input_ids
expected_decoding = "How old is he?</s> hello world</s><pad><pad><pad>"
decoding = processor.decode(input_processor.input_ids[0].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
expected_decoding = "what's the time</s> my name is niels</s>"
decoding = processor.decode(input_processor.input_ids[1].tolist())
self.assertSequenceEqual(decoding, expected_decoding)
# verify bbox
expected_bbox = [[3, 9, 2, 4], [1, 1, 2, 3], [1, 1, 2, 3], [1, 1, 2, 3], [1000, 1000, 1000, 1000]]
self.assertListEqual(input_processor.bbox[1].tolist()[-5:], expected_bbox)

File diff suppressed because it is too large Load Diff