first commit
Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled

This commit is contained in:
陈赣
2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions

View File

View File

@@ -0,0 +1,275 @@
# Copyright 2026 the HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
if is_torch_available():
import torch
from transformers import Sapiens2ImageProcessor
from transformers.modeling_outputs import SemanticSegmenterOutput
from transformers.models.sapiens2.modeling_sapiens2 import (
Sapiens2ImageMattingOutput,
Sapiens2NormalEstimatorOutput,
Sapiens2PointmapEstimatorOutput,
)
class Sapiens2ImageProcessingTester:
    """Configuration holder for the Sapiens2 image processor tests.

    Stores the image processor settings and the parameters used to generate
    random test images, and exposes them in the forms the test case needs.
    """

    def __init__(
        self,
        parent,
        batch_size=7,
        num_channels=3,
        image_size=18,
        min_resolution=30,
        max_resolution=400,
        do_resize=True,
        size=None,
        do_normalize=True,
        image_mean=None,
        image_std=None,
        do_reduce_labels=False,
    ):
        """Record the test parameters.

        Args:
            parent: The test case that owns this tester.
            batch_size: Number of images generated per batch.
            num_channels: Number of channels of the generated images.
            image_size: Stored as-is; not read by any method of this class.
            min_resolution: Lower bound forwarded to `prepare_image_inputs`.
            max_resolution: Upper bound forwarded to `prepare_image_inputs`.
            do_resize: Value of the processor's `do_resize` setting.
            size: Target size dict; defaults to `{"height": 20, "width": 18}`.
            do_normalize: Value of the processor's `do_normalize` setting.
            image_mean: Per-channel mean; defaults to `[0.485, 0.456, 0.406]`.
            image_std: Per-channel std; defaults to `[0.229, 0.224, 0.225]`.
            do_reduce_labels: Value of the processor's `do_reduce_labels` setting.
        """
        super().__init__()
        # Mutable defaults are created per instance (same approach as `size`) so that
        # one tester mutating its lists cannot leak into testers created later.
        size = size if size is not None else {"height": 20, "width": 18}
        image_mean = image_mean if image_mean is not None else [0.485, 0.456, 0.406]
        image_std = image_std if image_std is not None else [0.229, 0.224, 0.225]
        self.parent = parent
        self.batch_size = batch_size
        self.num_channels = num_channels
        self.image_size = image_size
        self.min_resolution = min_resolution
        self.max_resolution = max_resolution
        self.do_resize = do_resize
        self.size = size
        self.do_normalize = do_normalize
        self.image_mean = image_mean
        self.image_std = image_std
        self.do_reduce_labels = do_reduce_labels

    def prepare_image_processor_dict(self):
        """Return the keyword arguments used to instantiate the image processor."""
        return {
            "do_resize": self.do_resize,
            "size": self.size,
            "do_normalize": self.do_normalize,
            "image_mean": self.image_mean,
            "image_std": self.image_std,
            "do_reduce_labels": self.do_reduce_labels,
        }

    def expected_output_image_shape(self, images):
        """Return the expected `(channels, height, width)` of one processed image.

        The result depends only on the configured `size`; `images` is not inspected.
        """
        return self.num_channels, self.size["height"], self.size["width"]

    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
        """Generate a batch of test images through the shared `prepare_image_inputs` helper."""
        return prepare_image_inputs(
            batch_size=self.batch_size,
            num_channels=self.num_channels,
            min_resolution=self.min_resolution,
            max_resolution=self.max_resolution,
            equal_resolution=equal_resolution,
            numpify=numpify,
            torchify=torchify,
        )
@require_torch
@require_vision
class Sapiens2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    """Tests for the Sapiens2 image processor.

    Covers processor attributes, `from_dict` overrides, preprocessing of images
    together with segmentation maps, and the `post_process_*` helpers.
    """

    def setUp(self):
        super().setUp()
        self.image_processor_tester = Sapiens2ImageProcessingTester(self)

    @property
    def image_processor_dict(self):
        # Keyword arguments used to build every image processor under test.
        return self.image_processor_tester.prepare_image_processor_dict()

    def _check_segmentation_encoding(self, encoding, expected_batch_size):
        # Shared assertions for `test_call_segmentation_maps`: pixel values and labels must
        # have the configured size, labels must be int64 and lie in the [0, 255] range.
        tester = self.image_processor_tester
        height, width = tester.size["height"], tester.size["width"]
        self.assertEqual(
            encoding["pixel_values"].shape,
            (expected_batch_size, tester.num_channels, height, width),
        )
        self.assertEqual(encoding["labels"].shape, (expected_batch_size, height, width))
        self.assertEqual(encoding["labels"].dtype, torch.long)
        # assertGreaterEqual/assertLessEqual report the offending value on failure.
        self.assertGreaterEqual(encoding["labels"].min().item(), 0)
        self.assertLessEqual(encoding["labels"].max().item(), 255)

    def test_image_processor_properties(self):
        expected_attributes = (
            "do_resize",
            "size",
            "do_normalize",
            "image_mean",
            "image_std",
            "do_reduce_labels",
        )
        for image_processing_class in self.image_processing_classes.values():
            image_processing = image_processing_class(**self.image_processor_dict)
            for attribute in expected_attributes:
                self.assertTrue(hasattr(image_processing, attribute))

    def test_image_processor_from_dict_with_kwargs(self):
        for image_processing_class in self.image_processing_classes.values():
            image_processor = image_processing_class.from_dict(self.image_processor_dict)
            self.assertEqual(image_processor.size, {"height": 20, "width": 18})
            self.assertEqual(image_processor.do_reduce_labels, False)

            # Keyword arguments passed to `from_dict` take precedence over the dict values.
            image_processor = image_processing_class.from_dict(
                self.image_processor_dict, size={"height": 42, "width": 42}, do_reduce_labels=True
            )
            self.assertEqual(image_processor.size, {"height": 42, "width": 42})
            self.assertEqual(image_processor.do_reduce_labels, True)

    def test_call_segmentation_maps(self):
        for image_processing_class in self.image_processing_classes.values():
            image_processing = image_processing_class(**self.image_processor_dict)
            image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
            # One all-zero integer map per image, matching that image's spatial size.
            maps = [torch.zeros(image.shape[-2:]).long() for image in image_inputs]

            # Single image + map
            encoding = image_processing(image_inputs[0], maps[0], return_tensors="pt")
            self._check_segmentation_encoding(encoding, expected_batch_size=1)

            # Batched images + maps
            encoding = image_processing(image_inputs, maps, return_tensors="pt")
            self._check_segmentation_encoding(
                encoding, expected_batch_size=self.image_processor_tester.batch_size
            )

    def test_post_process_semantic_segmentation(self):
        image_processor = Sapiens2ImageProcessor()
        batch_size = 2
        num_labels = 3
        height = width = 16
        outputs = SemanticSegmenterOutput(logits=torch.randn(batch_size, num_labels, height, width))

        # without target_sizes: spatial dims match logits
        segmentation = image_processor.post_process_semantic_segmentation(outputs)
        self.assertEqual(len(segmentation), batch_size)
        self.assertEqual(segmentation[0].shape, torch.Size([height, width]))

        # with target_sizes: output is resized to requested size
        target_sizes = [(height * 2, width * 2)] * batch_size
        segmentation = image_processor.post_process_semantic_segmentation(outputs, target_sizes=target_sizes)
        self.assertEqual(len(segmentation), batch_size)
        self.assertEqual(segmentation[0].shape, torch.Size([height * 2, width * 2]))

        # mismatched batch size raises ValueError
        with self.assertRaises(ValueError):
            image_processor.post_process_semantic_segmentation(outputs, target_sizes=[(100, 100)])

    def test_post_process_normal_estimation(self):
        image_processor = Sapiens2ImageProcessor()
        batch_size = 2
        num_labels = 3
        height = width = 16
        outputs = Sapiens2NormalEstimatorOutput(normals=torch.randn(batch_size, num_labels, height, width))

        # without target_sizes: spatial dims match normals, values are L2-normalized
        result = image_processor.post_process_normal_estimation(outputs)
        self.assertEqual(len(result), batch_size)
        self.assertEqual(result[0]["normals"].shape, torch.Size([num_labels, height, width]))
        norms = result[0]["normals"].norm(p=2, dim=0)
        torch.testing.assert_close(norms, torch.ones_like(norms), rtol=1e-4, atol=1e-4)

        # with target_sizes: output is resized before normalization
        target_sizes = [(height * 2, width * 2)] * batch_size
        result = image_processor.post_process_normal_estimation(outputs, target_sizes=target_sizes)
        self.assertEqual(len(result), batch_size)
        self.assertEqual(result[0]["normals"].shape, torch.Size([num_labels, height * 2, width * 2]))

        # mismatched batch size raises ValueError
        with self.assertRaises(ValueError):
            image_processor.post_process_normal_estimation(outputs, target_sizes=[(100, 100)])

    def test_post_process_pointmap_estimation(self):
        image_processor = Sapiens2ImageProcessor()
        batch_size = 2
        num_labels = 3
        height = width = 16
        outputs = Sapiens2PointmapEstimatorOutput(pointmaps=torch.randn(batch_size, num_labels, height, width))

        # without target_sizes: spatial dims match pointmap
        result = image_processor.post_process_pointmap_estimation(outputs)
        self.assertEqual(len(result), batch_size)
        self.assertEqual(result[0]["pointmap"].shape, torch.Size([num_labels, height, width]))

        # with target_sizes: output is resized to requested size
        target_sizes = [(height * 2, width * 2)] * batch_size
        result = image_processor.post_process_pointmap_estimation(outputs, target_sizes=target_sizes)
        self.assertEqual(len(result), batch_size)
        self.assertEqual(result[0]["pointmap"].shape, torch.Size([num_labels, height * 2, width * 2]))

        # with scales: scale division is applied (ones / 2.0 -> 0.5, ones / 0.5 -> 2.0)
        scale = torch.tensor([[2.0], [0.5]])
        outputs_with_scale = Sapiens2PointmapEstimatorOutput(
            pointmaps=torch.ones(batch_size, num_labels, height, width), scales=scale
        )
        result = image_processor.post_process_pointmap_estimation(outputs_with_scale)
        torch.testing.assert_close(result[0]["pointmap"], torch.full((num_labels, height, width), 0.5))
        torch.testing.assert_close(result[1]["pointmap"], torch.full((num_labels, height, width), 2.0))

        # mismatched batch size raises ValueError
        with self.assertRaises(ValueError):
            image_processor.post_process_pointmap_estimation(outputs, target_sizes=[(100, 100)])

    def test_post_process_image_matting(self):
        image_processor = Sapiens2ImageProcessor()
        batch_size = 2
        height = width = 16
        outputs = Sapiens2ImageMattingOutput(
            foregrounds=torch.rand(batch_size, 3, height, width),
            alphas=torch.rand(batch_size, 1, height, width),
        )

        # without target_sizes: spatial dims unchanged
        result = image_processor.post_process_image_matting(outputs)
        self.assertEqual(len(result), batch_size)
        self.assertEqual(result[0]["foreground"].shape, torch.Size([3, height, width]))
        self.assertEqual(result[0]["alpha"].shape, torch.Size([1, height, width]))

        # values stay in [0, 1]
        self.assertGreaterEqual(result[0]["alpha"].min().item(), 0.0)
        self.assertLessEqual(result[0]["alpha"].max().item(), 1.0)

        # with target_sizes: output is resized
        target_sizes = [(height * 2, width * 2)] * batch_size
        result = image_processor.post_process_image_matting(outputs, target_sizes=target_sizes)
        self.assertEqual(result[0]["foreground"].shape, torch.Size([3, height * 2, width * 2]))

        # mismatched batch size raises ValueError
        with self.assertRaises(ValueError):
            image_processor.post_process_image_matting(outputs, target_sizes=[(100, 100)])

View File

@@ -0,0 +1,743 @@
# Copyright 2026 the HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Sapiens2 model."""
import unittest
from functools import cached_property
from transformers import Sapiens2Config, Sapiens2HeadConfig
from transformers.testing_utils import Expectations, require_cv2, require_torch, require_vision, slow, torch_device
from transformers.utils import is_torch_available
from ...test_backbone_common import BackboneTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin
from ...test_processing_common import url_to_local_path
if is_torch_available():
import torch
from torch import nn
from transformers import (
Sapiens2Backbone,
Sapiens2ForImageMatting,
Sapiens2ForNormalEstimation,
Sapiens2ForPointmapEstimation,
Sapiens2ForPoseEstimation,
Sapiens2ForSemanticSegmentation,
Sapiens2ImageProcessor,
Sapiens2Model,
)
from transformers.image_utils import load_image_as_tensor
from transformers.models.sapiens2.modeling_sapiens2 import (
Sapiens2ImageMattingOutput,
Sapiens2PointmapEstimatorOutput,
)
class Sapiens2ModelTester:
    """Builds tiny Sapiens2 configs and inputs, and checks output shapes for every model head."""

    def __init__(
        self,
        parent,
        batch_size=13,
        image_size=30,
        patch_size=2,
        num_channels=3,
        is_training=False,
        use_labels=True,
        hidden_size=32,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=37,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        type_sequence_label_size=10,
        initializer_range=0.02,
        num_register_tokens=2,
        mask_ratio=0.5,
        scope=None,
    ):
        """Store the test hyper-parameters and derive the expected sequence length.

        `parent` is the test case whose assertion methods are used by the
        `create_and_check_*` methods; the remaining arguments are stored as
        attributes and most are forwarded to `Sapiens2Config` by `get_config`.
        """
        self.parent = parent
        self.batch_size = batch_size
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.is_training = is_training
        self.use_labels = use_labels
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.type_sequence_label_size = type_sequence_label_size
        self.initializer_range = initializer_range
        self.num_register_tokens = num_register_tokens
        self.scope = scope

        # seq length = num_patches + 1 (CLS token) + num_register_tokens
        num_patches = (image_size // patch_size) ** 2
        self.seq_length = num_patches + 1 + self.num_register_tokens
        self.mask_ratio = mask_ratio
        self.num_masks = int(mask_ratio * self.seq_length)
        self.mask_length = num_patches

    @staticmethod
    def _pixel_shuffle_output_size(input_size, kernel_sizes):
        """Return the spatial size after the pixel-shuffle upsampling layers.

        PixelShuffle: Conv2d(padding=(ks-1)//2) then shuffle(2) — size per layer: (h + 2p - ks + 1) * 2
        """
        size = input_size
        for kernel_size in kernel_sizes:
            padding = (kernel_size - 1) // 2
            size = (size + 2 * padding - kernel_size + 1) * 2
        return size

    def prepare_config_and_inputs(self):
        """Return `(config, pixel_values, labels)`; `labels` is None when `use_labels` is False."""
        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])

        labels = None
        if self.use_labels:
            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)

        config = self.get_config()
        return config, pixel_values, labels

    def get_config(self):
        """Build a small `Sapiens2Config` from the stored hyper-parameters."""
        return Sapiens2Config(
            image_size=self.image_size,
            patch_size=self.patch_size,
            num_channels=self.num_channels,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            is_decoder=False,
            initializer_range=self.initializer_range,
            num_register_tokens=self.num_register_tokens,
            stage_names=["stem"] + [f"stage{i}" for i in range(1, self.num_hidden_layers + 1)],
            out_indices=[0, 1],
            reshape_hidden_states=True,
            num_labels=4,
            flip_pairs=[[1, 2], [3, 4]],
            # Head config sized to satisfy all model conversion patterns in test_reverse_loading_mapping
            head_config=Sapiens2HeadConfig(
                upsample_out_channels=[8, 4, 4, 4],
                upsample_kernel_sizes=[4, 4, 4, 4],
                conv_out_channels=[4, 4, 4],
                conv_kernel_sizes=[1, 1, 1],
                scale_conv_out_channels=[8, 4, 4],
                scale_conv_kernel_sizes=[1, 1, 1],
                scale_final_hidden_sizes=[8, 4],
            ),
        )

    def create_and_check_backbone(self, config, pixel_values, labels):
        """Check that the backbone returns two feature maps of shape (batch, hidden, patches, patches)."""
        config.out_features = ["stage1", "stage2"]
        config.reshape_hidden_states = True
        model = Sapiens2Backbone(config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            outputs = model(pixel_values)

        self.parent.assertEqual(len(outputs.feature_maps), 2)
        for fm in outputs.feature_maps:
            b, c, h, w = fm.shape
            self.parent.assertEqual(b, self.batch_size)
            self.parent.assertEqual(c, self.hidden_size)
            self.parent.assertEqual(h, self.image_size // self.patch_size)
            self.parent.assertEqual(w, self.image_size // self.patch_size)

    def create_and_check_model(self, config, pixel_values, labels):
        """Check the shape of the base model's last hidden state."""
        model = Sapiens2Model(config=config)
        model.to(torch_device)
        model.eval()
        # No gradients are needed for a shape check; matches the other create_and_check_* methods.
        with torch.no_grad():
            result = model(pixel_values)
        self.parent.assertEqual(
            result.last_hidden_state.shape,
            (self.batch_size, self.seq_length, self.hidden_size),
        )

    def create_and_check_for_semantic_segmentation(self, config, pixel_values, labels):
        """Check the logits shape of the semantic segmentation head."""
        model = Sapiens2ForSemanticSegmentation(config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            result = model(pixel_values)
        # patch_height = image_size // patch_size = 30 // 2 = 15
        # 4 deconv layers with stride=2: 15 * 2^4 = 240
        patch_height = self.image_size // self.patch_size
        expected_h = patch_height * (2 ** len(config.head_config.upsample_out_channels))
        self.parent.assertEqual(
            result.logits.shape,
            (self.batch_size, config.num_labels, expected_h, expected_h),
        )

    def create_and_check_for_pose_estimation(self, config, pixel_values, labels):
        """Check the heatmaps shape of the pose estimation head."""
        model = Sapiens2ForPoseEstimation(config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            result = model(pixel_values)
        patch_height = self.image_size // self.patch_size
        expected_h = patch_height * (2 ** len(config.head_config.upsample_out_channels))
        self.parent.assertEqual(
            result.heatmaps.shape,
            (self.batch_size, config.num_labels, expected_h, expected_h),
        )

    def create_and_check_for_normal_estimation(self, config, pixel_values, labels):
        """Check the normals shape, the absent loss, and that passing labels is not implemented."""
        model = Sapiens2ForNormalEstimation(config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            result = model(pixel_values)
        expected_h = self._pixel_shuffle_output_size(
            config.image_size // self.patch_size, config.head_config.upsample_kernel_sizes
        )
        self.parent.assertEqual(
            result.normals.shape,
            (self.batch_size, config.num_labels, expected_h, expected_h),
        )
        self.parent.assertIsNone(result.loss)
        with self.parent.assertRaises(NotImplementedError):
            model(pixel_values, labels=torch.randn_like(result.normals))

    def create_and_check_for_matting(self, config, pixel_values, labels):
        """Check foreground/alpha shapes and value ranges, and that passing labels is not implemented."""
        model = Sapiens2ForImageMatting(config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            result = model(pixel_values)
        expected_h = self._pixel_shuffle_output_size(
            config.image_size // self.patch_size, config.head_config.upsample_kernel_sizes
        )
        self.parent.assertEqual(result.foregrounds.shape, (self.batch_size, 3, expected_h, expected_h))
        self.parent.assertEqual(result.alphas.shape, (self.batch_size, 1, expected_h, expected_h))
        # outputs are sigmoid-activated
        self.parent.assertGreaterEqual(result.foregrounds.min().item(), 0.0)
        self.parent.assertLessEqual(result.foregrounds.max().item(), 1.0)
        self.parent.assertGreaterEqual(result.alphas.min().item(), 0.0)
        self.parent.assertLessEqual(result.alphas.max().item(), 1.0)
        self.parent.assertIsNone(result.loss)
        with self.parent.assertRaises(NotImplementedError):
            model(pixel_values, labels=torch.randn(self.batch_size, 4, expected_h, expected_h))

    def create_and_check_for_pointmap_estimation(self, config, pixel_values, labels):
        """Check pointmap and scale shapes, the absent loss, and that passing labels is not implemented."""
        model = Sapiens2ForPointmapEstimation(config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            result = model(pixel_values)
        expected_h = self._pixel_shuffle_output_size(
            config.image_size // self.patch_size, config.head_config.upsample_kernel_sizes
        )
        self.parent.assertEqual(
            result.pointmaps.shape,
            (self.batch_size, config.num_labels, expected_h, expected_h),
        )
        self.parent.assertEqual(result.scales.shape, (self.batch_size, 1))
        self.parent.assertIsNone(result.loss)
        with self.parent.assertRaises(NotImplementedError):
            model(pixel_values, labels=torch.randn_like(result.pointmaps))

    def prepare_config_and_inputs_for_semantic_segmentation(self):
        """Return `(config, pixel_values, labels)` with per-pixel integer labels."""
        config = self.get_config()
        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
        labels = ids_tensor([self.batch_size, self.image_size, self.image_size], config.num_labels)
        return config, pixel_values, labels

    def prepare_config_and_inputs_for_pointmap_estimation(self):
        """Return `(config, pixel_values, None)` with pixel-shuffle upsampling enabled in the head."""
        config = self.get_config()
        config.head_config.use_pixel_shuffle = True
        pixel_values = floats_tensor([self.batch_size, self.num_channels, config.image_size, config.image_size])
        labels = None
        return config, pixel_values, labels

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` for the shared model tests."""
        config = self.get_config()
        # Use pixel-shuffle so all model classes (including Normal/Pointmap/Matting) instantiate
        # decode_head.input_conv, satisfying the conversion patterns checked by test_reverse_loading_mapping.
        config.head_config.use_pixel_shuffle = True
        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
        inputs_dict = {"pixel_values": pixel_values}
        return config, inputs_dict
@require_torch
class Sapiens2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
    """
    Common model tests for Sapiens2. Some tests of test_modeling_common.py are overwritten here because
    Sapiens2 does not use input_ids, inputs_embeds, attention_mask and seq_length.
    """

    all_model_classes = (
        (
            Sapiens2Model,
            Sapiens2Backbone,
            Sapiens2ForImageMatting,
            Sapiens2ForNormalEstimation,
            Sapiens2ForPointmapEstimation,
            Sapiens2ForPoseEstimation,
            Sapiens2ForSemanticSegmentation,
        )
        if is_torch_available()
        else ()
    )
    pipeline_model_mapping = {"image-feature-extraction": Sapiens2Model} if is_torch_available() else {}

    test_resize_embeddings = False

    def setUp(self):
        self.model_tester = Sapiens2ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=Sapiens2Config, has_text_modality=False, hidden_size=32)
        # ConvTranspose2d layers in the decoder heads are non-deterministic on CUDA, and the
        # InstanceNorm2d layers amplify that into output differences of up to 6e-3 for identical
        # head inputs. Force deterministic cuDNN for stable tests; the previous value is
        # remembered so tearDown can restore it.
        self._original_cudnn_deterministic = torch.backends.cudnn.deterministic
        torch.backends.cudnn.deterministic = True

    def tearDown(self):
        # Restore the cuDNN determinism flag saved in setUp.
        torch.backends.cudnn.deterministic = self._original_cudnn_deterministic

    def test_backbone(self):
        config, pixel_values, labels = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_backbone(config, pixel_values, labels)

    def test_config(self):
        self.config_tester.run_common_tests()

    @unittest.skip(reason="Sapiens2 does not use inputs_embeds")
    def test_inputs_embeds(self):
        pass

    def test_model_get_set_embeddings(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            self.assertIsInstance(model.get_input_embeddings(), nn.Module)
            # Output embeddings are optional, but must be a linear layer when present.
            output_embeddings = model.get_output_embeddings()
            self.assertTrue(output_embeddings is None or isinstance(output_embeddings, nn.Linear))

    def test_model(self):
        config, pixel_values, labels = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(config, pixel_values, labels)

    def test_for_semantic_segmentation(self):
        config, pixel_values, labels = self.model_tester.prepare_config_and_inputs_for_semantic_segmentation()
        self.model_tester.create_and_check_for_semantic_segmentation(config, pixel_values, labels)

    def test_for_pose_estimation(self):
        # Reuses the semantic segmentation inputs for the pose estimation check.
        config, pixel_values, labels = self.model_tester.prepare_config_and_inputs_for_semantic_segmentation()
        self.model_tester.create_and_check_for_pose_estimation(config, pixel_values, labels)

    def test_for_pointmap_estimation(self):
        config, pixel_values, labels = self.model_tester.prepare_config_and_inputs_for_pointmap_estimation()
        self.model_tester.create_and_check_for_pointmap_estimation(config, pixel_values, labels)

    def test_for_normal_estimation(self):
        # Reuses the pointmap estimation inputs for the normal estimation check.
        config, pixel_values, labels = self.model_tester.prepare_config_and_inputs_for_pointmap_estimation()
        self.model_tester.create_and_check_for_normal_estimation(config, pixel_values, labels)

    def test_for_matting(self):
        # Reuses the pointmap estimation inputs for the matting check.
        config, pixel_values, labels = self.model_tester.prepare_config_and_inputs_for_pointmap_estimation()
        self.model_tester.create_and_check_for_matting(config, pixel_values, labels)

    def test_batching_equivalence(self, atol=1e-4, rtol=1e-4):
        # The decoder heads use InstanceNorm2d, which computes per-instance statistics; different
        # batch sizes can take different parallelisation paths on CPU and yield O(1e-5) FP
        # differences, hence the explicit tolerances passed to the parent test.
        super().test_batching_equivalence(atol=atol, rtol=rtol)

    @unittest.skip(reason="Sapiens2 does not support feedforward chunking")
    def test_feed_forward_chunking(self):
        pass
def prepare_img():
    """Load the COCO val2017 fixture image used by the integration tests.

    The URL is passed through `url_to_local_path` and the result is loaded
    with `load_image_as_tensor`.
    """
    fixture_url = (
        "https://huggingface.co/datasets/hf-internal-testing/fixtures-coco/resolve/main/val2017/000000004016.png"
    )
    return load_image_as_tensor(url_to_local_path(fixture_url))
@require_torch
@require_vision
class Sapiens2ModelIntegrationTest(unittest.TestCase):
def setUp(self):
# The decoder heads contain ConvTranspose2d layers which are non-deterministic on CUDA.
# This non-deterministic behavior is amplified by the InstanceNorm2d layers and results in up
# to 6e-3 output differences with identical head inputs. We set cudnn.deterministic = True
# for test stability.
self._original_cudnn_deterministic = torch.backends.cudnn.deterministic
torch.backends.cudnn.deterministic = True
def tearDown(self):
torch.backends.cudnn.deterministic = self._original_cudnn_deterministic
@cached_property
def default_image_processor(self):
return Sapiens2ImageProcessor.from_pretrained("facebook/sapiens2-pretrain-0.4b")
@slow
def test_inference_no_head(self):
model = Sapiens2Model.from_pretrained("facebook/sapiens2-pretrain-0.4b").eval().to(torch_device)
image_processor = self.default_image_processor
image = prepare_img()
inputs = image_processor(image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
# verify the last hidden states
# seq length = num_patches + 1 (CLS token) + num_register_tokens
_, _, height, width = inputs["pixel_values"].shape
num_patches = (height // model.config.patch_size) * (width // model.config.patch_size)
expected_seq_length = num_patches + 1 + model.config.num_register_tokens
expected_shape = torch.Size((1, expected_seq_length, model.config.hidden_size))
self.assertEqual(outputs.last_hidden_state.shape, expected_shape)
last_layer_cls_token = outputs.pooler_output
EXPECTED_CLS_SLICE = Expectations({("cuda", None): [-0.09233, -0.00107, -0.12215, 0.07374, -0.03773]})
expected_cls_slice = torch.tensor(EXPECTED_CLS_SLICE.get_expectation(), device=torch_device)
torch.testing.assert_close(last_layer_cls_token[0, :5], expected_cls_slice, rtol=1e-3, atol=1e-3)
last_layer_register_tokens = outputs.last_hidden_state[:, 1 : model.config.num_register_tokens + 1]
EXPECTED_REGISTER_SLICE = Expectations({("cuda", None): [0.08412, 0.04387, 0.05709, -0.04962, 0.03715]})
expected_register_slice = torch.tensor(EXPECTED_REGISTER_SLICE.get_expectation(), device=torch_device)
torch.testing.assert_close(last_layer_register_tokens[0, 0, :5], expected_register_slice, rtol=1e-3, atol=1e-3)
last_layer_patch_tokens = outputs.last_hidden_state[:, model.config.num_register_tokens + 1 :]
EXPECTED_PATCH_SLICE = Expectations({("cuda", None): [0.14232, -0.11947, -0.05910, -0.09457, -0.11410]})
expected_patch_slice = torch.tensor(EXPECTED_PATCH_SLICE.get_expectation(), device=torch_device)
torch.testing.assert_close(last_layer_patch_tokens[0, 0, :5], expected_patch_slice, rtol=1e-3, atol=1e-3)
@slow
def test_inference_semantic_segmentation(self):
model = Sapiens2ForSemanticSegmentation.from_pretrained("facebook/sapiens2-seg-0.4b").eval().to(torch_device)
image_processor = self.default_image_processor
image = prepare_img()
inputs = image_processor(image, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
logits = outputs.logits
# verify the logits shape: segmentation head upsamples back to the original image resolution
_, _, height, width = inputs["pixel_values"].shape
expected_shape = torch.Size((1, model.config.num_labels, height, width))
self.assertEqual(logits.shape, expected_shape)
EXPECTED_LOGITS_SLICE = Expectations(
{("cuda", None): [[3.45260, 5.55483, 6.57901], [5.71913, 7.21420, 8.11209], [6.82645, 7.98208, 8.31385]]}
)
expected_logits_slice = torch.tensor(EXPECTED_LOGITS_SLICE.get_expectation(), device=torch_device)
torch.testing.assert_close(logits[0, 0, :3, :3], expected_logits_slice, rtol=1e-3, atol=1e-3)
# verify post-processing without resizing: output shape matches model input resolution
segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs)
self.assertEqual(len(segmentation), 1)
self.assertEqual(segmentation[0].shape, torch.Size([height, width]))
# verify post-processing with target_sizes
target_size = (height // 2, width // 2)
segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs, target_sizes=[target_size])
self.assertEqual(len(segmentation), 1)
self.assertEqual(segmentation[0].shape, torch.Size(target_size))
EXPECTED_CLASS_IDS = Expectations({("cuda", None): [[4, 3, 3], [3, 3, 3], [3, 3, 3]]})
expected_class_ids = torch.tensor(EXPECTED_CLASS_IDS.get_expectation(), device=torch_device)
torch.testing.assert_close(segmentation[0][50:53, 50:53], expected_class_ids)
@require_cv2
@slow
def test_inference_pose_estimation(self):
    # Integration test for the "facebook/sapiens2-pose-0.4b" checkpoint. In order, it checks:
    #   1. the raw heatmaps for a single person box,
    #   2. keypoints / scores / bbox returned by `post_process_pose_estimation`,
    #   3. that `target_sizes` without `source_sizes` raises `ValueError`,
    #   4. rescaling when both `source_sizes` and `target_sizes` are given,
    #   5. inference on horizontally flipped pixels, merged through `outputs_flipped`.
    model = Sapiens2ForPoseEstimation.from_pretrained("facebook/sapiens2-pose-0.4b")
    model = model.eval().to(torch_device)
    image_processor = self.default_image_processor
    image = prepare_img()
    image_height, image_width = image.shape[-2:]

    # person bbox in COCO format (x, y, w, h)
    boxes = [[[2.7080630e02, 5.7221174e-01, 2.9409006e02, 3.7946970e02]]]
    inputs = image_processor(image, boxes=boxes, return_tensors="pt").to(torch_device)
    with torch.no_grad():
        outputs = model(**inputs)

    # --- 1. raw heatmaps ---
    heatmaps = outputs.heatmaps
    self.assertEqual(heatmaps.shape, torch.Size([1, model.config.num_labels, 256, 192]))
    reference_heatmaps = torch.tensor(
        Expectations(
            {("cuda", None): [[0.26140, 0.24656, 0.21673], [0.33708, 0.31597, 0.28028], [0.41624, 0.39270, 0.35014]]}
        ).get_expectation(),
        device=torch_device,
    )
    torch.testing.assert_close(heatmaps[0, 0, 70:73, 70:73], reference_heatmaps, rtol=1e-2, atol=1e-2)

    # --- 2. post-processing: one image, one person ---
    results = image_processor.post_process_pose_estimation(outputs, boxes=boxes)
    self.assertEqual(len(results), 1)
    self.assertEqual(len(results[0]), 1)
    person = results[0][0]

    keypoints = person["keypoints"]
    reference_keypoints = torch.tensor(
        Expectations(
            {("cuda", None): [[364.33920111, 97.92528764], [373.25104943, 80.97749201], [353.21072316, 83.38954486]]}
        ).get_expectation(),
        device=torch_device,
    )
    torch.testing.assert_close(keypoints[:3], reference_keypoints, rtol=1e-2, atol=1e-2)

    scores = person["scores"]
    reference_scores = torch.tensor(
        Expectations({("cuda", None): [1.0007433, 0.9987416, 1.0015154]}).get_expectation(),
        device=torch_device,
    )
    torch.testing.assert_close(scores[:3], reference_scores, rtol=1e-2, atol=1e-2)

    # The returned box is compared with the input box converted from (x, y, w, h) to corner form.
    # The additions are done on tensors so the reference keeps the tensor's own precision.
    returned_bbox = person["bbox"]
    box_x, box_y, box_w, box_h = torch.tensor(boxes[0][0], device=torch_device)
    bbox_xyxy = torch.tensor([box_x, box_y, box_x + box_w, box_y + box_h], device=torch_device)
    torch.testing.assert_close(returned_bbox, bbox_xyxy, rtol=1e-3, atol=1e-3)

    # --- 3. target_sizes without source_sizes must raise ---
    with self.assertRaises(ValueError):
        image_processor.post_process_pose_estimation(outputs, boxes=boxes, target_sizes=[(432, 640)])

    # --- 4. source_sizes + target_sizes: keypoints and bbox scaled by target/source (here x2) ---
    doubled_results = image_processor.post_process_pose_estimation(
        outputs,
        boxes=boxes,
        source_sizes=[(image_height, image_width)],
        target_sizes=[(image_height * 2, image_width * 2)],
    )
    doubled_person = doubled_results[0][0]
    torch.testing.assert_close(doubled_person["keypoints"], keypoints * 2.0)
    torch.testing.assert_close(doubled_person["bbox"], bbox_xyxy * 2.0)

    # --- 5. flipping: mirror the pixels along the last axis and hand the model its flip pairs ---
    mirrored_pixels = inputs["pixel_values"].flip(-1)
    flip_pairs = torch.tensor(model.config.flip_pairs)
    with torch.no_grad():
        flipped_outputs = model(pixel_values=mirrored_pixels, flip_pairs=flip_pairs)

    reference_flipped_heatmaps = torch.tensor(
        Expectations(
            {("cuda", None): [[0.27348, 0.25426, 0.22496], [0.34877, 0.32563, 0.28418], [0.43967, 0.40607, 0.35721]]}
        ).get_expectation(),
        device=torch_device,
    )
    torch.testing.assert_close(
        flipped_outputs.heatmaps[0, 0, 70:73, 70:73], reference_flipped_heatmaps, rtol=1e-2, atol=1e-2
    )

    # Post-process the regular and flipped outputs together.
    merged_results = image_processor.post_process_pose_estimation(
        outputs, outputs_flipped=flipped_outputs, boxes=boxes
    )
    self.assertEqual(len(merged_results), 1)
    self.assertEqual(len(merged_results[0]), 1)
    merged_person = merged_results[0][0]

    merged_keypoints = merged_person["keypoints"]
    reference_merged_keypoints = torch.tensor(
        Expectations(
            {("cuda", None): [[364.14644305, 97.99268751], [373.66756367, 81.19966519], [353.4574526, 83.647911]]}
        ).get_expectation(),
        device=torch_device,
    )
    torch.testing.assert_close(merged_keypoints[:3], reference_merged_keypoints, rtol=1e-2, atol=1e-2)

    merged_scores = merged_person["scores"]
    reference_merged_scores = torch.tensor(
        Expectations({("cuda", None): [1.0064079, 0.98746514, 0.99821794]}).get_expectation(),
        device=torch_device,
    )
    torch.testing.assert_close(merged_scores[:3], reference_merged_scores, rtol=1e-2, atol=1e-2)

    # The box is expected to be the same as in the non-flipped run.
    torch.testing.assert_close(merged_person["bbox"], bbox_xyxy, rtol=1e-3, atol=1e-3)
@slow
def test_inference_normal_estimation(self):
    # Integration test for the "facebook/sapiens2-normal-0.4b" checkpoint: checks the raw normals
    # produced by the model and the output of `post_process_normal_estimation` with `source_sizes`.
    checkpoint = "facebook/sapiens2-normal-0.4b"
    model = Sapiens2ForNormalEstimation.from_pretrained(checkpoint).eval().to(torch_device)
    # This test loads the processor shipped with the checkpoint.
    image_processor = Sapiens2ImageProcessor.from_pretrained(checkpoint)

    image = prepare_img()
    image_height, image_width = image.shape[-2:]
    inputs = image_processor(image, return_tensors="pt").to(torch_device)
    with torch.no_grad():
        outputs = model(**inputs)

    # Raw normals: three channels at the spatial size of `pixel_values`.
    _, _, input_height, input_width = inputs["pixel_values"].shape
    self.assertEqual(outputs.normals.shape, torch.Size([1, 3, input_height, input_width]))

    # We can get closer to expected values by using cv2 resize instead of torchvision.
    reference_normals = torch.tensor(
        Expectations(
            {("cuda", None): [[0.9577, 1.8808, 0.9826], [1.6904, 1.7351, 1.9120], [2.4828, 1.9887, 2.5168]]}
        ).get_expectation(),
        device=torch_device,
    )
    torch.testing.assert_close(outputs.normals[0, 0, :3, :3], reference_normals, rtol=1e-2, atol=1e-2)

    # Post-processing: one entry per image.
    post_processed = image_processor.post_process_normal_estimation(
        outputs, source_sizes=[(image_height, image_width)]
    )
    self.assertEqual(len(post_processed), 1)
    normal_map = post_processed[0]["normals"]
    # NOTE(review): hard-coded size; presumably equal to (image_height, image_width) of the fixture
    # image -- confirm before replacing the literals.
    self.assertEqual(normal_map.shape, torch.Size([3, 432, 640]))

    reference_normal_map = torch.tensor(
        Expectations(
            {("cuda", None): [[-0.8266, -0.7899, -0.7512], [-0.8227, -0.7843, -0.7440], [-0.8098, -0.7721, -0.7318]]}
        ).get_expectation(),
        device=torch_device,
    )
    torch.testing.assert_close(normal_map[0, :3, :3], reference_normal_map, rtol=1e-2, atol=1e-2)
@slow
def test_inference_pointmap_estimation(self):
    # Integration test for the "facebook/sapiens2-pointmap-0.4b" checkpoint: checks the output type,
    # the raw pointmaps and scales, and the result of `post_process_pointmap_estimation`.
    checkpoint = "facebook/sapiens2-pointmap-0.4b"
    model = Sapiens2ForPointmapEstimation.from_pretrained(checkpoint).eval().to(torch_device)
    # This test loads the processor shipped with the checkpoint.
    image_processor = Sapiens2ImageProcessor.from_pretrained(checkpoint)

    image = prepare_img()
    image_height, image_width = image.shape[-2:]
    inputs = image_processor(image, return_tensors="pt").to(torch_device)
    with torch.no_grad():
        outputs = model(**inputs)
    self.assertIsInstance(outputs, Sapiens2PointmapEstimatorOutput)

    # Raw outputs: a 3-channel pointmap at the `pixel_values` resolution plus one scale per image.
    _, _, input_height, input_width = inputs["pixel_values"].shape
    self.assertEqual(outputs.pointmaps.shape, torch.Size([1, 3, input_height, input_width]))
    self.assertEqual(outputs.scales.shape, torch.Size([1, 1]))

    reference_scale = torch.tensor(
        Expectations({("cuda", None): [[0.9931]]}).get_expectation(),
        device=torch_device,
    )
    torch.testing.assert_close(outputs.scales, reference_scale, rtol=1e-3, atol=1e-3)

    reference_pointmap = torch.tensor(
        Expectations(
            {("cuda", None): [[-0.0096, -0.0567, -0.0460], [-0.0657, -0.0583, -0.0688], [-0.1035, -0.0363, -0.0659]]}
        ).get_expectation(),
        device=torch_device,
    )
    torch.testing.assert_close(outputs.pointmaps[0, 0, :3, :3], reference_pointmap, rtol=1e-2, atol=1e-2)

    # Post-processing with `source_sizes`: one entry per image, at the fixture image's size.
    post_processed = image_processor.post_process_pointmap_estimation(
        outputs, source_sizes=[(image_height, image_width)]
    )
    self.assertEqual(len(post_processed), 1)
    pointmap = post_processed[0]["pointmap"]
    self.assertEqual(pointmap.shape, torch.Size([3, image_height, image_width]))

    # Head and post-processing are exactly identical to original code but differences from backbone
    # get amplified after scaling and resizing so we need to relax the tolerance here.
    reference_post_processed = torch.tensor(
        Expectations(
            {("cuda", None): [[0.0771, 0.1335, 0.3025], [-0.1179, 0.2904, 0.7140], [0.0337, 0.3037, 0.4390]]}
        ).get_expectation(),
        device=torch_device,
    )
    torch.testing.assert_close(pointmap[0, :3, :3], reference_post_processed, rtol=1e-2, atol=1e-2)
@slow
def test_inference_matting(self):
    # Integration test for the "facebook/sapiens2-matting-1b" checkpoint: checks the output type,
    # the raw foreground/alpha shapes and values, and the alpha / foreground / composite maps
    # returned by `post_process_image_matting` when a background is supplied.
    model = Sapiens2ForImageMatting.from_pretrained("facebook/sapiens2-matting-1b")
    model = model.eval().to(torch_device)
    image_processor = self.default_image_processor

    image = prepare_img()
    image_height, image_width = image.shape[-2:]
    inputs = image_processor(image, return_tensors="pt").to(torch_device)
    with torch.no_grad():
        outputs = model(**inputs)
    self.assertIsInstance(outputs, Sapiens2ImageMattingOutput)

    # Raw outputs: 3-channel foregrounds and 1-channel alphas at the `pixel_values` resolution.
    _, _, input_height, input_width = inputs["pixel_values"].shape
    self.assertEqual(outputs.foregrounds.shape, torch.Size([1, 3, input_height, input_width]))
    self.assertEqual(outputs.alphas.shape, torch.Size([1, 1, input_height, input_width]))

    # Difference due to cv2 vs torchvision pre-processing. Model outputs are equal on same tensor input.
    reference_raw_foregrounds = torch.tensor(
        Expectations(
            {("cuda", None): [[0.1432, 0.2051, 0.3043], [0.1889, 0.2681, 0.3509], [0.2511, 0.3076, 0.4047]]}
        ).get_expectation(),
        device=torch_device,
    )
    torch.testing.assert_close(
        outputs.foregrounds[0, 0, 100:103, 100:103], reference_raw_foregrounds, rtol=1e-2, atol=1e-2
    )

    # Three background values reshaped to (3, 1, 1); presumably one colour value per channel.
    background = torch.tensor([177, 64, 0], device=torch_device).view(3, 1, 1)
    post_processed = image_processor.post_process_image_matting(
        outputs, target_sizes=[(image_height, image_width)], backgrounds=background
    )
    self.assertEqual(len(post_processed), 1)
    matte = post_processed[0]
    alpha, foreground, composite = matte["alpha"], matte["foreground"], matte["composite"]

    # All three maps come back at the requested target size; only the channel count differs.
    for tensor, num_channels in ((alpha, 1), (foreground, 3), (composite, 3)):
        self.assertEqual(tensor.shape, (num_channels, image_height, image_width))

    reference_alpha = torch.tensor(
        Expectations(
            {
                ("cuda", None): [
                    [0.99995, 0.9999123, 0.9997628],
                    [0.99991906, 0.9997431, 0.99754137],
                    [0.9997362, 0.99711365, 0.9444071],
                ]
            }
        ).get_expectation(),
        device=torch_device,
    )
    torch.testing.assert_close(alpha[0, 300:303, 300:303], reference_alpha, rtol=1e-3, atol=1e-3)

    reference_foreground = torch.tensor(
        Expectations(
            {
                ("cuda", None): [
                    [0.7175647, 0.6906685, 0.65860075],
                    [0.7162684, 0.6867891, 0.64463294],
                    [0.6924842, 0.67141336, 0.5356377],
                ]
            }
        ).get_expectation(),
        device=torch_device,
    )
    torch.testing.assert_close(foreground[0, 300:303, 300:303], reference_foreground, rtol=1e-2, atol=1e-2)

    # The composite is compared as uint8, allowing a difference of at most one intensity level.
    reference_composite = torch.tensor(
        Expectations({("cuda", None): [[182, 176, 167], [182, 175, 164], [176, 171, 136]]}).get_expectation(),
        dtype=torch.uint8,
        device=torch_device,
    )
    torch.testing.assert_close(composite[0, 300:303, 300:303], reference_composite, rtol=0, atol=1)
@require_torch
class Sapiens2BackboneTest(unittest.TestCase, BackboneTesterMixin):
all_model_classes = (Sapiens2Backbone,) if is_torch_available() else ()
config_class = Sapiens2Config
def setUp(self):
self.model_tester = Sapiens2ModelTester(self)