Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled
744 lines
34 KiB
Python
744 lines
34 KiB
Python
# Copyright 2026 the HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Sapiens2 model."""
|
|
|
|
import unittest
|
|
from functools import cached_property
|
|
|
|
from transformers import Sapiens2Config, Sapiens2HeadConfig
|
|
from transformers.testing_utils import Expectations, require_cv2, require_torch, require_vision, slow, torch_device
|
|
from transformers.utils import is_torch_available
|
|
|
|
from ...test_backbone_common import BackboneTesterMixin
|
|
from ...test_configuration_common import ConfigTester
|
|
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
|
|
from ...test_pipeline_mixin import PipelineTesterMixin
|
|
from ...test_processing_common import url_to_local_path
|
|
|
|
|
|
if is_torch_available():
|
|
import torch
|
|
from torch import nn
|
|
|
|
from transformers import (
|
|
Sapiens2Backbone,
|
|
Sapiens2ForImageMatting,
|
|
Sapiens2ForNormalEstimation,
|
|
Sapiens2ForPointmapEstimation,
|
|
Sapiens2ForPoseEstimation,
|
|
Sapiens2ForSemanticSegmentation,
|
|
Sapiens2ImageProcessor,
|
|
Sapiens2Model,
|
|
)
|
|
from transformers.image_utils import load_image_as_tensor
|
|
from transformers.models.sapiens2.modeling_sapiens2 import (
|
|
Sapiens2ImageMattingOutput,
|
|
Sapiens2PointmapEstimatorOutput,
|
|
)
|
|
|
|
|
|
class Sapiens2ModelTester:
    """Builds tiny Sapiens2 configs and random inputs, and checks output shapes per model class.

    `parent` is the test case whose `assert*` methods are used by the `create_and_check_*` methods.
    All other constructor arguments are stored as attributes and forwarded to `Sapiens2Config`
    in `get_config` where applicable.
    """

    def __init__(
        self,
        parent,
        batch_size=13,
        image_size=30,
        patch_size=2,
        num_channels=3,
        is_training=False,
        use_labels=True,
        hidden_size=32,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=37,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        type_sequence_label_size=10,
        initializer_range=0.02,
        num_register_tokens=2,
        mask_ratio=0.5,
        scope=None,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.is_training = is_training
        self.use_labels = use_labels
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.type_sequence_label_size = type_sequence_label_size
        self.initializer_range = initializer_range
        self.num_register_tokens = num_register_tokens
        self.scope = scope

        # Sequence length = patch tokens + one extra token (presumably CLS) + register tokens.
        num_patches = (image_size // patch_size) ** 2
        self.seq_length = num_patches + 1 + self.num_register_tokens
        self.mask_ratio = mask_ratio
        self.num_masks = int(mask_ratio * self.seq_length)
        self.mask_length = num_patches

    @staticmethod
    def _pixel_shuffle_output_size(input_size, kernel_sizes):
        """Return the expected spatial size after the pixel-shuffle upsampling stages.

        Each stage is modelled as Conv2d(padding=(ks - 1) // 2) followed by a x2 pixel shuffle,
        i.e. `size -> (size + 2 * padding - ks + 1) * 2`. An empty `kernel_sizes` returns
        `input_size` unchanged.
        """
        size = input_size
        for kernel_size in kernel_sizes:
            padding = (kernel_size - 1) // 2
            size = (size + 2 * padding - kernel_size + 1) * 2
        return size

    def prepare_config_and_inputs(self):
        """Return `(config, pixel_values, labels)`; `labels` is None when `use_labels` is False."""
        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])

        labels = None
        if self.use_labels:
            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)

        config = self.get_config()

        return config, pixel_values, labels

    def get_config(self):
        """Build a small `Sapiens2Config` (with a nested `Sapiens2HeadConfig`) from the tester attributes."""
        return Sapiens2Config(
            image_size=self.image_size,
            patch_size=self.patch_size,
            num_channels=self.num_channels,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            is_decoder=False,
            initializer_range=self.initializer_range,
            num_register_tokens=self.num_register_tokens,
            stage_names=["stem"] + [f"stage{i}" for i in range(1, self.num_hidden_layers + 1)],
            out_indices=[0, 1],
            reshape_hidden_states=True,
            num_labels=4,
            flip_pairs=[[1, 2], [3, 4]],
            # Head config sized to satisfy all model conversion patterns in test_reverse_loading_mapping
            head_config=Sapiens2HeadConfig(
                upsample_out_channels=[8, 4, 4, 4],
                upsample_kernel_sizes=[4, 4, 4, 4],
                conv_out_channels=[4, 4, 4],
                conv_kernel_sizes=[1, 1, 1],
                scale_conv_out_channels=[8, 4, 4],
                scale_conv_kernel_sizes=[1, 1, 1],
                scale_final_hidden_sizes=[8, 4],
            ),
        )

    def create_and_check_backbone(self, config, pixel_values, labels):
        """Check that the backbone returns two (batch, hidden, H/patch, W/patch) feature maps."""
        config.out_features = ["stage1", "stage2"]
        config.reshape_hidden_states = True

        model = Sapiens2Backbone(config)
        model.to(torch_device)
        model.eval()

        with torch.no_grad():
            outputs = model(pixel_values)

        self.parent.assertEqual(len(outputs.feature_maps), 2)
        for fm in outputs.feature_maps:
            b, c, h, w = fm.shape
            self.parent.assertEqual(b, self.batch_size)
            self.parent.assertEqual(c, self.hidden_size)
            self.parent.assertEqual(h, self.image_size // self.patch_size)
            self.parent.assertEqual(w, self.image_size // self.patch_size)

    def create_and_check_model(self, config, pixel_values, labels):
        """Check the shape of the base model's `last_hidden_state`."""
        model = Sapiens2Model(config=config)
        model.to(torch_device)
        model.eval()
        # Inference-only shape check: skip autograd bookkeeping like the other checks do.
        with torch.no_grad():
            result = model(pixel_values)
        self.parent.assertEqual(
            result.last_hidden_state.shape,
            (self.batch_size, self.seq_length, self.hidden_size),
        )

    def create_and_check_for_semantic_segmentation(self, config, pixel_values, labels):
        """Check the logits shape of the semantic segmentation model."""
        model = Sapiens2ForSemanticSegmentation(config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            result = model(pixel_values)
        # patch_height = image_size // patch_size = 30 // 2 = 15
        # 4 deconv layers with stride=2: 15 * 2^4 = 240
        patch_height = self.image_size // self.patch_size
        expected_h = patch_height * (2 ** len(config.head_config.upsample_out_channels))
        self.parent.assertEqual(
            result.logits.shape,
            (self.batch_size, config.num_labels, expected_h, expected_h),
        )

    def create_and_check_for_pose_estimation(self, config, pixel_values, labels):
        """Check the heatmaps shape of the pose estimation model."""
        model = Sapiens2ForPoseEstimation(config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            result = model(pixel_values)
        # Same expectation as for segmentation: the size doubles once per upsample stage.
        patch_height = self.image_size // self.patch_size
        expected_h = patch_height * (2 ** len(config.head_config.upsample_out_channels))
        self.parent.assertEqual(
            result.heatmaps.shape,
            (self.batch_size, config.num_labels, expected_h, expected_h),
        )

    def create_and_check_for_normal_estimation(self, config, pixel_values, labels):
        """Check the normals shape, that no loss is returned, and that passing labels raises."""
        model = Sapiens2ForNormalEstimation(config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            result = model(pixel_values)
        expected_h = self._pixel_shuffle_output_size(
            config.image_size // self.patch_size, config.head_config.upsample_kernel_sizes
        )
        self.parent.assertEqual(
            result.normals.shape,
            (self.batch_size, config.num_labels, expected_h, expected_h),
        )
        self.parent.assertIsNone(result.loss)
        with self.parent.assertRaises(NotImplementedError):
            model(pixel_values, labels=torch.randn_like(result.normals))

    def create_and_check_for_matting(self, config, pixel_values, labels):
        """Check foreground/alpha shapes and value range, and that passing labels raises."""
        model = Sapiens2ForImageMatting(config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            result = model(pixel_values)
        expected_h = self._pixel_shuffle_output_size(
            config.image_size // self.patch_size, config.head_config.upsample_kernel_sizes
        )
        self.parent.assertEqual(result.foregrounds.shape, (self.batch_size, 3, expected_h, expected_h))
        self.parent.assertEqual(result.alphas.shape, (self.batch_size, 1, expected_h, expected_h))
        # outputs are sigmoid-activated
        self.parent.assertGreaterEqual(result.foregrounds.min().item(), 0.0)
        self.parent.assertLessEqual(result.foregrounds.max().item(), 1.0)
        self.parent.assertGreaterEqual(result.alphas.min().item(), 0.0)
        self.parent.assertLessEqual(result.alphas.max().item(), 1.0)
        self.parent.assertIsNone(result.loss)
        with self.parent.assertRaises(NotImplementedError):
            # Create the labels on the model's device so the only possible failure is the expected one.
            matting_labels = torch.randn(self.batch_size, 4, expected_h, expected_h, device=torch_device)
            model(pixel_values, labels=matting_labels)

    def create_and_check_for_pointmap_estimation(self, config, pixel_values, labels):
        """Check pointmap/scale shapes, that no loss is returned, and that passing labels raises."""
        model = Sapiens2ForPointmapEstimation(config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            result = model(pixel_values)
        expected_h = self._pixel_shuffle_output_size(
            config.image_size // self.patch_size, config.head_config.upsample_kernel_sizes
        )
        self.parent.assertEqual(
            result.pointmaps.shape,
            (self.batch_size, config.num_labels, expected_h, expected_h),
        )
        self.parent.assertEqual(result.scales.shape, (self.batch_size, 1))
        self.parent.assertIsNone(result.loss)
        with self.parent.assertRaises(NotImplementedError):
            model(pixel_values, labels=torch.randn_like(result.pointmaps))

    def prepare_config_and_inputs_for_semantic_segmentation(self):
        """Return `(config, pixel_values, labels)` with per-pixel integer labels in `[0, num_labels)`."""
        config = self.get_config()
        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
        labels = ids_tensor([self.batch_size, self.image_size, self.image_size], config.num_labels)
        return config, pixel_values, labels

    def prepare_config_and_inputs_for_pointmap_estimation(self):
        """Return `(config, pixel_values, None)` with the head switched to pixel-shuffle upsampling."""
        config = self.get_config()
        config.head_config.use_pixel_shuffle = True
        pixel_values = floats_tensor([self.batch_size, self.num_channels, config.image_size, config.image_size])
        labels = None
        return config, pixel_values, labels

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` as consumed by the common model tests."""
        config = self.get_config()
        # Use pixel-shuffle so all model classes (including Normal/Pointmap/Matting) instantiate
        # decode_head.input_conv, satisfying the conversion patterns checked by test_reverse_loading_mapping.
        config.head_config.use_pixel_shuffle = True
        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
        inputs_dict = {"pixel_values": pixel_values}
        return config, inputs_dict


@require_torch
class Sapiens2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
    """
    Common model tests for every Sapiens2 model class.

    Several tests from test_modeling_common.py are overridden or skipped here because Sapiens2 does not use
    input_ids, inputs_embeds, attention_mask and seq_length.
    """

    all_model_classes = (
        (
            Sapiens2Model,
            Sapiens2Backbone,
            Sapiens2ForImageMatting,
            Sapiens2ForNormalEstimation,
            Sapiens2ForPointmapEstimation,
            Sapiens2ForPoseEstimation,
            Sapiens2ForSemanticSegmentation,
        )
        if is_torch_available()
        else ()
    )
    pipeline_model_mapping = {"image-feature-extraction": Sapiens2Model} if is_torch_available() else {}

    test_resize_embeddings = False

    def setUp(self):
        """Create the model/config testers and force deterministic cuDNN for the duration of the test."""
        self.model_tester = Sapiens2ModelTester(self)
        self.config_tester = ConfigTester(self, config_class=Sapiens2Config, has_text_modality=False, hidden_size=32)
        # ConvTranspose2d layers in the decoder heads are non-deterministic on CUDA, and the
        # InstanceNorm2d layers amplify that into output differences of up to 6e-3 for identical
        # head inputs. Remember the current flag, then enable cudnn.deterministic for stability.
        self._original_cudnn_deterministic = torch.backends.cudnn.deterministic
        torch.backends.cudnn.deterministic = True

    def tearDown(self):
        """Restore the cuDNN determinism flag saved in `setUp`."""
        torch.backends.cudnn.deterministic = self._original_cudnn_deterministic

    def test_backbone(self):
        tester = self.model_tester
        config, pixel_values, labels = tester.prepare_config_and_inputs()
        tester.create_and_check_backbone(config, pixel_values, labels)

    def test_config(self):
        self.config_tester.run_common_tests()

    @unittest.skip(reason="Sapiens2 does not use inputs_embeds")
    def test_inputs_embeds(self):
        pass

    def test_model_get_set_embeddings(self):
        """Input embeddings must be a module; output embeddings must be absent or an `nn.Linear`."""
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            self.assertIsInstance(model.get_input_embeddings(), nn.Module)
            output_embeddings = model.get_output_embeddings()
            if output_embeddings is not None:
                self.assertIsInstance(output_embeddings, nn.Linear)

    def test_model(self):
        tester = self.model_tester
        tester.create_and_check_model(*tester.prepare_config_and_inputs())

    def test_for_semantic_segmentation(self):
        tester = self.model_tester
        prepared = tester.prepare_config_and_inputs_for_semantic_segmentation()
        tester.create_and_check_for_semantic_segmentation(*prepared)

    def test_for_pose_estimation(self):
        tester = self.model_tester
        prepared = tester.prepare_config_and_inputs_for_semantic_segmentation()
        tester.create_and_check_for_pose_estimation(*prepared)

    def test_for_pointmap_estimation(self):
        tester = self.model_tester
        prepared = tester.prepare_config_and_inputs_for_pointmap_estimation()
        tester.create_and_check_for_pointmap_estimation(*prepared)

    def test_for_normal_estimation(self):
        tester = self.model_tester
        prepared = tester.prepare_config_and_inputs_for_pointmap_estimation()
        tester.create_and_check_for_normal_estimation(*prepared)

    def test_for_matting(self):
        tester = self.model_tester
        prepared = tester.prepare_config_and_inputs_for_pointmap_estimation()
        tester.create_and_check_for_matting(*prepared)

    def test_batching_equivalence(self, atol=1e-4, rtol=1e-4):
        # The decoder heads use InstanceNorm2d, which computes per-instance statistics; on CPU,
        # different batch sizes can take different parallelisation paths and yield O(1e-5)
        # floating-point differences, hence the explicit tolerances passed to the common test.
        super().test_batching_equivalence(atol=atol, rtol=rtol)

    @unittest.skip(reason="Sapiens2 does not support feedforward chunking")
    def test_feed_forward_chunking(self):
        pass


def prepare_img():
    """Load the COCO fixture image used by the integration tests and return it as a tensor."""
    fixture_url = (
        "https://huggingface.co/datasets/hf-internal-testing/fixtures-coco/resolve/main/val2017/000000004016.png"
    )
    local_path = url_to_local_path(fixture_url)
    return load_image_as_tensor(local_path)


@require_torch
@require_vision
class Sapiens2ModelIntegrationTest(unittest.TestCase):
    """Slow integration tests comparing pretrained Sapiens2 checkpoints against recorded CUDA expectations."""

    def setUp(self):
        # The decoder heads contain ConvTranspose2d layers which are non-deterministic on CUDA.
        # This non-deterministic behavior is amplified by the InstanceNorm2d layers and results in up
        # to 6e-3 output differences with identical head inputs. We set cudnn.deterministic = True
        # for test stability.
        self._original_cudnn_deterministic = torch.backends.cudnn.deterministic
        torch.backends.cudnn.deterministic = True

    def tearDown(self):
        # Restore the flag saved in setUp so other tests are unaffected.
        torch.backends.cudnn.deterministic = self._original_cudnn_deterministic

    @cached_property
    def default_image_processor(self):
        """Image processor loaded from the pretraining checkpoint, shared by several tests."""
        return Sapiens2ImageProcessor.from_pretrained("facebook/sapiens2-pretrain-0.4b")

    @slow
    def test_inference_no_head(self):
        """Check the hidden-state shape and CLS/register/patch token slices of the base model."""
        model = Sapiens2Model.from_pretrained("facebook/sapiens2-pretrain-0.4b").eval().to(torch_device)

        image_processor = self.default_image_processor
        image = prepare_img()
        inputs = image_processor(image, return_tensors="pt").to(torch_device)

        # forward pass
        with torch.no_grad():
            outputs = model(**inputs)

        # verify the last hidden states
        # seq length = num_patches + 1 (CLS token) + num_register_tokens
        _, _, height, width = inputs["pixel_values"].shape
        num_patches = (height // model.config.patch_size) * (width // model.config.patch_size)
        expected_seq_length = num_patches + 1 + model.config.num_register_tokens
        expected_shape = torch.Size((1, expected_seq_length, model.config.hidden_size))
        self.assertEqual(outputs.last_hidden_state.shape, expected_shape)

        last_layer_cls_token = outputs.pooler_output
        EXPECTED_CLS_SLICE = Expectations({("cuda", None): [-0.09233, -0.00107, -0.12215, 0.07374, -0.03773]})
        expected_cls_slice = torch.tensor(EXPECTED_CLS_SLICE.get_expectation(), device=torch_device)
        torch.testing.assert_close(last_layer_cls_token[0, :5], expected_cls_slice, rtol=1e-3, atol=1e-3)

        last_layer_register_tokens = outputs.last_hidden_state[:, 1 : model.config.num_register_tokens + 1]
        EXPECTED_REGISTER_SLICE = Expectations({("cuda", None): [0.08412, 0.04387, 0.05709, -0.04962, 0.03715]})
        expected_register_slice = torch.tensor(EXPECTED_REGISTER_SLICE.get_expectation(), device=torch_device)
        torch.testing.assert_close(last_layer_register_tokens[0, 0, :5], expected_register_slice, rtol=1e-3, atol=1e-3)

        last_layer_patch_tokens = outputs.last_hidden_state[:, model.config.num_register_tokens + 1 :]
        EXPECTED_PATCH_SLICE = Expectations({("cuda", None): [0.14232, -0.11947, -0.05910, -0.09457, -0.11410]})
        expected_patch_slice = torch.tensor(EXPECTED_PATCH_SLICE.get_expectation(), device=torch_device)
        torch.testing.assert_close(last_layer_patch_tokens[0, 0, :5], expected_patch_slice, rtol=1e-3, atol=1e-3)

    @slow
    def test_inference_semantic_segmentation(self):
        """Check segmentation logits and post-processing with and without `target_sizes`."""
        model = Sapiens2ForSemanticSegmentation.from_pretrained("facebook/sapiens2-seg-0.4b").eval().to(torch_device)

        image_processor = self.default_image_processor
        image = prepare_img()
        inputs = image_processor(image, return_tensors="pt").to(torch_device)

        # forward pass
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits

        # verify the logits shape: segmentation head upsamples back to the original image resolution
        _, _, height, width = inputs["pixel_values"].shape
        expected_shape = torch.Size((1, model.config.num_labels, height, width))
        self.assertEqual(logits.shape, expected_shape)

        EXPECTED_LOGITS_SLICE = Expectations(
            {("cuda", None): [[3.45260, 5.55483, 6.57901], [5.71913, 7.21420, 8.11209], [6.82645, 7.98208, 8.31385]]}
        )
        expected_logits_slice = torch.tensor(EXPECTED_LOGITS_SLICE.get_expectation(), device=torch_device)
        torch.testing.assert_close(logits[0, 0, :3, :3], expected_logits_slice, rtol=1e-3, atol=1e-3)

        # verify post-processing without resizing: output shape matches model input resolution
        segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs)
        self.assertEqual(len(segmentation), 1)
        self.assertEqual(segmentation[0].shape, torch.Size([height, width]))

        # verify post-processing with target_sizes
        target_size = (height // 2, width // 2)
        segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs, target_sizes=[target_size])
        self.assertEqual(len(segmentation), 1)
        self.assertEqual(segmentation[0].shape, torch.Size(target_size))

        EXPECTED_CLASS_IDS = Expectations({("cuda", None): [[4, 3, 3], [3, 3, 3], [3, 3, 3]]})
        expected_class_ids = torch.tensor(EXPECTED_CLASS_IDS.get_expectation(), device=torch_device)
        torch.testing.assert_close(segmentation[0][50:53, 50:53], expected_class_ids)

    @require_cv2
    @slow
    def test_inference_pose_estimation(self):
        """Check heatmaps, post-processed keypoints/scores/bbox, rescaling and flip-augmented inference."""
        model = Sapiens2ForPoseEstimation.from_pretrained("facebook/sapiens2-pose-0.4b").eval().to(torch_device)

        image_processor = self.default_image_processor
        image = prepare_img()

        image_height, image_width = image.shape[-2:]

        # person bbox in COCO format (x, y, w, h)
        boxes = [[[2.7080630e02, 5.7221174e-01, 2.9409006e02, 3.7946970e02]]]
        inputs = image_processor(image, boxes=boxes, return_tensors="pt").to(torch_device)

        with torch.no_grad():
            outputs = model(**inputs)

        heatmaps = outputs.heatmaps
        self.assertEqual(heatmaps.shape, torch.Size([1, model.config.num_labels, 256, 192]))
        EXPECTED_HEATMAPS = Expectations(
            {("cuda", None): [[0.26140, 0.24656, 0.21673], [0.33708, 0.31597, 0.28028], [0.41624, 0.39270, 0.35014]]}
        )
        expected_heatmaps = torch.tensor(EXPECTED_HEATMAPS.get_expectation(), device=torch_device)
        torch.testing.assert_close(heatmaps[0, 0, 70:73, 70:73], expected_heatmaps, rtol=1e-2, atol=1e-2)

        results = image_processor.post_process_pose_estimation(outputs, boxes=boxes)
        self.assertEqual(len(results), 1)
        self.assertEqual(len(results[0]), 1)
        person = results[0][0]

        keypoints = person["keypoints"]
        EXPECTED_KEYPOINTS = Expectations(
            {("cuda", None): [[364.33920111, 97.92528764], [373.25104943, 80.97749201], [353.21072316, 83.38954486]]}
        )
        expected_keypoints = torch.tensor(EXPECTED_KEYPOINTS.get_expectation(), device=torch_device)
        torch.testing.assert_close(keypoints[:3], expected_keypoints, rtol=1e-2, atol=1e-2)

        scores = person["scores"]
        EXPECTED_SCORES = Expectations({("cuda", None): [1.0007433, 0.9987416, 1.0015154]})
        expected_scores = torch.tensor(EXPECTED_SCORES.get_expectation(), device=torch_device)
        torch.testing.assert_close(scores[:3], expected_scores, rtol=1e-2, atol=1e-2)

        # The returned bbox is compared in (x1, y1, x2, y2) form, derived from the (x, y, w, h) input box.
        bbox = person["bbox"]
        expected_bbox_xywh = torch.tensor(boxes[0][0], device=torch_device)
        expected_bbox_xyxy = torch.tensor(
            [
                expected_bbox_xywh[0],
                expected_bbox_xywh[1],
                expected_bbox_xywh[0] + expected_bbox_xywh[2],
                expected_bbox_xywh[1] + expected_bbox_xywh[3],
            ],
            device=torch_device,
        )
        torch.testing.assert_close(bbox, expected_bbox_xyxy, rtol=1e-3, atol=1e-3)

        # target_sizes without source_sizes must raise
        with self.assertRaises(ValueError):
            image_processor.post_process_pose_estimation(outputs, boxes=boxes, target_sizes=[(432, 640)])

        # source_sizes + target_sizes: keypoints and bbox scaled by target/source
        target_height, target_width = image_height * 2, image_width * 2
        results_scaled = image_processor.post_process_pose_estimation(
            outputs,
            boxes=boxes,
            source_sizes=[(image_height, image_width)],
            target_sizes=[(target_height, target_width)],
        )
        torch.testing.assert_close(results_scaled[0][0]["keypoints"], keypoints * 2.0)
        torch.testing.assert_close(results_scaled[0][0]["bbox"], expected_bbox_xyxy * 2.0)

        # Test flipping
        flipped_inputs = {"pixel_values": inputs["pixel_values"].flip(-1)}
        flip_pairs = torch.tensor(model.config.flip_pairs)

        with torch.no_grad():
            flipped_outputs = model(**flipped_inputs, flip_pairs=flip_pairs)

        flipped_heatmaps = flipped_outputs.heatmaps
        EXPECTED_FLIPPED_HEATMAPS = Expectations(
            {("cuda", None): [[0.27348, 0.25426, 0.22496], [0.34877, 0.32563, 0.28418], [0.43967, 0.40607, 0.35721]]}
        )
        expected_flipped_heatmaps = torch.tensor(EXPECTED_FLIPPED_HEATMAPS.get_expectation(), device=torch_device)
        torch.testing.assert_close(
            flipped_heatmaps[0, 0, 70:73, 70:73], expected_flipped_heatmaps, rtol=1e-2, atol=1e-2
        )

        final_results = image_processor.post_process_pose_estimation(
            outputs, outputs_flipped=flipped_outputs, boxes=boxes
        )
        self.assertEqual(len(final_results), 1)
        self.assertEqual(len(final_results[0]), 1)

        final_person = final_results[0][0]
        final_keypoints = final_person["keypoints"]
        EXPECTED_FINAL_KEYPOINTS = Expectations(
            {("cuda", None): [[364.14644305, 97.99268751], [373.66756367, 81.19966519], [353.4574526, 83.647911]]}
        )
        expected_final_keypoints = torch.tensor(EXPECTED_FINAL_KEYPOINTS.get_expectation(), device=torch_device)
        torch.testing.assert_close(final_keypoints[:3], expected_final_keypoints, rtol=1e-2, atol=1e-2)

        final_scores = final_person["scores"]
        EXPECTED_FINAL_SCORES = Expectations({("cuda", None): [1.0064079, 0.98746514, 0.99821794]})
        expected_final_scores = torch.tensor(EXPECTED_FINAL_SCORES.get_expectation(), device=torch_device)
        torch.testing.assert_close(final_scores[:3], expected_final_scores, rtol=1e-2, atol=1e-2)

        final_bbox = final_person["bbox"]
        torch.testing.assert_close(final_bbox, expected_bbox_xyxy, rtol=1e-3, atol=1e-3)

    @slow
    def test_inference_normal_estimation(self):
        """Check raw normals and the post-processed normals resized to the source image size."""
        model = Sapiens2ForNormalEstimation.from_pretrained("facebook/sapiens2-normal-0.4b").eval().to(torch_device)

        image_processor = Sapiens2ImageProcessor.from_pretrained("facebook/sapiens2-normal-0.4b")
        image = prepare_img()
        image_height, image_width = image.shape[-2:]
        inputs = image_processor(image, return_tensors="pt").to(torch_device)

        with torch.no_grad():
            outputs = model(**inputs)

        _, _, height, width = inputs["pixel_values"].shape
        self.assertEqual(outputs.normals.shape, torch.Size([1, 3, height, width]))

        # We can get closer to expected values by using cv2 resize instead of torchvision.
        EXPECTED_NORMALS = Expectations(
            {("cuda", None): [[0.9577, 1.8808, 0.9826], [1.6904, 1.7351, 1.9120], [2.4828, 1.9887, 2.5168]]}
        )
        expected_normals = torch.tensor(EXPECTED_NORMALS.get_expectation(), device=torch_device)
        torch.testing.assert_close(outputs.normals[0, 0, :3, :3], expected_normals, rtol=1e-2, atol=1e-2)

        result = image_processor.post_process_normal_estimation(outputs, source_sizes=[(image_height, image_width)])
        self.assertEqual(len(result), 1)
        # Derive the expected size from the image passed as `source_sizes` rather than hard-coding
        # it, so the assertion stays tied to the fixture actually loaded.
        self.assertEqual(result[0]["normals"].shape, torch.Size([3, image_height, image_width]))

        EXPECTED_POSTPROCESSED_NORMALS = Expectations(
            {("cuda", None): [[-0.8266, -0.7899, -0.7512], [-0.8227, -0.7843, -0.7440], [-0.8098, -0.7721, -0.7318]]}
        )
        expected_postprocessed_normals = torch.tensor(
            EXPECTED_POSTPROCESSED_NORMALS.get_expectation(), device=torch_device
        )
        torch.testing.assert_close(
            result[0]["normals"][0, :3, :3], expected_postprocessed_normals, rtol=1e-2, atol=1e-2
        )

    @slow
    def test_inference_pointmap_estimation(self):
        """Check raw pointmaps/scales and the post-processed pointmap at the source image size."""
        model = (
            Sapiens2ForPointmapEstimation.from_pretrained("facebook/sapiens2-pointmap-0.4b").eval().to(torch_device)
        )

        image_processor = Sapiens2ImageProcessor.from_pretrained("facebook/sapiens2-pointmap-0.4b")
        image = prepare_img()
        image_height, image_width = image.shape[-2:]
        inputs = image_processor(image, return_tensors="pt").to(torch_device)

        with torch.no_grad():
            outputs = model(**inputs)

        self.assertIsInstance(outputs, Sapiens2PointmapEstimatorOutput)
        _, _, height, width = inputs["pixel_values"].shape
        self.assertEqual(outputs.pointmaps.shape, torch.Size([1, 3, height, width]))
        self.assertEqual(outputs.scales.shape, torch.Size([1, 1]))

        EXPECTED_SCALE = Expectations({("cuda", None): [[0.9931]]})
        expected_scale = torch.tensor(EXPECTED_SCALE.get_expectation(), device=torch_device)
        torch.testing.assert_close(outputs.scales, expected_scale, rtol=1e-3, atol=1e-3)

        EXPECTED_POINTMAP = Expectations(
            {("cuda", None): [[-0.0096, -0.0567, -0.0460], [-0.0657, -0.0583, -0.0688], [-0.1035, -0.0363, -0.0659]]}
        )
        expected_pointmap = torch.tensor(EXPECTED_POINTMAP.get_expectation(), device=torch_device)
        torch.testing.assert_close(outputs.pointmaps[0, 0, :3, :3], expected_pointmap, rtol=1e-2, atol=1e-2)

        result = image_processor.post_process_pointmap_estimation(outputs, source_sizes=[(image_height, image_width)])
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0]["pointmap"].shape, torch.Size([3, image_height, image_width]))

        # Head and post-processing are exactly identical to original code but differences from backbone
        # get amplified after scaling and resizing so we need to relax the tolerance here.
        EXPECTED_POSTPROCESSED_POINTMAP = Expectations(
            {("cuda", None): [[0.0771, 0.1335, 0.3025], [-0.1179, 0.2904, 0.7140], [0.0337, 0.3037, 0.4390]]}
        )
        expected_postprocessed_pointmap = torch.tensor(
            EXPECTED_POSTPROCESSED_POINTMAP.get_expectation(), device=torch_device
        )
        torch.testing.assert_close(
            result[0]["pointmap"][0, :3, :3], expected_postprocessed_pointmap, rtol=1e-2, atol=1e-2
        )

    @slow
    def test_inference_matting(self):
        """Check raw foregrounds/alphas and the post-processed alpha, foreground and composite."""
        model = Sapiens2ForImageMatting.from_pretrained("facebook/sapiens2-matting-1b").eval().to(torch_device)

        image_processor = self.default_image_processor
        image = prepare_img()
        image_height, image_width = image.shape[-2:]
        inputs = image_processor(image, return_tensors="pt").to(torch_device)

        with torch.no_grad():
            outputs = model(**inputs)

        self.assertIsInstance(outputs, Sapiens2ImageMattingOutput)
        _, _, height, width = inputs["pixel_values"].shape
        self.assertEqual(outputs.foregrounds.shape, torch.Size([1, 3, height, width]))
        self.assertEqual(outputs.alphas.shape, torch.Size([1, 1, height, width]))

        # Difference due to cv2 vs torchvision pre-processing. Model outputs are equal on same tensor input.
        EXPECTED_FOREGROUNDS = Expectations(
            {("cuda", None): [[0.1432, 0.2051, 0.3043], [0.1889, 0.2681, 0.3509], [0.2511, 0.3076, 0.4047]]}
        )
        expected_foregrounds = torch.tensor(EXPECTED_FOREGROUNDS.get_expectation(), device=torch_device)
        torch.testing.assert_close(
            outputs.foregrounds[0, 0, 100:103, 100:103], expected_foregrounds, rtol=1e-2, atol=1e-2
        )

        # One background value per channel, shaped (3, 1, 1).
        background = torch.tensor([177, 64, 0], device=torch_device).view(3, 1, 1)
        result = image_processor.post_process_image_matting(
            outputs, target_sizes=[(image_height, image_width)], backgrounds=background
        )
        self.assertEqual(len(result), 1)

        alpha = result[0]["alpha"]
        foreground = result[0]["foreground"]
        composite = result[0]["composite"]
        self.assertEqual(alpha.shape, (1, image_height, image_width))
        self.assertEqual(foreground.shape, (3, image_height, image_width))
        self.assertEqual(composite.shape, (3, image_height, image_width))

        EXPECTED_ALPHA = Expectations(
            {
                ("cuda", None): [
                    [0.99995, 0.9999123, 0.9997628],
                    [0.99991906, 0.9997431, 0.99754137],
                    [0.9997362, 0.99711365, 0.9444071],
                ]
            }
        )
        expected_alpha = torch.tensor(EXPECTED_ALPHA.get_expectation(), device=torch_device)
        torch.testing.assert_close(alpha[0, 300:303, 300:303], expected_alpha, rtol=1e-3, atol=1e-3)

        EXPECTED_FOREGROUND = Expectations(
            {
                ("cuda", None): [
                    [0.7175647, 0.6906685, 0.65860075],
                    [0.7162684, 0.6867891, 0.64463294],
                    [0.6924842, 0.67141336, 0.5356377],
                ]
            }
        )
        expected_foreground = torch.tensor(EXPECTED_FOREGROUND.get_expectation(), device=torch_device)
        torch.testing.assert_close(foreground[0, 300:303, 300:303], expected_foreground, rtol=1e-2, atol=1e-2)

        # The composite is compared as uint8 with an absolute tolerance of one intensity level.
        EXPECTED_COMPOSITE = Expectations({("cuda", None): [[182, 176, 167], [182, 175, 164], [176, 171, 136]]})
        expected_composite = torch.tensor(EXPECTED_COMPOSITE.get_expectation(), dtype=torch.uint8, device=torch_device)
        torch.testing.assert_close(composite[0, 300:303, 300:303], expected_composite, rtol=0, atol=1)


@require_torch
class Sapiens2BackboneTest(unittest.TestCase, BackboneTesterMixin):
    """Runs the shared backbone test-suite from `BackboneTesterMixin` against `Sapiens2Backbone`."""

    all_model_classes = (Sapiens2Backbone,) if is_torch_available() else ()
    config_class = Sapiens2Config

    def setUp(self):
        """Attach a fresh model tester bound to this test case."""
        tester = Sapiens2ModelTester(self)
        self.model_tester = tester