transformers/tests/models/sapiens2/test_modeling_sapiens2.py

# Copyright 2026 the HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch Sapiens2 model."""

import unittest
from functools import cached_property

from transformers import Sapiens2Config, Sapiens2HeadConfig
from transformers.testing_utils import Expectations, require_cv2, require_torch, require_vision, slow, torch_device
from transformers.utils import is_torch_available

from ...test_backbone_common import BackboneTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin
from ...test_processing_common import url_to_local_path


if is_torch_available():
    import torch
    from torch import nn

    from transformers import (
        Sapiens2Backbone,
        Sapiens2ForImageMatting,
        Sapiens2ForNormalEstimation,
        Sapiens2ForPointmapEstimation,
        Sapiens2ForPoseEstimation,
        Sapiens2ForSemanticSegmentation,
        Sapiens2ImageProcessor,
        Sapiens2Model,
    )
    from transformers.image_utils import load_image_as_tensor
    from transformers.models.sapiens2.modeling_sapiens2 import (
        Sapiens2ImageMattingOutput,
        Sapiens2PointmapEstimatorOutput,
    )


class Sapiens2ModelTester:
    """
    Builds a tiny `Sapiens2Config` plus random inputs and checks the output shapes of the Sapiens2 models.

    `parent` is the running `unittest.TestCase`; its assertion methods are used to report failures. The
    remaining constructor arguments are stored as attributes and forwarded to `Sapiens2Config` by `get_config`.
    """

    def __init__(
        self,
        parent,
        batch_size=13,
        image_size=30,
        patch_size=2,
        num_channels=3,
        is_training=False,
        use_labels=True,
        hidden_size=32,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=37,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        type_sequence_label_size=10,
        initializer_range=0.02,
        num_register_tokens=2,
        mask_ratio=0.5,
        scope=None,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.is_training = is_training
        self.use_labels = use_labels
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.type_sequence_label_size = type_sequence_label_size
        self.initializer_range = initializer_range
        self.num_register_tokens = num_register_tokens
        self.scope = scope

        # Sequence length = patch tokens + 1 (CLS token) + register tokens.
        num_patches = (image_size // patch_size) ** 2
        self.seq_length = num_patches + 1 + self.num_register_tokens
        self.mask_ratio = mask_ratio
        self.num_masks = int(mask_ratio * self.seq_length)
        self.mask_length = num_patches

    def prepare_config_and_inputs(self):
        """Return `(config, pixel_values, labels)`; `labels` is None when `use_labels` is False."""
        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])

        labels = None
        if self.use_labels:
            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)

        config = self.get_config()

        return config, pixel_values, labels

    def get_config(self):
        """Return a small `Sapiens2Config` built from the attributes stored on this tester."""
        return Sapiens2Config(
            image_size=self.image_size,
            patch_size=self.patch_size,
            num_channels=self.num_channels,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            is_decoder=False,
            initializer_range=self.initializer_range,
            num_register_tokens=self.num_register_tokens,
            stage_names=["stem"] + [f"stage{i}" for i in range(1, self.num_hidden_layers + 1)],
            out_indices=[0, 1],
            reshape_hidden_states=True,
            num_labels=4,
            flip_pairs=[[1, 2], [3, 4]],
            # Head config sized to satisfy all model conversion patterns in test_reverse_loading_mapping
            head_config=Sapiens2HeadConfig(
                upsample_out_channels=[8, 4, 4, 4],
                upsample_kernel_sizes=[4, 4, 4, 4],
                conv_out_channels=[4, 4, 4],
                conv_kernel_sizes=[1, 1, 1],
                scale_conv_out_channels=[8, 4, 4],
                scale_conv_kernel_sizes=[1, 1, 1],
                scale_final_hidden_sizes=[8, 4],
            ),
        )

    def _run_in_eval_mode(self, model_class, config, pixel_values):
        """Instantiate `model_class`, move it to `torch_device`, and run one gradient-free forward pass."""
        model = model_class(config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            outputs = model(pixel_values)
        return model, outputs

    def _expected_deconv_size(self, config):
        """Expected output side length of the deconvolution heads: each stride-2 layer doubles the patch grid."""
        # e.g. patch grid = 30 // 2 = 15 and 4 deconv layers with stride=2: 15 * 2^4 = 240
        patch_grid = self.image_size // self.patch_size
        return patch_grid * (2 ** len(config.head_config.upsample_out_channels))

    def _expected_pixel_shuffle_size(self, config):
        """Expected output side length of the pixel-shuffle heads."""
        # PixelShuffle: Conv2d(padding=(ks-1)//2) then shuffle(2) — size per layer: (h + 2p - ks + 1) * 2
        side = config.image_size // self.patch_size
        for ks in config.head_config.upsample_kernel_sizes:
            padding = (ks - 1) // 2
            side = (side + 2 * padding - ks + 1) * 2
        return side

    def create_and_check_backbone(self, config, pixel_values, labels):
        """Check that `Sapiens2Backbone` returns two (batch, hidden, grid, grid) feature maps."""
        config.out_features = ["stage1", "stage2"]
        config.reshape_hidden_states = True

        _, outputs = self._run_in_eval_mode(Sapiens2Backbone, config, pixel_values)

        grid_size = self.image_size // self.patch_size
        expected_shape = (self.batch_size, self.hidden_size, grid_size, grid_size)
        self.parent.assertEqual(len(outputs.feature_maps), 2)
        for feature_map in outputs.feature_maps:
            self.parent.assertEqual(tuple(feature_map.shape), expected_shape)

    def create_and_check_model(self, config, pixel_values, labels):
        """Check the shape of `last_hidden_state` produced by `Sapiens2Model`."""
        # Only the output shape is inspected, so run without autograd like the other checks.
        _, result = self._run_in_eval_mode(Sapiens2Model, config, pixel_values)
        self.parent.assertEqual(
            result.last_hidden_state.shape,
            (self.batch_size, self.seq_length, self.hidden_size),
        )

    def create_and_check_for_semantic_segmentation(self, config, pixel_values, labels):
        """Check the logits shape of `Sapiens2ForSemanticSegmentation`."""
        _, result = self._run_in_eval_mode(Sapiens2ForSemanticSegmentation, config, pixel_values)
        expected_h = self._expected_deconv_size(config)
        self.parent.assertEqual(
            result.logits.shape,
            (self.batch_size, config.num_labels, expected_h, expected_h),
        )

    def create_and_check_for_pose_estimation(self, config, pixel_values, labels):
        """Check the heatmaps shape of `Sapiens2ForPoseEstimation`."""
        _, result = self._run_in_eval_mode(Sapiens2ForPoseEstimation, config, pixel_values)
        expected_h = self._expected_deconv_size(config)
        self.parent.assertEqual(
            result.heatmaps.shape,
            (self.batch_size, config.num_labels, expected_h, expected_h),
        )

    def create_and_check_for_normal_estimation(self, config, pixel_values, labels):
        """Check the normals shape of `Sapiens2ForNormalEstimation` and that passing labels is rejected."""
        model, result = self._run_in_eval_mode(Sapiens2ForNormalEstimation, config, pixel_values)
        expected_h = self._expected_pixel_shuffle_size(config)
        self.parent.assertEqual(
            result.normals.shape,
            (self.batch_size, config.num_labels, expected_h, expected_h),
        )
        self.parent.assertIsNone(result.loss)
        with self.parent.assertRaises(NotImplementedError):
            model(pixel_values, labels=torch.randn_like(result.normals))

    def create_and_check_for_matting(self, config, pixel_values, labels):
        """Check foreground/alpha shapes and value range of `Sapiens2ForImageMatting`; labels must be rejected."""
        model, result = self._run_in_eval_mode(Sapiens2ForImageMatting, config, pixel_values)
        expected_h = self._expected_pixel_shuffle_size(config)
        self.parent.assertEqual(result.foregrounds.shape, (self.batch_size, 3, expected_h, expected_h))
        self.parent.assertEqual(result.alphas.shape, (self.batch_size, 1, expected_h, expected_h))
        # outputs are sigmoid-activated
        self.parent.assertGreaterEqual(result.foregrounds.min().item(), 0.0)
        self.parent.assertLessEqual(result.foregrounds.max().item(), 1.0)
        self.parent.assertGreaterEqual(result.alphas.min().item(), 0.0)
        self.parent.assertLessEqual(result.alphas.max().item(), 1.0)
        self.parent.assertIsNone(result.loss)
        with self.parent.assertRaises(NotImplementedError):
            # Dummy labels live on the same device as the inputs, like the normal/pointmap checks.
            dummy_labels = torch.randn(self.batch_size, 4, expected_h, expected_h, device=torch_device)
            model(pixel_values, labels=dummy_labels)

    def create_and_check_for_pointmap_estimation(self, config, pixel_values, labels):
        """Check pointmap/scale shapes of `Sapiens2ForPointmapEstimation` and that passing labels is rejected."""
        model, result = self._run_in_eval_mode(Sapiens2ForPointmapEstimation, config, pixel_values)
        expected_h = self._expected_pixel_shuffle_size(config)
        self.parent.assertEqual(
            result.pointmaps.shape,
            (self.batch_size, config.num_labels, expected_h, expected_h),
        )
        self.parent.assertEqual(result.scales.shape, (self.batch_size, 1))
        self.parent.assertIsNone(result.loss)
        with self.parent.assertRaises(NotImplementedError):
            model(pixel_values, labels=torch.randn_like(result.pointmaps))

    def prepare_config_and_inputs_for_semantic_segmentation(self):
        """Return `(config, pixel_values, labels)` where `labels` holds one class id per input pixel."""
        config = self.get_config()
        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
        labels = ids_tensor([self.batch_size, self.image_size, self.image_size], config.num_labels)
        return config, pixel_values, labels

    def prepare_config_and_inputs_for_pointmap_estimation(self):
        """Return `(config, pixel_values, None)` with the pixel-shuffle head enabled in the config."""
        config = self.get_config()
        config.head_config.use_pixel_shuffle = True
        pixel_values = floats_tensor([self.batch_size, self.num_channels, config.image_size, config.image_size])
        labels = None
        return config, pixel_values, labels

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` in the form consumed by the common model tests."""
        config = self.get_config()
        # Use pixel-shuffle so all model classes (including Normal/Pointmap/Matting) instantiate
        # decode_head.input_conv, satisfying the conversion patterns checked by test_reverse_loading_mapping.
        config.head_config.use_pixel_shuffle = True
        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
        inputs_dict = {"pixel_values": pixel_values}
        return config, inputs_dict


@require_torch
class Sapiens2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
    """
    Common-model test suite for Sapiens2.

    Some tests from test_modeling_common.py are overwritten or skipped here, as Sapiens2 does not use
    input_ids, inputs_embeds, attention_mask and seq_length.
    """

    # Both class attributes fall back to empty containers when torch is not installed.
    all_model_classes = (
        (
            Sapiens2Model,
            Sapiens2Backbone,
            Sapiens2ForImageMatting,
            Sapiens2ForNormalEstimation,
            Sapiens2ForPointmapEstimation,
            Sapiens2ForPoseEstimation,
            Sapiens2ForSemanticSegmentation,
        )
        if is_torch_available()
        else ()
    )
    pipeline_model_mapping = {"image-feature-extraction": Sapiens2Model} if is_torch_available() else {}

    test_resize_embeddings = False

    def setUp(self):
        """Create the model/config testers and force deterministic cuDNN kernels for the test."""
        self.model_tester = Sapiens2ModelTester(self)
        self.config_tester = ConfigTester(
            self,
            config_class=Sapiens2Config,
            has_text_modality=False,
            hidden_size=32,
        )
        # ConvTranspose2d layers in the decoder heads are non-deterministic on CUDA, and the
        # InstanceNorm2d layers amplify that into output differences of up to 6e-3 for identical
        # head inputs. cudnn.deterministic is switched on for stability; the previous value is
        # remembered so tearDown can put it back.
        self._original_cudnn_deterministic = torch.backends.cudnn.deterministic
        torch.backends.cudnn.deterministic = True

    def tearDown(self):
        """Restore the cuDNN determinism flag saved in `setUp`."""
        torch.backends.cudnn.deterministic = self._original_cudnn_deterministic

    def test_backbone(self):
        """Shape check for `Sapiens2Backbone`."""
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_backbone(*config_and_inputs)

    def test_config(self):
        """Run the shared configuration tests."""
        self.config_tester.run_common_tests()

    @unittest.skip(reason="Sapiens2 does not use inputs_embeds")
    def test_inputs_embeds(self):
        pass

    def test_model_get_set_embeddings(self):
        """Input embeddings must be an `nn.Module`; output embeddings must be absent or an `nn.Linear`."""
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            self.assertIsInstance(model.get_input_embeddings(), nn.Module)
            output_embeddings = model.get_output_embeddings()
            self.assertTrue(output_embeddings is None or isinstance(output_embeddings, nn.Linear))

    def test_model(self):
        """Shape check for the bare `Sapiens2Model`."""
        config, pixel_values, labels = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(config, pixel_values, labels)

    def test_for_semantic_segmentation(self):
        """Shape check for the semantic segmentation head."""
        config, pixel_values, labels = self.model_tester.prepare_config_and_inputs_for_semantic_segmentation()
        self.model_tester.create_and_check_for_semantic_segmentation(config, pixel_values, labels)

    def test_for_pose_estimation(self):
        """Shape check for the pose estimation head (uses the semantic segmentation inputs)."""
        config, pixel_values, labels = self.model_tester.prepare_config_and_inputs_for_semantic_segmentation()
        self.model_tester.create_and_check_for_pose_estimation(config, pixel_values, labels)

    def test_for_pointmap_estimation(self):
        """Shape check for the pointmap estimation head."""
        config, pixel_values, labels = self.model_tester.prepare_config_and_inputs_for_pointmap_estimation()
        self.model_tester.create_and_check_for_pointmap_estimation(config, pixel_values, labels)

    def test_for_normal_estimation(self):
        """Shape check for the normal estimation head (uses the pointmap estimation inputs)."""
        config, pixel_values, labels = self.model_tester.prepare_config_and_inputs_for_pointmap_estimation()
        self.model_tester.create_and_check_for_normal_estimation(config, pixel_values, labels)

    def test_for_matting(self):
        """Shape and value-range check for the image matting head (uses the pointmap estimation inputs)."""
        config, pixel_values, labels = self.model_tester.prepare_config_and_inputs_for_pointmap_estimation()
        self.model_tester.create_and_check_for_matting(config, pixel_values, labels)

    def test_batching_equivalence(self, atol=1e-4, rtol=1e-4):
        """Run the inherited batching-equivalence test with the tolerances given as defaults here."""
        # The decoder heads use InstanceNorm2d, which computes per-instance statistics. On CPU,
        # different batch sizes can take different parallelisation paths, giving O(1e-5) FP differences.
        super().test_batching_equivalence(atol=atol, rtol=rtol)

    @unittest.skip(reason="Sapiens2 does not support feedforward chunking")
    def test_feed_forward_chunking(self):
        pass


def prepare_img():
    """Load the COCO val2017 fixture image used by the integration tests via `load_image_as_tensor`."""
    fixture_url = (
        "https://huggingface.co/datasets/hf-internal-testing/fixtures-coco/resolve/main/val2017/000000004016.png"
    )
    # The URL goes through url_to_local_path before the image is loaded.
    local_path = url_to_local_path(fixture_url)
    return load_image_as_tensor(local_path)


@require_torch
@require_vision
class Sapiens2ModelIntegrationTest(unittest.TestCase):
    """
    Slow integration tests that run pretrained Sapiens2 checkpoints on a fixture image.

    Each test compares slices of the model outputs and of the post-processed results against stored
    expectations, which are only registered for CUDA devices.
    """

    def setUp(self):
        # The decoder heads contain ConvTranspose2d layers which are non-deterministic on CUDA.
        # This non-deterministic behavior is amplified by the InstanceNorm2d layers and results in up
        # to 6e-3 output differences with identical head inputs. We set cudnn.deterministic = True
        # for test stability.
        self._original_cudnn_deterministic = torch.backends.cudnn.deterministic
        torch.backends.cudnn.deterministic = True

    def tearDown(self):
        # Restore the flag saved in setUp so other tests are unaffected.
        torch.backends.cudnn.deterministic = self._original_cudnn_deterministic

    @cached_property
    def default_image_processor(self):
        """Image processor loaded from the pretraining checkpoint; shared by several tests below."""
        return Sapiens2ImageProcessor.from_pretrained("facebook/sapiens2-pretrain-0.4b")

    @slow
    def test_inference_no_head(self):
        """Check hidden-state shape and CLS/register/patch token slices of the bare model."""
        model = Sapiens2Model.from_pretrained("facebook/sapiens2-pretrain-0.4b").eval().to(torch_device)

        image_processor = self.default_image_processor
        image = prepare_img()
        inputs = image_processor(image, return_tensors="pt").to(torch_device)

        # forward pass
        with torch.no_grad():
            outputs = model(**inputs)

        # verify the last hidden states
        # seq length = num_patches + 1 (CLS token) + num_register_tokens
        _, _, height, width = inputs["pixel_values"].shape
        num_patches = (height // model.config.patch_size) * (width // model.config.patch_size)
        expected_seq_length = num_patches + 1 + model.config.num_register_tokens
        expected_shape = torch.Size((1, expected_seq_length, model.config.hidden_size))
        self.assertEqual(outputs.last_hidden_state.shape, expected_shape)

        last_layer_cls_token = outputs.pooler_output
        EXPECTED_CLS_SLICE = Expectations({("cuda", None): [-0.09233, -0.00107, -0.12215, 0.07374, -0.03773]})
        expected_cls_slice = torch.tensor(EXPECTED_CLS_SLICE.get_expectation(), device=torch_device)
        torch.testing.assert_close(last_layer_cls_token[0, :5], expected_cls_slice, rtol=1e-3, atol=1e-3)

        last_layer_register_tokens = outputs.last_hidden_state[:, 1 : model.config.num_register_tokens + 1]
        EXPECTED_REGISTER_SLICE = Expectations({("cuda", None): [0.08412, 0.04387, 0.05709, -0.04962, 0.03715]})
        expected_register_slice = torch.tensor(EXPECTED_REGISTER_SLICE.get_expectation(), device=torch_device)
        torch.testing.assert_close(last_layer_register_tokens[0, 0, :5], expected_register_slice, rtol=1e-3, atol=1e-3)

        last_layer_patch_tokens = outputs.last_hidden_state[:, model.config.num_register_tokens + 1 :]
        EXPECTED_PATCH_SLICE = Expectations({("cuda", None): [0.14232, -0.11947, -0.05910, -0.09457, -0.11410]})
        expected_patch_slice = torch.tensor(EXPECTED_PATCH_SLICE.get_expectation(), device=torch_device)
        torch.testing.assert_close(last_layer_patch_tokens[0, 0, :5], expected_patch_slice, rtol=1e-3, atol=1e-3)

    @slow
    def test_inference_semantic_segmentation(self):
        """Check segmentation logits and `post_process_semantic_segmentation` with and without resizing."""
        model = Sapiens2ForSemanticSegmentation.from_pretrained("facebook/sapiens2-seg-0.4b").eval().to(torch_device)

        image_processor = self.default_image_processor
        image = prepare_img()
        inputs = image_processor(image, return_tensors="pt").to(torch_device)

        # forward pass
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits

        # verify the logits shape: segmentation head upsamples back to the original image resolution
        _, _, height, width = inputs["pixel_values"].shape
        expected_shape = torch.Size((1, model.config.num_labels, height, width))
        self.assertEqual(logits.shape, expected_shape)

        EXPECTED_LOGITS_SLICE = Expectations(
            {("cuda", None): [[3.45260, 5.55483, 6.57901], [5.71913, 7.21420, 8.11209], [6.82645, 7.98208, 8.31385]]}
        )
        expected_logits_slice = torch.tensor(EXPECTED_LOGITS_SLICE.get_expectation(), device=torch_device)
        torch.testing.assert_close(logits[0, 0, :3, :3], expected_logits_slice, rtol=1e-3, atol=1e-3)

        # verify post-processing without resizing: output shape matches model input resolution
        segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs)
        self.assertEqual(len(segmentation), 1)
        self.assertEqual(segmentation[0].shape, torch.Size([height, width]))

        # verify post-processing with target_sizes
        target_size = (height // 2, width // 2)
        segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs, target_sizes=[target_size])
        self.assertEqual(len(segmentation), 1)
        self.assertEqual(segmentation[0].shape, torch.Size(target_size))

        EXPECTED_CLASS_IDS = Expectations({("cuda", None): [[4, 3, 3], [3, 3, 3], [3, 3, 3]]})
        expected_class_ids = torch.tensor(EXPECTED_CLASS_IDS.get_expectation(), device=torch_device)
        torch.testing.assert_close(segmentation[0][50:53, 50:53], expected_class_ids)

    @require_cv2
    @slow
    def test_inference_pose_estimation(self):
        """Check pose heatmaps and `post_process_pose_estimation`, including rescaling and flip inputs."""
        model = Sapiens2ForPoseEstimation.from_pretrained("facebook/sapiens2-pose-0.4b").eval().to(torch_device)

        image_processor = self.default_image_processor
        image = prepare_img()

        image_height, image_width = image.shape[-2:]

        # person bbox in COCO format (x, y, w, h)
        boxes = [[[2.7080630e02, 5.7221174e-01, 2.9409006e02, 3.7946970e02]]]
        inputs = image_processor(image, boxes=boxes, return_tensors="pt").to(torch_device)

        with torch.no_grad():
            outputs = model(**inputs)

        heatmaps = outputs.heatmaps
        self.assertEqual(heatmaps.shape, torch.Size([1, model.config.num_labels, 256, 192]))
        EXPECTED_HEATMAPS = Expectations(
            {("cuda", None): [[0.26140, 0.24656, 0.21673], [0.33708, 0.31597, 0.28028], [0.41624, 0.39270, 0.35014]]}
        )
        expected_heatmaps = torch.tensor(EXPECTED_HEATMAPS.get_expectation(), device=torch_device)
        torch.testing.assert_close(heatmaps[0, 0, 70:73, 70:73], expected_heatmaps, rtol=1e-2, atol=1e-2)

        results = image_processor.post_process_pose_estimation(outputs, boxes=boxes)
        self.assertEqual(len(results), 1)
        self.assertEqual(len(results[0]), 1)
        person = results[0][0]

        keypoints = person["keypoints"]
        EXPECTED_KEYPOINTS = Expectations(
            {("cuda", None): [[364.33920111, 97.92528764], [373.25104943, 80.97749201], [353.21072316, 83.38954486]]}
        )
        expected_keypoints = torch.tensor(EXPECTED_KEYPOINTS.get_expectation(), device=torch_device)
        torch.testing.assert_close(keypoints[:3], expected_keypoints, rtol=1e-2, atol=1e-2)

        scores = person["scores"]
        EXPECTED_SCORES = Expectations({("cuda", None): [1.0007433, 0.9987416, 1.0015154]})
        expected_scores = torch.tensor(EXPECTED_SCORES.get_expectation(), device=torch_device)
        torch.testing.assert_close(scores[:3], expected_scores, rtol=1e-2, atol=1e-2)

        # The returned bbox is compared against the input box converted from (x, y, w, h) to (x1, y1, x2, y2).
        bbox = person["bbox"]
        expected_bbox_xywh = torch.tensor(boxes[0][0], device=torch_device)
        expected_bbox_xyxy = torch.tensor(
            [
                expected_bbox_xywh[0],
                expected_bbox_xywh[1],
                expected_bbox_xywh[0] + expected_bbox_xywh[2],
                expected_bbox_xywh[1] + expected_bbox_xywh[3],
            ],
            device=torch_device,
        )
        torch.testing.assert_close(bbox, expected_bbox_xyxy, rtol=1e-3, atol=1e-3)

        # target_sizes without source_sizes must raise
        with self.assertRaises(ValueError):
            image_processor.post_process_pose_estimation(
                outputs, boxes=boxes, target_sizes=[(image_height, image_width)]
            )

        # source_sizes + target_sizes: keypoints and bbox scaled by target/source
        target_height, target_width = image_height * 2, image_width * 2
        results_scaled = image_processor.post_process_pose_estimation(
            outputs,
            boxes=boxes,
            source_sizes=[(image_height, image_width)],
            target_sizes=[(target_height, target_width)],
        )
        torch.testing.assert_close(results_scaled[0][0]["keypoints"], keypoints * 2.0)
        torch.testing.assert_close(results_scaled[0][0]["bbox"], expected_bbox_xyxy * 2.0)

        # Test flipping
        flipped_inputs = {"pixel_values": inputs["pixel_values"].flip(-1)}
        flip_pairs = torch.tensor(model.config.flip_pairs)

        with torch.no_grad():
            flipped_outputs = model(**flipped_inputs, flip_pairs=flip_pairs)

        flipped_heatmaps = flipped_outputs.heatmaps
        EXPECTED_FLIPPED_HEATMAPS = Expectations(
            {("cuda", None): [[0.27348, 0.25426, 0.22496], [0.34877, 0.32563, 0.28418], [0.43967, 0.40607, 0.35721]]}
        )
        expected_flipped_heatmaps = torch.tensor(EXPECTED_FLIPPED_HEATMAPS.get_expectation(), device=torch_device)
        torch.testing.assert_close(
            flipped_heatmaps[0, 0, 70:73, 70:73], expected_flipped_heatmaps, rtol=1e-2, atol=1e-2
        )

        final_results = image_processor.post_process_pose_estimation(
            outputs, outputs_flipped=flipped_outputs, boxes=boxes
        )
        self.assertEqual(len(final_results), 1)
        self.assertEqual(len(final_results[0]), 1)

        final_person = final_results[0][0]
        final_keypoints = final_person["keypoints"]
        EXPECTED_FINAL_KEYPOINTS = Expectations(
            {("cuda", None): [[364.14644305, 97.99268751], [373.66756367, 81.19966519], [353.4574526, 83.647911]]}
        )
        expected_final_keypoints = torch.tensor(EXPECTED_FINAL_KEYPOINTS.get_expectation(), device=torch_device)
        torch.testing.assert_close(final_keypoints[:3], expected_final_keypoints, rtol=1e-2, atol=1e-2)

        final_scores = final_person["scores"]
        EXPECTED_FINAL_SCORES = Expectations({("cuda", None): [1.0064079, 0.98746514, 0.99821794]})
        expected_final_scores = torch.tensor(EXPECTED_FINAL_SCORES.get_expectation(), device=torch_device)
        torch.testing.assert_close(final_scores[:3], expected_final_scores, rtol=1e-2, atol=1e-2)

        final_bbox = final_person["bbox"]
        torch.testing.assert_close(final_bbox, expected_bbox_xyxy, rtol=1e-3, atol=1e-3)

    @slow
    def test_inference_normal_estimation(self):
        """Check raw normals and the output of `post_process_normal_estimation`."""
        model = Sapiens2ForNormalEstimation.from_pretrained("facebook/sapiens2-normal-0.4b").eval().to(torch_device)

        image_processor = Sapiens2ImageProcessor.from_pretrained("facebook/sapiens2-normal-0.4b")
        image = prepare_img()
        image_height, image_width = image.shape[-2:]
        inputs = image_processor(image, return_tensors="pt").to(torch_device)

        with torch.no_grad():
            outputs = model(**inputs)

        _, _, height, width = inputs["pixel_values"].shape
        self.assertEqual(outputs.normals.shape, torch.Size([1, 3, height, width]))

        # We can get closer to expected values by using cv2 resize instead of torchvision.
        EXPECTED_NORMALS = Expectations(
            {("cuda", None): [[0.9577, 1.8808, 0.9826], [1.6904, 1.7351, 1.9120], [2.4828, 1.9887, 2.5168]]}
        )
        expected_normals = torch.tensor(EXPECTED_NORMALS.get_expectation(), device=torch_device)
        torch.testing.assert_close(outputs.normals[0, 0, :3, :3], expected_normals, rtol=1e-2, atol=1e-2)

        result = image_processor.post_process_normal_estimation(outputs, source_sizes=[(image_height, image_width)])
        self.assertEqual(len(result), 1)
        # Compare against the measured source size (as the pointmap test does) rather than literal numbers.
        self.assertEqual(result[0]["normals"].shape, torch.Size([3, image_height, image_width]))

        EXPECTED_POSTPROCESSED_NORMALS = Expectations(
            {("cuda", None): [[-0.8266, -0.7899, -0.7512], [-0.8227, -0.7843, -0.7440], [-0.8098, -0.7721, -0.7318]]}
        )
        expected_postprocessed_normals = torch.tensor(
            EXPECTED_POSTPROCESSED_NORMALS.get_expectation(), device=torch_device
        )
        torch.testing.assert_close(
            result[0]["normals"][0, :3, :3], expected_postprocessed_normals, rtol=1e-2, atol=1e-2
        )

    @slow
    def test_inference_pointmap_estimation(self):
        """Check raw pointmaps/scales and the output of `post_process_pointmap_estimation`."""
        model = (
            Sapiens2ForPointmapEstimation.from_pretrained("facebook/sapiens2-pointmap-0.4b").eval().to(torch_device)
        )

        image_processor = Sapiens2ImageProcessor.from_pretrained("facebook/sapiens2-pointmap-0.4b")
        image = prepare_img()
        image_height, image_width = image.shape[-2:]
        inputs = image_processor(image, return_tensors="pt").to(torch_device)

        with torch.no_grad():
            outputs = model(**inputs)

        self.assertIsInstance(outputs, Sapiens2PointmapEstimatorOutput)
        _, _, height, width = inputs["pixel_values"].shape
        self.assertEqual(outputs.pointmaps.shape, torch.Size([1, 3, height, width]))
        self.assertEqual(outputs.scales.shape, torch.Size([1, 1]))

        EXPECTED_SCALE = Expectations({("cuda", None): [[0.9931]]})
        expected_scale = torch.tensor(EXPECTED_SCALE.get_expectation(), device=torch_device)
        torch.testing.assert_close(outputs.scales, expected_scale, rtol=1e-3, atol=1e-3)

        EXPECTED_POINTMAP = Expectations(
            {("cuda", None): [[-0.0096, -0.0567, -0.0460], [-0.0657, -0.0583, -0.0688], [-0.1035, -0.0363, -0.0659]]}
        )
        expected_pointmap = torch.tensor(EXPECTED_POINTMAP.get_expectation(), device=torch_device)
        torch.testing.assert_close(outputs.pointmaps[0, 0, :3, :3], expected_pointmap, rtol=1e-2, atol=1e-2)

        result = image_processor.post_process_pointmap_estimation(outputs, source_sizes=[(image_height, image_width)])
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0]["pointmap"].shape, torch.Size([3, image_height, image_width]))

        # Head and post-processing are exactly identical to original code but differences from backbone
        # get amplified after scaling and resizing so we need to relax the tolerance here.
        EXPECTED_POSTPROCESSED_POINTMAP = Expectations(
            {("cuda", None): [[0.0771, 0.1335, 0.3025], [-0.1179, 0.2904, 0.7140], [0.0337, 0.3037, 0.4390]]}
        )
        expected_postprocessed_pointmap = torch.tensor(
            EXPECTED_POSTPROCESSED_POINTMAP.get_expectation(), device=torch_device
        )
        torch.testing.assert_close(
            result[0]["pointmap"][0, :3, :3], expected_postprocessed_pointmap, rtol=1e-2, atol=1e-2
        )

    @slow
    def test_inference_matting(self):
        """Check raw foregrounds/alphas and the alpha, foreground and composite from `post_process_image_matting`."""
        model = Sapiens2ForImageMatting.from_pretrained("facebook/sapiens2-matting-1b").eval().to(torch_device)

        image_processor = self.default_image_processor
        image = prepare_img()
        image_height, image_width = image.shape[-2:]
        inputs = image_processor(image, return_tensors="pt").to(torch_device)

        with torch.no_grad():
            outputs = model(**inputs)

        self.assertIsInstance(outputs, Sapiens2ImageMattingOutput)
        _, _, height, width = inputs["pixel_values"].shape
        self.assertEqual(outputs.foregrounds.shape, torch.Size([1, 3, height, width]))
        self.assertEqual(outputs.alphas.shape, torch.Size([1, 1, height, width]))

        # Difference due to cv2 vs torchvision pre-processing. Model outputs are equal on same tensor input.
        EXPECTED_FOREGROUNDS = Expectations(
            {("cuda", None): [[0.1432, 0.2051, 0.3043], [0.1889, 0.2681, 0.3509], [0.2511, 0.3076, 0.4047]]}
        )
        expected_foregrounds = torch.tensor(EXPECTED_FOREGROUNDS.get_expectation(), device=torch_device)
        torch.testing.assert_close(
            outputs.foregrounds[0, 0, 100:103, 100:103], expected_foregrounds, rtol=1e-2, atol=1e-2
        )

        # Per-channel background values, shaped (3, 1, 1), passed to the post-processor for compositing.
        background = torch.tensor([177, 64, 0], device=torch_device).view(3, 1, 1)
        result = image_processor.post_process_image_matting(
            outputs, target_sizes=[(image_height, image_width)], backgrounds=background
        )
        self.assertEqual(len(result), 1)

        alpha = result[0]["alpha"]
        foreground = result[0]["foreground"]
        composite = result[0]["composite"]
        self.assertEqual(alpha.shape, (1, image_height, image_width))
        self.assertEqual(foreground.shape, (3, image_height, image_width))
        self.assertEqual(composite.shape, (3, image_height, image_width))

        EXPECTED_ALPHA = Expectations(
            {
                ("cuda", None): [
                    [0.99995, 0.9999123, 0.9997628],
                    [0.99991906, 0.9997431, 0.99754137],
                    [0.9997362, 0.99711365, 0.9444071],
                ]
            }
        )
        expected_alpha = torch.tensor(EXPECTED_ALPHA.get_expectation(), device=torch_device)
        torch.testing.assert_close(alpha[0, 300:303, 300:303], expected_alpha, rtol=1e-3, atol=1e-3)

        EXPECTED_FOREGROUND = Expectations(
            {
                ("cuda", None): [
                    [0.7175647, 0.6906685, 0.65860075],
                    [0.7162684, 0.6867891, 0.64463294],
                    [0.6924842, 0.67141336, 0.5356377],
                ]
            }
        )
        expected_foreground = torch.tensor(EXPECTED_FOREGROUND.get_expectation(), device=torch_device)
        torch.testing.assert_close(foreground[0, 300:303, 300:303], expected_foreground, rtol=1e-2, atol=1e-2)

        # The composite is compared as uint8 with an absolute tolerance of one intensity level.
        EXPECTED_COMPOSITE = Expectations({("cuda", None): [[182, 176, 167], [182, 175, 164], [176, 171, 136]]})
        expected_composite = torch.tensor(EXPECTED_COMPOSITE.get_expectation(), dtype=torch.uint8, device=torch_device)
        torch.testing.assert_close(composite[0, 300:303, 300:303], expected_composite, rtol=0, atol=1)


@require_torch
class Sapiens2BackboneTest(unittest.TestCase, BackboneTesterMixin):
    """Test case that combines `unittest.TestCase` with `BackboneTesterMixin` for `Sapiens2Backbone`."""

    # Backbone classes under test; an empty tuple when torch is not available.
    all_model_classes = (Sapiens2Backbone,) if is_torch_available() else ()
    # Configuration class for the backbone (presumably read by the mixin — confirm in test_backbone_common).
    config_class = Sapiens2Config

    def setUp(self):
        # Reuse the tiny-model tester defined in this file.
        self.model_tester = Sapiens2ModelTester(self)