first commit
Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled

This commit is contained in:
陈赣
2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions

View File

View File

@@ -0,0 +1,357 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import json
import tempfile
import unittest
import numpy as np
import requests
from transformers.image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD
from transformers.models.video_llama_3.image_processing_video_llama_3 import smart_resize
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs, prepare_video_inputs
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
class VideoLlama3ImageProcessingTester:
def __init__(
self,
parent,
batch_size=7,
num_channels=3,
num_frames=10,
min_resolution=56,
max_resolution=1024,
min_pixels=14 * 14 * 16,
max_pixels=14 * 14 * 16384,
do_normalize=True,
image_mean=IMAGENET_STANDARD_MEAN,
image_std=IMAGENET_STANDARD_STD,
do_resize=True,
patch_size=14,
merge_size=1,
do_convert_rgb=True,
):
self.parent = parent
self.batch_size = batch_size
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.num_channels = num_channels
self.num_frames = num_frames
self.image_mean = image_mean
self.image_std = image_std
self.min_pixels = min_pixels
self.max_pixels = max_pixels
self.patch_size = patch_size
self.merge_size = merge_size
self.do_resize = do_resize
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.do_convert_rgb = do_convert_rgb
def prepare_image_processor_dict(self):
return {
"do_resize": self.do_resize,
"image_mean": self.image_mean,
"image_std": self.image_std,
"min_pixels": self.min_pixels,
"max_pixels": self.max_pixels,
"patch_size": self.patch_size,
"merge_size": self.merge_size,
}
def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
images = prepare_image_inputs(
batch_size=self.batch_size,
num_channels=self.num_channels,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=equal_resolution,
numpify=numpify,
torchify=torchify,
)
return [[image] for image in images]
def prepare_video_inputs(self, equal_resolution=False, numpify=False, torchify=False):
return prepare_video_inputs(
batch_size=self.batch_size,
num_channels=self.num_channels,
num_frames=self.num_frames,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=equal_resolution,
numpify=numpify,
torchify=torchify,
)
@require_torch
@require_vision
class VideoLlama3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
def setUp(self):
super().setUp()
self.image_processor_tester = VideoLlama3ImageProcessingTester(self)
@property
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def test_image_processor_properties(self):
for image_processing_class in self.image_processing_classes.values():
image_processing = image_processing_class(**self.image_processor_dict)
self.assertTrue(hasattr(image_processing, "do_normalize"))
self.assertTrue(hasattr(image_processing, "image_mean"))
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_resize"))
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
self.assertTrue(hasattr(image_processing, "patch_size"))
self.assertTrue(hasattr(image_processing, "merge_size"))
def test_image_processor_to_json_string(self):
for image_processing_class in self.image_processing_classes.values():
image_processor = image_processing_class(**self.image_processor_dict)
obj = json.loads(image_processor.to_json_string())
for key, value in self.image_processor_dict.items():
if key not in ["min_pixels", "max_pixels"]:
self.assertEqual(obj[key], value)
def test_select_best_resolution(self):
# Test with a final resize resolution
best_resolution = smart_resize(561, 278, factor=28)
self.assertEqual(best_resolution, (560, 280))
def test_call_pil(self):
for image_processing_class in self.image_processing_classes.values():
# Initialize image_processing
image_processing = image_processing_class(**self.image_processor_dict)
# create random PIL images
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
for image in image_inputs:
self.assertIsInstance(image[0], Image.Image)
# Test not batched input
process_out = image_processing(image_inputs[0], return_tensors="pt")
encoded_images = process_out.pixel_values
image_grid_thws = process_out.image_grid_thw
expected_output_image_shape = (5329, 588)
expected_image_grid_thws = torch.Tensor([[1, 73, 73]])
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
# Test batched
process_out = image_processing(image_inputs, return_tensors="pt")
encoded_images = process_out.pixel_values
image_grid_thws = process_out.image_grid_thw
expected_output_image_shape = (37303, 588)
expected_image_grid_thws = torch.Tensor([[1, 73, 73]] * 7)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
def test_call_numpy(self):
for image_processing_class in self.image_processing_classes.values():
# Initialize image_processing
image_processing = image_processing_class(**self.image_processor_dict)
# create random numpy tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True)
for image in image_inputs:
self.assertIsInstance(image[0], np.ndarray)
# Test not batched input
process_out = image_processing(image_inputs[0], return_tensors="pt")
encoded_images = process_out.pixel_values
image_grid_thws = process_out.image_grid_thw
expected_output_image_shape = (5329, 588)
expected_image_grid_thws = torch.Tensor([[1, 73, 73]])
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
# Test batched
process_out = image_processing(image_inputs, return_tensors="pt")
encoded_images = process_out.pixel_values
image_grid_thws = process_out.image_grid_thw
expected_output_image_shape = (37303, 588)
expected_image_grid_thws = torch.Tensor([[1, 73, 73]] * 7)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
def test_call_pytorch(self):
for image_processing_class in self.image_processing_classes.values():
# Initialize image_processing
image_processing = image_processing_class(**self.image_processor_dict)
# create random PyTorch tensors
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True)
for image in image_inputs:
self.assertIsInstance(image[0], torch.Tensor)
# Test not batched input
process_out = image_processing(image_inputs[0], return_tensors="pt")
encoded_images = process_out.pixel_values
image_grid_thws = process_out.image_grid_thw
expected_output_image_shape = (5329, 588)
expected_image_grid_thws = torch.Tensor([[1, 73, 73]])
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
# Test batched
process_out = image_processing(image_inputs, return_tensors="pt")
encoded_images = process_out.pixel_values
image_grid_thws = process_out.image_grid_thw
expected_output_image_shape = (37303, 588)
expected_image_grid_thws = torch.Tensor([[1, 73, 73]] * 7)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
@unittest.skip(reason="VideoLlama3ImageProcessor doesn't treat 4 channel PIL and numpy consistently yet")
def test_call_numpy_4_channels(self):
pass
def test_nested_input(self):
for image_processing_class in self.image_processing_classes.values():
image_processing = image_processing_class(**self.image_processor_dict)
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
# Test batched as a list of images
process_out = image_processing(image_inputs, return_tensors="pt")
encoded_images = process_out.pixel_values
image_grid_thws = process_out.image_grid_thw
expected_output_image_shape = (37303, 588)
expected_image_grid_thws = torch.Tensor([[1, 73, 73]] * 7)
self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
# Test batched as a nested list of images, where each sublist is one batch
image_inputs_nested = image_inputs[:3] + image_inputs[3:]
process_out = image_processing(image_inputs_nested, return_tensors="pt")
encoded_images_nested = process_out.pixel_values
image_grid_thws_nested = process_out.image_grid_thw
expected_output_image_shape = (37303, 588)
expected_image_grid_thws = torch.Tensor([[1, 73, 73]] * 7)
self.assertEqual(tuple(encoded_images_nested.shape), expected_output_image_shape)
self.assertTrue((image_grid_thws == expected_image_grid_thws).all())
# Image processor should return same pixel values, independently of ipnut format
self.assertTrue((encoded_images_nested == encoded_images).all())
self.assertTrue((image_grid_thws_nested == expected_image_grid_thws).all())
@unittest.skip(
reason="`VideoLlama3ImageProcessor` works only with image inputs and doesn't process videos anymore."
)
def test_video_inputs(self):
pass
def test_custom_image_size(self):
for image_processing_class in self.image_processing_classes.values():
image_processing = image_processing_class(**self.image_processor_dict)
with tempfile.TemporaryDirectory() as tmpdirname:
image_processing.save_pretrained(tmpdirname)
image_processor_loaded = image_processing_class.from_pretrained(
tmpdirname, max_pixels=56 * 56, min_pixels=28 * 28
)
image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True)
process_out = image_processor_loaded(image_inputs, return_tensors="pt")
expected_output_video_shape = [112, 588]
self.assertListEqual(list(process_out.pixel_values.shape), expected_output_video_shape)
def test_custom_pixels(self):
pixel_choices = frozenset(itertools.product((100, 150, 200, 20000), (100, 150, 200, 20000)))
for image_processing_class in self.image_processing_classes.values():
image_processor_dict = self.image_processor_dict.copy()
for a_pixels, b_pixels in pixel_choices:
image_processor_dict["min_pixels"] = min(a_pixels, b_pixels)
image_processor_dict["max_pixels"] = max(a_pixels, b_pixels)
image_processor = image_processing_class(**image_processor_dict)
image_inputs = self.image_processor_tester.prepare_image_inputs()
# Just checking that it doesn't raise an error
image_processor(image_inputs, return_tensors="pt")
@require_vision
@require_torch
def test_backends_equivalence(self):
"""Override to also compare image_grid_thw across backends."""
if len(self.image_processing_classes) < 2:
self.skipTest(reason="Skipping backends equivalence test as there are less than 2 backends")
dummy_image = Image.open(
requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw
)
encodings = {}
for backend_name, image_processing_class in self.image_processing_classes.items():
image_processor = image_processing_class(**self.image_processor_dict)
encodings[backend_name] = image_processor(dummy_image, return_tensors="pt")
backend_names = list(encodings.keys())
reference_backend = backend_names[0]
reference = encodings[reference_backend]
for backend_name in backend_names[1:]:
encoding = encodings[backend_name]
self._assert_tensors_equivalence(reference.pixel_values, encoding.pixel_values)
self.assertEqual(reference.image_grid_thw.dtype, encoding.image_grid_thw.dtype)
self._assert_tensors_equivalence(reference.image_grid_thw.float(), encoding.image_grid_thw.float())
@require_vision
@require_torch
def test_backends_equivalence_batched(self):
"""Override to also compare image_grid_thw across backends."""
if len(self.image_processing_classes) < 2:
self.skipTest(reason="Skipping backends equivalence test as there are less than 2 backends")
if hasattr(self.image_processor_tester, "do_center_crop") and self.image_processor_tester.do_center_crop:
self.skipTest(
reason="Skipping as do_center_crop is True and center_crop functions are not equivalent for fast and slow processors"
)
dummy_images = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True)
encodings = {}
for backend_name, image_processing_class in self.image_processing_classes.items():
image_processor = image_processing_class(**self.image_processor_dict)
encodings[backend_name] = image_processor(dummy_images, return_tensors="pt")
backend_names = list(encodings.keys())
reference_backend = backend_names[0]
reference = encodings[reference_backend]
for backend_name in backend_names[1:]:
encoding = encodings[backend_name]
self._assert_tensors_equivalence(reference.pixel_values, encoding.pixel_values)
self.assertEqual(reference.image_grid_thw.dtype, encoding.image_grid_thw.dtype)
self._assert_tensors_equivalence(reference.image_grid_thw.float(), encoding.image_grid_thw.float())
def test_get_num_patches_without_images(self):
for image_processing_class in self.image_processing_classes.values():
image_processing = image_processing_class(**self.image_processor_dict)
num_patches = image_processing.get_number_of_image_patches(height=100, width=100, images_kwargs={})
self.assertEqual(num_patches, 49)
num_patches = image_processing.get_number_of_image_patches(height=200, width=50, images_kwargs={})
self.assertEqual(num_patches, 56)
num_patches = image_processing.get_number_of_image_patches(
height=100, width=100, images_kwargs={"patch_size": 28}
)
self.assertEqual(num_patches, 16)

View File

@@ -0,0 +1,999 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch VideoLLaMA3 model."""
import copy
import gc
import inspect
import tempfile
import unittest
import numpy as np
import pytest
import requests
import torch.nn as nn
from parameterized import parameterized
from PIL import Image
from transformers import (
AutoProcessor,
VideoLlama3Config,
VideoLlama3ForConditionalGeneration,
VideoLlama3Model,
VideoLlama3VisionConfig,
VideoLlama3VisionModel,
is_torch_available,
)
from transformers.testing_utils import (
Expectations,
backend_empty_cache,
require_flash_attn,
require_torch,
require_torch_accelerator,
set_config_for_less_flaky_test,
set_model_for_less_flaky_test,
slow,
torch_device,
)
from transformers.utils import (
is_torch_bf16_available_on_device,
is_torch_fp16_available_on_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION,
ModelTesterMixin,
floats_tensor,
ids_tensor,
sdpa_kernel,
)
if is_torch_available():
import torch
def _test_encoder_eager_matches_sdpa_inference(
self,
dtype,
output_attentions,
enable_kernels,
atols=None,
rtols=None,
):
"""
This test is written as a regular function to be able to overload it easily with different tolerances.
Otherwise, `parameterize.expand` prevents it as it removes the original function from the namespace.
"""
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
if not self.all_model_classes[0]._supports_sdpa:
self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
# convert shorthand name to torch.dtype
if dtype == "fp16":
dtype = torch.float16
elif dtype == "bf16":
dtype = torch.bfloat16
elif dtype == "fp32":
dtype = torch.float32
if not is_torch_fp16_available_on_device(torch_device) and dtype == torch.float16:
self.skipTest(f"float16 not supported on {torch_device} (on the specific device currently used)")
if not is_torch_bf16_available_on_device(torch_device) and dtype == torch.bfloat16:
self.skipTest(
f"bfloat16 not supported on {torch_device} (on the specific device currently used, e.g. Nvidia T4 GPU)"
)
# Dictionary of tolerances for eager <> sdpa tests. Key = (device, sdpa_kernels_enabled, dtype)
if atols is None:
atols = {
("cpu", False, torch.float32): 1e-6,
("cpu", False, torch.float16): 5e-3,
("cpu", False, torch.bfloat16): 1e-2,
("cpu", True, torch.float32): 1e-6,
("cpu", True, torch.float16): 5e-3,
("cpu", True, torch.bfloat16): 1e-2,
("cuda", False, torch.float32): 1e-6,
("cuda", False, torch.bfloat16): 1e-2,
("cuda", False, torch.float16): 5e-3,
("cuda", True, torch.float32): 1e-6,
("cuda", True, torch.bfloat16): 1e-2,
("cuda", True, torch.float16): 5e-3,
}
if rtols is None:
rtols = {
("cpu", False, torch.float32): 1e-4,
("cpu", False, torch.float16): 5e-3,
("cpu", False, torch.bfloat16): 1e-2,
("cpu", True, torch.float32): 1e-4,
("cpu", True, torch.float16): 5e-3,
("cpu", True, torch.bfloat16): 1e-2,
("cuda", False, torch.float32): 1e-4,
("cuda", False, torch.bfloat16): 1e-2,
("cuda", False, torch.float16): 5e-3,
("cuda", True, torch.float32): 1e-4,
("cuda", True, torch.bfloat16): 3e-2, # (different from others)
("cuda", True, torch.float16): 5e-3,
}
for model_class in self.all_model_classes:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
set_config_for_less_flaky_test(config)
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model_from_pretrained_kwargs = {
"pretrained_model_name_or_path": tmpdirname,
"dtype": dtype,
}
if hasattr(config, "use_mask_token") or "use_mask_token" in inspect.signature(model.__init__).parameters:
model_from_pretrained_kwargs["use_mask_token"] = True
# TODO: remove this try/except, models should have a shared API
try:
model_sdpa = model_class.from_pretrained(**model_from_pretrained_kwargs, attn_implementation="sdpa")
except ValueError:
model_sdpa = model_class.from_pretrained(**model_from_pretrained_kwargs)
model_sdpa = model_sdpa.eval().to(torch_device)
model_eager = model_class.from_pretrained(**model_from_pretrained_kwargs, attn_implementation="eager")
model_eager = model_eager.eval().to(torch_device)
set_model_for_less_flaky_test(model_eager)
set_model_for_less_flaky_test(model_sdpa)
# TODO: if we can also check with `batch_size=1` without being flaky?
for batch_size in [7]:
input_data_batch_size = batch_size
processed_inputs = {}
processed_inputs[model.main_input_name] = inputs_dict[model.main_input_name]
for key in getattr(self, "additional_model_inputs", []):
# Some models don't have all `additional_model_inputs`, especially when we
# craft cases to test model in different settings
if key in inputs_dict:
processed_inputs[key] = inputs_dict[key]
for key, value in processed_inputs.items():
if torch.is_floating_point(value):
value = value.to(dtype)
if key == "pixel_values":
continue
# extend value to have at least `input_data_batch_size` elements
if value.shape[0] < input_data_batch_size:
size = (input_data_batch_size - value.shape[0], *value.shape[1:])
if key == "grid_thw":
extension = torch.randint(high=5, size=size, dtype=value.dtype, device=torch_device)
elif key == "merge_sizes":
extension = torch.ones(size=size, dtype=value.dtype, device=torch_device)
value = torch.cat((value, extension), dim=0).to(torch_device)
processed_inputs[key] = value[:input_data_batch_size]
pixel_values = processed_inputs["pixel_values"]
target_len = torch.sum(processed_inputs["grid_thw"].prod(dim=1) // (processed_inputs["merge_sizes"] ** 2))
if pixel_values.size(0) < target_len:
size = (input_data_batch_size - value.shape[0], *value.shape[1:])
extension = torch.randn(
size=(target_len - pixel_values.size(0)), dtype=pixel_values.dtype, device=torch_device
)
elif pixel_values.size(0) > target_len:
pixel_values = pixel_values[:target_len]
processed_inputs["pixel_values"] = pixel_values
processed_inputs.update(
{
"output_hidden_states": True,
"output_attentions": output_attentions,
}
)
# TODO: test gradients as well (& for FA2 as well!)
with torch.no_grad():
with sdpa_kernel(
enable_flash=enable_kernels,
enable_math=True,
enable_mem_efficient=enable_kernels,
):
prepared_inputs = self._prepare_for_class(processed_inputs, model_class)
prepared_inputs = {
k: v.to(torch_device) if isinstance(v, torch.Tensor) else v for k, v in prepared_inputs.items()
}
outputs_eager = model_eager(**prepared_inputs)
outputs_sdpa = model_sdpa(**prepared_inputs)
key = "hidden_states"
# TODO: rename logits -> hidden_states
logits_eager = outputs_eager[key][-1]
logits_sdpa = outputs_sdpa[key][-1]
if torch_device in ["cpu", "cuda"]:
atol = atols[torch_device, enable_kernels, dtype]
rtol = rtols[torch_device, enable_kernels, dtype]
elif torch_device == "hpu":
atol = atols["cuda", enable_kernels, dtype]
rtol = rtols["cuda", enable_kernels, dtype]
elif torch_device == "xpu":
# As of PyTorch 2.5 XPU backend supports only torch.nn.attention.SDPBackend.MATH
# which is implemented on PyTorch level using aten operators and is
# device agnostic with respect to implementation of each aten operator.
atol = atols["cuda", False, dtype]
rtol = rtols["cuda", False, dtype]
else:
atol = 1e-7
rtol = 1e-4
# Avoid test flakiness with bf16!
# bf16 is not good at precision when the magnitude is larger. We have some models like `SiglipVision` with
# this test passing all the time for fp32/fp16 but flaky with bf16. Furthermore, `llama` and `clip` have
# this test passing all the time for bf16: it turns out their outputs are of smaller size (0.1 and 1.0)
# while `siglip` has outputs with maximal values around 3.0/4.0.
outputs_magnitude = float(
(torch.max(logits_sdpa.abs().amax(), logits_eager.abs().amax())).detach().to("cpu")
)
# The choice of `3e-2` in `outputs_magnitude * 1e-2` might not work if a model has even more larger outputs.
# (we can try to analyze the `rtol` more closely element-wise in the future and adjust the `rtol` instead of `atol`).
computed_atol = outputs_magnitude * 3e-2
if dtype == torch.bfloat16:
atol = max(atol, computed_atol)
results = [
torch.allclose(_logits_sdpa, _logits_eager, atol=atol, rtol=rtol)
for (_logits_sdpa, _logits_eager) in zip(logits_sdpa, logits_eager)
]
# If 80% batch elements have matched results, it's fine
if np.mean(results) < 0.8:
mean_relative_diff = ((logits_sdpa - logits_eager).abs() / (logits_eager.abs() + 1e-12)).mean()
raise ValueError(
f"mean relative difference for {key}: {mean_relative_diff:.3e}, torch atol = {atol}, torch rtol = "
f"{rtol}"
)
class VideoLlama3VisionModelTester:
def __init__(
self,
parent,
batch_size=12,
patch_size=2,
num_channels=3,
image_size=14,
is_training=True,
hidden_size=64,
num_hidden_layers=2,
num_attention_heads=4,
intermediate_size=37,
attention_dropout=0.1,
initializer_range=0.02,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.patch_size = patch_size
self.num_channels = num_channels
self.image_size = image_size
self.is_training = is_training
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.attention_dropout = attention_dropout
self.initializer_range = initializer_range
self.scope = scope
self.seq_length = (self.image_size // self.patch_size) ** 2
def get_config(self):
return VideoLlama3VisionConfig(
patch_size=self.patch_size,
num_channels=self.num_channels,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
attention_dropout=self.attention_dropout,
initializer_range=self.initializer_range,
)
def prepare_config_and_inputs(self):
config = self.get_config()
patch_size = config.patch_size
pixel_values = floats_tensor(
[
self.batch_size * (self.image_size**2) // (patch_size**2),
self.num_channels * (patch_size**2),
]
)
return config, pixel_values
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values = config_and_inputs
num_patches = self.image_size // config.patch_size
inputs_dict = {
"pixel_values": pixel_values,
"grid_thw": torch.tensor([[1, num_patches, num_patches]] * self.batch_size, device=torch_device),
"merge_sizes": torch.tensor([1] * self.batch_size, device=torch_device),
}
return config, inputs_dict
@require_torch
class VideoLlama3VisionModelTest(ModelTesterMixin, unittest.TestCase):
"""
Here we also overwrite some of the tests of test_modeling_common.py, as SIGLIP does not use input_ids, inputs_embeds,
attention_mask and seq_length.
"""
all_model_classes = (VideoLlama3VisionModel,) if is_torch_available() else ()
additional_model_inputs = ["grid_thw", "merge_sizes"]
test_resize_embeddings = False
test_cpu_offload = False
test_disk_offload_safetensors = False
test_disk_offload_bin = False
def setUp(self):
self.model_tester = VideoLlama3VisionModelTester(self)
self.config_tester = ConfigTester(self, config_class=VideoLlama3VisionConfig, has_text_modality=False)
def test_config(self):
self.config_tester.run_common_tests()
def test_model_get_set_embeddings(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
x = model.get_output_embeddings()
self.assertTrue(x is None or isinstance(x, nn.Linear))
@parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
def test_eager_matches_sdpa_inference(
self, name, dtype, padding_side, use_attention_mask, output_attentions, enable_kernels
):
if use_attention_mask:
self.skipTest(reason="VideoLlama3VisionModel does not use attention mask")
_test_encoder_eager_matches_sdpa_inference(self, dtype, output_attentions, enable_kernels)
def test_attention_outputs(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.return_dict = True
# force eager attention to support output attentions
config._attn_implementation = "eager"
seq_len = getattr(self.model_tester, "seq_length", None)
encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
for model_class in self.all_model_classes:
inputs_dict["output_attentions"] = True
inputs_dict["output_hidden_states"] = False
config.return_dict = True
model = model_class._from_config(config, attn_implementation="eager")
config = model.config
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
attentions = outputs.attentions
self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
# check that output_attentions also work using config
del inputs_dict["output_attentions"]
config.output_attentions = True
for k in config.sub_configs:
getattr(config, k).output_attentions = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
attentions = outputs.attentions
self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(attentions[0][0].shape[-3:]),
[self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
)
out_len = len(outputs)
# Check attention is always last and order is fine
inputs_dict["output_attentions"] = True
inputs_dict["output_hidden_states"] = True
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
self.assertEqual(out_len + 1, len(outputs))
self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
self.assertListEqual(
list(self_attentions[0][0].shape[-3:]),
[self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
)
def test_hidden_states_output(self):
def check_hidden_states_output(inputs_dict, config, model_class):
model = model_class(copy.deepcopy(config))
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
expected_num_layers = getattr(
self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
)
self.assertEqual(len(hidden_states), expected_num_layers)
seq_length = torch.sum(inputs_dict["grid_thw"].prod(dim=1) // (inputs_dict["merge_sizes"] ** 2))
self.assertListEqual(
list(hidden_states[0].shape),
[seq_length, self.model_tester.hidden_size],
)
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
inputs_dict["output_hidden_states"] = True
check_hidden_states_output(inputs_dict, config, model_class)
# check that output_hidden_states also work using config
del inputs_dict["output_hidden_states"]
config.output_hidden_states = True
for k in config.sub_configs:
getattr(config, k).output_hidden_states = True
check_hidden_states_output(inputs_dict, config, model_class)
def test_retain_grad_hidden_states_attentions(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for k in config.sub_configs:
getattr(config, k).output_hidden_states = True
config.output_hidden_states = True
config.output_attentions = self.has_attentions
for k in config.sub_configs:
getattr(config, k).output_attentions = self.has_attentions
# force eager attention to support output attentions
config._attn_implementation = "eager"
# no need to test all models as different heads yield the same functionality
model_class = self.all_model_classes[0]
model = model_class._from_config(config, attn_implementation="eager")
model.to(torch_device)
inputs = self._prepare_for_class(inputs_dict, model_class)
outputs = model(**inputs)
output = outputs[0]
# Encoder-/Decoder-only models
hidden_states = outputs.hidden_states[0]
hidden_states.retain_grad()
if self.has_attentions:
attentions = outputs.attentions[0][0]
attentions.retain_grad()
output.flatten()[0].backward(retain_graph=True)
self.assertIsNotNone(hidden_states.grad)
if self.has_attentions:
self.assertIsNotNone(attentions.grad)
@unittest.skip("Vision model requires additional positional inputs (grid_thw and merge_sizes)")
def test_flash_attn_2_inference_equivalence(self):
pass
@unittest.skip("Vision model requires additional positional inputs (grid_thw and merge_sizes)")
def test_flash_attn_2_inference_equivalence_right_padding(self):
pass
@unittest.skip("Vision model requires additional positional inputs (grid_thw and merge_sizes)")
def test_flash_attn_kernels_inference_equivalence(self):
pass
class VideoLlama3VisionText2TextModelTester:
def __init__(
self,
parent,
batch_size=3,
seq_length=7,
num_channels=3,
image_size=14,
is_training=True,
text_config={
"attention_dropout": 0.0,
"bos_token_id": 0,
"eos_token_id": 1,
"pad_token_id": 2,
"hidden_act": "silu",
"hidden_size": 32,
"intermediate_size": 37,
"max_position_embeddings": 512,
"max_window_layers": 3,
"model_type": "qwen2",
"num_attention_heads": 4,
"num_hidden_layers": 2,
"num_key_value_heads": 2,
"rms_norm_eps": 1e-06,
"rope_scaling": None,
"rope_theta": 1000000.0,
"sliding_window": None,
"tie_word_embeddings": True,
"vocab_size": 99,
},
vision_config={
"attention_dropout": 0.0,
"hidden_act": "gelu_pytorch_tanh",
"hidden_size": 32,
"intermediate_size": 64,
"layer_norm_eps": 1e-06,
"model_type": "video_llama_3_vision",
"num_attention_heads": 4,
"num_channels": 3,
"num_hidden_layers": 2,
"patch_size": 14,
},
use_token_compression=True,
image_token_id=3,
video_token_id=4,
):
self.parent = parent
self.hidden_size = text_config["hidden_size"]
self.num_hidden_layers = text_config["num_hidden_layers"]
self.num_attention_heads = text_config["num_attention_heads"]
self.patch_size = vision_config["patch_size"]
self.batch_size = batch_size
self.seq_length = seq_length
self.num_channels = num_channels
self.image_size = image_size
self.is_training = is_training
self.text_config = text_config
self.vision_config = vision_config
self.use_token_compression = use_token_compression
self.image_token_id = image_token_id
self.video_token_id = video_token_id
self.num_image_tokens = 32
self.seq_length = seq_length + self.num_image_tokens
def get_config(self):
return VideoLlama3Config(
text_config=self.text_config,
vision_config=self.vision_config,
use_token_compression=self.use_token_compression,
image_token_id=self.image_token_id,
video_token_id=self.video_token_id,
)
def prepare_config_and_inputs(self):
config = self.get_config()
patch_size = config.vision_config.patch_size
pixel_values = floats_tensor(
[
self.batch_size * (self.image_size**2) // (patch_size**2),
self.num_channels * (patch_size**2),
]
)
return config, pixel_values
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values = config_and_inputs
input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size)
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
input_ids[:, -1] = config.text_config.pad_token_id
attention_mask[:, -1] = 0
input_ids[input_ids == self.video_token_id] = config.text_config.pad_token_id
input_ids[input_ids == self.image_token_id] = config.text_config.pad_token_id
input_ids[:, self.num_image_tokens] = self.image_token_id
inputs_dict = {
"pixel_values": pixel_values,
"image_grid_thw": torch.tensor([[1, 1, 1]] * self.batch_size, device=torch_device),
"image_merge_sizes": torch.tensor([1] * self.batch_size, device=torch_device),
"input_ids": input_ids,
"attention_mask": attention_mask,
}
return config, inputs_dict
@require_torch
class VideoLlama3ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
"""
Model tester for `VideoLlama3ForConditionalGeneration`.
"""
all_model_classes = (
(
VideoLlama3Model,
VideoLlama3ForConditionalGeneration,
)
if is_torch_available()
else ()
)
pipeline_model_mapping = {"image-text-to-text": VideoLlama3ForConditionalGeneration}
_is_composite = True
def setUp(self):
self.model_tester = VideoLlama3VisionText2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=VideoLlama3Config, has_text_modality=False)
def test_config(self):
self.config_tester.run_common_tests()
def test_mismatching_num_image_tokens(self):
"""
Tests that VLMs through an error with explicit message saying what is wrong
when number of images don't match number of image tokens in the text.
Also we need to test multi-image cases when one prompt has multiple image tokens.
"""
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config).to(torch_device)
curr_input_dict = copy.deepcopy(input_dict)
_ = model(**curr_input_dict) # successfull forward with no modifications
# remove one image but leave the image token in text
patch_size = config.vision_config.patch_size
one_img_length = (self.model_tester.image_size**2) // (patch_size**2)
curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-one_img_length:, ...]
curr_input_dict["image_grid_thw"] = curr_input_dict["image_grid_thw"][-1:, ...]
curr_input_dict["image_merge_sizes"] = curr_input_dict["image_merge_sizes"][-1:, ...]
with self.assertRaisesRegex(ValueError, "Image features and image tokens do not match"):
_ = model(**curr_input_dict)
# simulate multi-image case by concatenating inputs where each has exactly one image/image-token
input_ids = curr_input_dict["input_ids"][:1]
pixel_values = curr_input_dict["pixel_values"][:one_img_length]
image_grid_thw = curr_input_dict["image_grid_thw"][:1]
image_merge_sizes = curr_input_dict["image_merge_sizes"][:1]
input_ids = torch.cat([input_ids, input_ids], dim=0)
# one image and two image tokens raise an error
with self.assertRaisesRegex(ValueError, "Image features and image tokens do not match"):
_ = model(
input_ids=input_ids,
pixel_values=pixel_values,
image_grid_thw=image_grid_thw,
image_merge_sizes=image_merge_sizes,
)
# two images and two image tokens don't raise an error
pixel_values = torch.cat([pixel_values, pixel_values], dim=0)
image_grid_thw = torch.cat([image_grid_thw, image_grid_thw], dim=0)
image_merge_sizes = torch.cat([image_merge_sizes, image_merge_sizes], dim=0)
_ = model(
input_ids=input_ids,
pixel_values=pixel_values,
image_grid_thw=image_grid_thw,
image_merge_sizes=image_merge_sizes,
)
def attention_mask_padding_matches_padding_free_with_position_ids(
self, attn_implementation: str, fa_kwargs: bool = False
):
max_new_tokens = 30
for model_class in self.all_generative_model_classes:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
dummy_input = inputs_dict[model_class.main_input_name]
if dummy_input.dtype in [torch.float32, torch.float16]:
dummy_input = dummy_input.to(torch.bfloat16)
# make sure that all models have enough positions for generation
if hasattr(config, "max_position_embeddings"):
config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
if 0 in inputs_dict["attention_mask"][:, -1]:
inputs_dict["attention_mask"] = inputs_dict["attention_mask"].flip(1)
dummy_attention_mask = inputs_dict["attention_mask"]
inputs_dict["input_ids"][~dummy_attention_mask.bool()] = config.get_text_config().pad_token_id
model = (
model_class.from_pretrained(
tmpdirname,
dtype=torch.bfloat16,
attn_implementation=attn_implementation,
)
.to(torch_device)
.eval()
)
# flatten
padfree_positions = torch.cat(
[torch.arange(length) for length in dummy_attention_mask.sum(1).tolist()]
)
padfree_positions = padfree_positions.long().unsqueeze(0).to(torch_device)
padfree_inputs_dict = {
"pixel_values": inputs_dict["pixel_values"],
"image_grid_thw": inputs_dict["image_grid_thw"],
"image_merge_sizes": inputs_dict["image_merge_sizes"],
"input_ids": inputs_dict["input_ids"][dummy_attention_mask.bool()].unsqueeze(0),
"position_ids": padfree_positions,
}
if fa_kwargs:
cu_seq_lens = [0] + dummy_attention_mask.sum(1).tolist()
cu_seq_lens = torch.tensor(cu_seq_lens, device=torch_device)
max_length = cu_seq_lens.diff().max().item()
padfree_inputs_dict.update(
{
"cu_seq_lens_q": cu_seq_lens.cumsum(-1).to(dtype=torch.int32),
"cu_seq_lens_k": cu_seq_lens.cumsum(-1).to(dtype=torch.int32),
"max_length_q": max_length,
"max_length_k": max_length,
}
)
# We need to do simple forward without cache in roder to trigger packed SDPA/FLEX/EAGER path
res_padded = model(**inputs_dict, use_cache=False)
res_padfree = model(**padfree_inputs_dict, use_cache=False)
logits_padded = res_padded.logits[inputs_dict["attention_mask"].bool()]
logits_padfree = res_padfree.logits[0]
# acceptable numerical instability
tol = torch.finfo(torch.bfloat16).eps
torch.testing.assert_close(logits_padded, logits_padfree, rtol=tol, atol=tol)
@require_torch
@slow
class VideoLlama3IntegrationTest(unittest.TestCase):
def setUp(self):
self.processor = AutoProcessor.from_pretrained("lkhl/VideoLLaMA3-2B-Image-HF")
self.messages = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "Describe the image."},
],
}
]
url = "https://raw.githubusercontent.com/DAMO-NLP-SG/VideoLLaMA3/main/assets/sora.png"
self.image = Image.open(requests.get(url, stream=True).raw)
def tearDown(self):
gc.collect()
backend_empty_cache(torch_device)
def test_small_model_integration_test(self):
model = VideoLlama3ForConditionalGeneration.from_pretrained(
"lkhl/VideoLLaMA3-2B-Image-HF", dtype=torch.bfloat16, device_map=torch_device
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(text=[text], images=[self.image], return_tensors="pt").to(torch_device)
expected_input_ids = [151644, 872, 198] + [151655] * 10549 + [198, 74785, 279, 2168, 13, 151645, 198, 151644, 77091, 198] # fmt: skip
self.assertEqual(expected_input_ids, inputs.input_ids[0].tolist())
expected_pixel_slice = torch.tensor(
[
[-0.8588, -0.9216, -0.9608],
[-0.9922, -0.9922, -0.9922],
[-0.9686, -0.9686, -0.9294],
[-0.9294, -0.9765, -0.9765],
[-0.9922, -0.9922, -0.9843],
[-0.6000, -0.4118, -0.3647],
],
dtype=torch.float32,
device=torch_device,
)
torch.testing.assert_close(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=1e-4, rtol=1e-4)
output = model.generate(**inputs, max_new_tokens=20, do_sample=False, repetition_penalty=None)
# fmt: off
EXPECTED_DECODED_TEXT = Expectations(
{
("cuda", None): "user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress",
("xpu", None): "user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress",
}
).get_expectation()
# fmt: on
self.assertEqual(
self.processor.decode(output[0], skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)
def test_small_model_integration_test_batch(self):
model = VideoLlama3ForConditionalGeneration.from_pretrained(
"lkhl/VideoLLaMA3-2B-Image-HF", dtype=torch.bfloat16, device_map=torch_device
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(text=[text, text], images=[self.image, self.image], return_tensors="pt").to(
torch_device
)
# it should not matter whether two images are the same size or not
output = model.generate(**inputs, max_new_tokens=20, do_sample=False, repetition_penalty=None)
EXPECTED_DECODED_TEXT = [
"user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress",
"user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress",
] # fmt: skip
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)
def test_small_model_integration_test_batch_wo_image(self):
model = VideoLlama3ForConditionalGeneration.from_pretrained(
"lkhl/VideoLLaMA3-2B-Image-HF", dtype=torch.bfloat16, device_map=torch_device
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
messages2 = [
{"role": "user", "content": [{"type": "text", "text": "What is relativity?"}]},
]
text2 = self.processor.apply_chat_template(messages2, tokenize=False, add_generation_prompt=True)
inputs = self.processor(
text=[text, text2], images=[self.image], padding=True, padding_side="left", return_tensors="pt"
).to(torch_device)
# it should not matter whether two images are the same size or not
output = model.generate(**inputs, max_new_tokens=20, do_sample=False, repetition_penalty=None)
# fmt: off
EXPECTED_DECODED_TEXT = Expectations(
{
("cuda", None): [
"user\n\nDescribe the image.\nassistant\nThe image captures a vibrant night scene in a bustling Japanese city. A woman in a striking red dress",
"user\nWhat is relativity?\nassistant\nRelativity is a scientific theory that describes the relationship between space and time. It was first proposed by",
],
("xpu", None): [
"user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress",
"user\nWhat is relativity?\nassistant\nRelativity is a scientific theory that describes the relationship between space and time. It was first proposed by",
],
}
).get_expectation()
# fmt: on
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)
def test_small_model_integration_test_batch_different_resolutions(self):
model = VideoLlama3ForConditionalGeneration.from_pretrained(
"lkhl/VideoLLaMA3-2B-Image-HF", dtype=torch.bfloat16, device_map=torch_device
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
text2 = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
image2 = self.image.resize((224, 224))
inputs = self.processor(
text=[text, text2], images=[self.image, image2], padding=True, padding_side="left", return_tensors="pt"
).to(torch_device)
# it should not matter whether two images are the same size or not
output = model.generate(**inputs, max_new_tokens=20, do_sample=False, repetition_penalty=None)
DECODED_TEXT = self.processor.batch_decode(output, skip_special_tokens=True)
# fmt: off
EXPECTED_DECODED_TEXT = Expectations(
{
("cuda", None): [
"user\n\nDescribe the image.\nassistant\nThe image captures a vibrant night scene in a bustling Japanese city. A woman in a striking red dress",
"user\n\nDescribe the image.\nassistant\nThe image depicts a striking urban scene at night. A person is standing in the center of a wet",
],
("xpu", None): [
"user\n\nDescribe the image.\nassistant\nThe image captures a vibrant night scene in a bustling Japanese city. A woman in a striking red dress",
"user\n\nDescribe the image.\nassistant\nThe image depicts a striking urban scene at night. A person is standing in the center of a wet",
],
}
).get_expectation()
# fmt: on
self.assertEqual(DECODED_TEXT, EXPECTED_DECODED_TEXT)
@require_flash_attn
@require_torch_accelerator
@pytest.mark.flash_attn_test
def test_small_model_integration_test_batch_flashatt2(self):
model = VideoLlama3ForConditionalGeneration.from_pretrained(
"lkhl/VideoLLaMA3-2B-Image-HF",
dtype=torch.bfloat16,
attn_implementation="flash_attention_2",
device_map=torch_device,
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(text=[text, text], images=[self.image, self.image], return_tensors="pt").to(
torch_device
)
# it should not matter whether two images are the same size or not
output = model.generate(**inputs, max_new_tokens=20, do_sample=False, repetition_penalty=None)
# fmt: off
EXPECTED_DECODED_TEXTS = Expectations(
{
(None, None): ['user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress',
'user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress',
],
("xpu", 3): ['user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress',
'user\n\nDescribe the image.\nassistant\nThe image depicts a vibrant nighttime scene on a bustling city street. A woman in a striking red dress',
],
}
)
# fmt: on
EXPECTED_DECODED_TEXT = EXPECTED_DECODED_TEXTS.get_expectation()
DECODED_TEXT = self.processor.batch_decode(output, skip_special_tokens=True)
self.assertEqual(DECODED_TEXT, EXPECTED_DECODED_TEXT)
@require_flash_attn
@require_torch_accelerator
@pytest.mark.flash_attn_test
def test_small_model_integration_test_batch_wo_image_flashatt2(self):
model = VideoLlama3ForConditionalGeneration.from_pretrained(
"lkhl/VideoLLaMA3-2B-Image-HF",
dtype=torch.bfloat16,
attn_implementation="flash_attention_2",
device_map=torch_device,
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
messages2 = [
{"role": "user", "content": [{"type": "text", "text": "What is relativity?"}]},
]
text2 = self.processor.apply_chat_template(messages2, tokenize=False, add_generation_prompt=True)
inputs = self.processor(
text=[text, text2], images=[self.image], padding=True, padding_side="left", return_tensors="pt"
).to(torch_device)
# it should not matter whether two images are the same size or not
output = model.generate(**inputs, max_new_tokens=20, do_sample=False, repetition_penalty=None)
EXPECTED_DECODED_TEXT = [
'user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress',
'user\nWhat is relativity?\nassistant\nRelativity is a scientific theory that describes the relationship between space and time. It was first proposed by'
] # fmt: skip
self.assertEqual(
self.processor.batch_decode(output, skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)

View File

@@ -0,0 +1,339 @@
# Copyright 2025 the HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import unittest
import numpy as np
from PIL import Image
from transformers.testing_utils import require_av, require_torch, require_torchvision, require_vision
from transformers.utils import is_torch_available, is_vision_available
from ...test_processing_common import ProcessorTesterMixin, url_to_local_path
if is_vision_available():
from transformers import VideoLlama3Processor
if is_torch_available():
import torch
def prepare_image_inputs():
"""This function prepares a list of PIL images"""
image_inputs = [np.random.randint(255, size=(3, 15, 50), dtype=np.uint8)]
image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs]
return image_inputs
@require_vision
@require_torch
@require_torchvision
class VideoLlama3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_class = VideoLlama3Processor
model_id = "lkhl/VideoLLaMA3-2B-Image-HF"
@classmethod
def _setup_from_pretrained(cls, model_id, **kwargs):
return super()._setup_from_pretrained(model_id, patch_size=4, max_pixels=56 * 56, min_pixels=28 * 28, **kwargs)
@classmethod
def _setup_test_attributes(cls, processor):
cls.image_token = processor.image_token
def prepare_image_inputs(self, batch_size: int | None = None):
"""This function prepares a list of PIL images for testing"""
if batch_size is None:
return prepare_image_inputs()[0]
if batch_size < 1:
raise ValueError("batch_size must be greater than 0")
return prepare_image_inputs() * batch_size
# Copied from tests.models.llava.test_processing_llava.LlavaProcessorTest.test_get_num_vision_tokens
def test_get_num_vision_tokens(self):
"Tests general functionality of the helper used internally in vLLM"
processor = self.get_processor()
output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
self.assertTrue("num_image_tokens" in output)
self.assertEqual(len(output["num_image_tokens"]), 3)
self.assertTrue("num_image_patches" in output)
self.assertEqual(len(output["num_image_patches"]), 3)
@require_torch
@require_av
def _test_apply_chat_template(
self,
modality: str,
batch_size: int,
return_tensors: str,
input_name: str,
processor_name: str,
input_data: list[str],
):
processor = self.get_processor()
if processor.chat_template is None:
self.skipTest("Processor has no chat template")
if processor_name not in self.processor_class.get_attributes():
self.skipTest(f"{processor_name} attribute not present in {self.processor_class}")
batch_messages = [
[
{
"role": "user",
"content": [{"type": "text", "text": "Describe this."}],
},
]
] * batch_size
# Test that jinja can be applied
formatted_prompt = processor.apply_chat_template(batch_messages, add_generation_prompt=True, tokenize=False)
self.assertEqual(len(formatted_prompt), batch_size)
# Test that tokenizing with template and directly with `self.tokenizer` gives same output
formatted_prompt_tokenized = processor.apply_chat_template(
batch_messages, add_generation_prompt=True, tokenize=True, return_tensors=return_tensors
)
add_special_tokens = True
if processor.tokenizer.bos_token is not None and formatted_prompt[0].startswith(processor.tokenizer.bos_token):
add_special_tokens = False
tok_output = processor.tokenizer(
formatted_prompt, return_tensors=return_tensors, add_special_tokens=add_special_tokens
)
expected_output = tok_output.input_ids
self.assertListEqual(expected_output.tolist(), formatted_prompt_tokenized.tolist())
# Test that kwargs passed to processor's `__call__` are actually used
tokenized_prompt_100 = processor.apply_chat_template(
batch_messages,
add_generation_prompt=True,
tokenize=True,
padding="max_length",
truncation=True,
return_tensors=return_tensors,
max_length=100,
)
self.assertEqual(len(tokenized_prompt_100[0]), 100)
# Test that `return_dict=True` returns text related inputs in the dict
out_dict_text = processor.apply_chat_template(
batch_messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors=return_tensors,
)
self.assertTrue(all(key in out_dict_text for key in ["input_ids", "attention_mask"]))
self.assertEqual(len(out_dict_text["input_ids"]), batch_size)
self.assertEqual(len(out_dict_text["attention_mask"]), batch_size)
# Test that with modality URLs and `return_dict=True`, we get modality inputs in the dict
for idx, url in enumerate(input_data[:batch_size]):
batch_messages[idx][0]["content"] = [batch_messages[idx][0]["content"][0], {"type": modality, "url": url}]
out_dict = processor.apply_chat_template(
batch_messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors=return_tensors,
num_frames=2, # by default no more than 2 frames, otherwise too slow
)
input_name = getattr(self, input_name)
self.assertTrue(input_name in out_dict)
self.assertEqual(len(out_dict["input_ids"]), batch_size)
self.assertEqual(len(out_dict["attention_mask"]), batch_size)
if modality == "video":
# qwen pixels don't scale with bs same way as other models, calculate expected video token count based on video_grid_thw
expected_video_token_count = 0
for thw in out_dict["video_grid_thw"]:
expected_video_token_count += thw[0] * thw[1] * thw[2]
mm_len = expected_video_token_count
else:
mm_len = batch_size * 192
self.assertEqual(len(out_dict[input_name]), mm_len)
return_tensor_to_type = {"pt": torch.Tensor, "np": np.ndarray, None: list}
for k in out_dict:
self.assertIsInstance(out_dict[k], return_tensor_to_type[return_tensors])
@require_av
def test_apply_chat_template_video_frame_sampling(self):
processor = self.get_processor()
if processor.chat_template is None:
self.skipTest("Processor has no chat template")
signature = inspect.signature(processor.__call__)
if "videos" not in {*signature.parameters.keys()} or (
signature.parameters.get("videos") is not None
and signature.parameters["videos"].annotation == inspect._empty
):
self.skipTest("Processor doesn't accept videos at input")
messages = [
[
{
"role": "user",
"content": [
{
"type": "video",
"url": url_to_local_path(
"https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/Big_Buck_Bunny_720_10s_10MB.mp4"
),
},
{"type": "text", "text": "What is shown in this video?"},
],
},
]
]
formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
self.assertEqual(len(formatted_prompt), 1)
num_frames = 3
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
num_frames=num_frames,
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 180)
# Load with `fps` arg
fps = 1
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
fps=fps,
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 80)
# Load with `fps` and `num_frames` args, should raise an error
with self.assertRaises(ValueError):
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
fps=fps,
num_frames=num_frames,
)
# Load without any arg should load the whole video
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1200)
# Load video as a list of frames (i.e. images). NOTE: each frame should have same size
# because we assume they come from one video
messages[0][0]["content"][0] = {
"type": "video",
"url": [
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg",
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg",
],
}
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
)
self.assertTrue(self.videos_input_name in out_dict_with_video)
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 192)
# When the inputs are frame URLs/paths we expect that those are already
# sampled and will raise an error is asked to sample again.
with self.assertRaisesRegex(
ValueError, "Sampling frames from a list of images is not supported! Set `do_sample_frames=False`"
):
out_dict_with_video = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
do_sample_frames=True,
)
def test_kwargs_overrides_custom_image_processor_kwargs(self):
processor = self.get_processor()
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs()
image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input, return_tensors="pt")
self.assertEqual(inputs[self.images_input_name].shape[0], 52)
inputs = processor(text=input_str, images=image_input, max_pixels=56 * 56 * 4, return_tensors="pt")
self.assertEqual(inputs[self.images_input_name].shape[0], 52)
def test_special_mm_token_truncation(self):
"""Tests that special vision tokens do not get truncated when `truncation=True` is set."""
processor = self.get_processor()
input_str = self.prepare_text_inputs(batch_size=2, modalities="image")
image_input = self.prepare_image_inputs(batch_size=2)
_ = processor(
text=input_str,
images=image_input,
return_tensors="pt",
truncation=None,
padding=True,
)
with self.assertRaises(ValueError):
_ = processor(
text=input_str,
images=image_input,
return_tensors="pt",
truncation=True,
padding=True,
max_length=20,
)
def test_video_processor_defaults(self):
# Video processor has default `return_metadata=True` which doesn't match with processor
video_processor = self.get_component("video_processor")
# Get all required components for processor
components = {}
for attribute in self.processor_class.get_attributes():
components[attribute] = self.get_component(attribute)
processor = self.processor_class(**components)
video_input = self.prepare_video_inputs()
# Process with both video_processor and processor
input_video_proc = video_processor(video_input, return_tensors="pt", return_metadata=True)
input_processor = processor(videos=video_input, return_tensors="pt", return_metadata=True)
# Verify outputs match
for key in input_video_proc:
# processor changes metadata fps in-place when can't be inferred, i.e. if already decoded video
if key != "video_metadata":
torch.testing.assert_close(input_video_proc[key], input_processor[key])

View File

@@ -0,0 +1,368 @@
# Copyright 2025 the HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import unittest
import numpy as np
from transformers.image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_video_processing_common import VideoProcessingTestMixin, prepare_video_inputs
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers.image_utils import get_image_size
from transformers.models.video_llama_3.video_processing_video_llama_3 import smart_resize
if is_torchvision_available():
from transformers import VideoLlama3VideoProcessor
class VideoLlama3VideoProcessingTester:
def __init__(
self,
parent,
batch_size=5,
num_frames=8,
num_channels=3,
min_resolution=30,
max_resolution=80,
do_resize=True,
size=None,
do_center_crop=True,
crop_size=None,
do_normalize=True,
image_mean=IMAGENET_STANDARD_MEAN,
image_std=IMAGENET_STANDARD_STD,
do_convert_rgb=True,
temporal_patch_size=2,
patch_size=14,
min_pixels=20 * 20,
max_pixels=100 * 100 * 8,
merge_size=2,
):
size = size if size is not None else {"shortest_edge": 400, "longest_edge": 80000}
crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
self.parent = parent
self.batch_size = batch_size
self.num_frames = num_frames
self.num_channels = num_channels
self.min_resolution = min_resolution
self.max_resolution = max_resolution
self.do_resize = do_resize
self.size = size
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.do_normalize = do_normalize
self.image_mean = image_mean
self.image_std = image_std
self.do_convert_rgb = do_convert_rgb
self.temporal_patch_size = temporal_patch_size
self.patch_size = patch_size
self.min_pixels = min_pixels
self.max_pixels = max_pixels
self.merge_size = merge_size
def prepare_video_processor_dict(self):
return {
"do_resize": self.do_resize,
"do_center_crop": self.do_center_crop,
"crop_size": self.crop_size,
"do_normalize": self.do_normalize,
"image_mean": self.image_mean,
"image_std": self.image_std,
"do_convert_rgb": self.do_convert_rgb,
"temporal_patch_size": self.temporal_patch_size,
"patch_size": self.patch_size,
"min_pixels": self.min_pixels,
"max_pixels": self.max_pixels,
"merge_size": self.merge_size,
}
@require_vision
def expected_output_video_shape(self, videos, num_frames=None):
num_frames = num_frames if num_frames is not None else self.num_frames
grid_t = num_frames // self.temporal_patch_size
hidden_dim = self.num_channels * self.temporal_patch_size * self.patch_size * self.patch_size
seq_len = 0
for video in videos:
if isinstance(video[0], Image.Image):
video = np.stack([np.array(frame) for frame in video])
height, width = get_image_size(video)
resized_height, resized_width = smart_resize(
height,
width,
factor=self.patch_size * self.merge_size,
min_pixels=self.min_pixels,
max_pixels=self.max_pixels,
)
grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
seq_len += grid_t * grid_h * grid_w
return [seq_len, hidden_dim]
def prepare_video_inputs(self, equal_resolution=False, return_tensors="pil"):
videos = prepare_video_inputs(
batch_size=self.batch_size,
num_frames=self.num_frames,
num_channels=self.num_channels,
min_resolution=self.min_resolution,
max_resolution=self.max_resolution,
equal_resolution=equal_resolution,
return_tensors=return_tensors,
)
return videos
@require_torch
@require_vision
class VideoLlama3VideoProcessingTest(VideoProcessingTestMixin, unittest.TestCase):
fast_video_processing_class = VideoLlama3VideoProcessor if is_torchvision_available() else None
def setUp(self):
super().setUp()
self.video_processor_tester = VideoLlama3VideoProcessingTester(self)
@property
def video_processor_dict(self):
return self.video_processor_tester.prepare_video_processor_dict()
def test_video_processor_properties(self):
video_processing = self.fast_video_processing_class(**self.video_processor_dict)
self.assertTrue(hasattr(video_processing, "do_resize"))
self.assertTrue(hasattr(video_processing, "size"))
self.assertTrue(hasattr(video_processing, "do_center_crop"))
self.assertTrue(hasattr(video_processing, "center_crop"))
self.assertTrue(hasattr(video_processing, "do_normalize"))
self.assertTrue(hasattr(video_processing, "image_mean"))
self.assertTrue(hasattr(video_processing, "image_std"))
self.assertTrue(hasattr(video_processing, "do_convert_rgb"))
def test_video_processor_from_dict_with_kwargs(self):
video_processor = self.fast_video_processing_class.from_dict(self.video_processor_dict)
self.assertEqual(video_processor.size, {"shortest_edge": 400, "longest_edge": 80000})
self.assertEqual(video_processor.crop_size, {"height": 18, "width": 18})
video_processor = self.fast_video_processing_class.from_dict(
self.video_processor_dict, size={"shortest_edge": 100, "longest_edge": 200}
)
# min_pixels and max_pixels take precedence over size, like in the image processor.
self.assertEqual(video_processor.size, {"shortest_edge": 400, "longest_edge": 80000})
processor_dict = self.video_processor_dict.copy()
processor_dict.pop("min_pixels")
processor_dict.pop("max_pixels")
video_processor = self.fast_video_processing_class.from_dict(
processor_dict, size={"shortest_edge": 100, "longest_edge": 200}
)
self.assertEqual(video_processor.size, {"shortest_edge": 100, "longest_edge": 200})
def test_video_processor_to_json_string(self):
for video_processing_class in self.video_processor_list:
video_processor = video_processing_class(**self.video_processor_dict)
obj = json.loads(video_processor.to_json_string())
for key, value in self.video_processor_dict.items():
if key not in ["min_pixels", "max_pixels"]:
self.assertEqual(obj[key], value)
def test_call_pil(self):
for video_processing_class in self.video_processor_list:
# Initialize video_processing
video_processing = video_processing_class(**self.video_processor_dict)
video_inputs = self.video_processor_tester.prepare_video_inputs(
equal_resolution=False, return_tensors="pil"
)
# Each video is a list of PIL Images
for video in video_inputs:
self.assertIsInstance(video[0], Image.Image)
# Test not batched input
encoded_videos = video_processing(video_inputs[0], return_tensors="pt")[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
# Test batched
encoded_videos = video_processing(video_inputs, return_tensors="pt")[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
def test_call_numpy(self):
for video_processing_class in self.video_processor_list:
# Initialize video_processing
video_processing = video_processing_class(**self.video_processor_dict)
# create random numpy tensors
video_inputs = self.video_processor_tester.prepare_video_inputs(
equal_resolution=False, return_tensors="np"
)
for video in video_inputs:
self.assertIsInstance(video, np.ndarray)
# Test not batched input
encoded_videos = video_processing(video_inputs[0], return_tensors="pt")[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
# Test batched
encoded_videos = video_processing(video_inputs, return_tensors="pt")[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
def test_call_pytorch(self):
for video_processing_class in self.video_processor_list:
# Initialize video_processing
video_processing = video_processing_class(**self.video_processor_dict)
# create random PyTorch tensors
video_inputs = self.video_processor_tester.prepare_video_inputs(
equal_resolution=False, return_tensors="torch"
)
for video in video_inputs:
self.assertIsInstance(video, torch.Tensor)
# Test not batched input
encoded_videos = video_processing(video_inputs[0], return_tensors="pt")[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
# Test batched
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
encoded_videos = video_processing(video_inputs, return_tensors="pt")[self.input_name]
self.assertEqual(
list(encoded_videos.shape),
expected_output_video_shape,
)
def test_nested_input(self):
"""Tests that the processor can work with nested list where each video is a list of arrays"""
for video_processing_class in self.video_processor_list:
video_processing = video_processing_class(**self.video_processor_dict)
video_inputs = self.video_processor_tester.prepare_video_inputs(
equal_resolution=False, return_tensors="np"
)
# Test not batched input
video_inputs_nested = [list(video) for video in video_inputs]
encoded_videos = video_processing(video_inputs_nested[0], return_tensors="pt")[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
# Test batched
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
encoded_videos = video_processing(video_inputs_nested, return_tensors="pt")[self.input_name]
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
@unittest.skip("Skip for now, the test needs adjustment fo Qwen2VL")
def test_call_numpy_4_channels(self):
for video_processing_class in self.video_processor_list:
# Test that can process videos which have an arbitrary number of channels
# Initialize video_processing
video_processor = video_processing_class(**self.video_processor_dict)
# create random numpy tensors
self.video_processor_tester.num_channels = 4
video_inputs = self.video_processor_tester.prepare_video_inputs(
equal_resolution=False, return_tensors="np"
)
# Test not batched input
encoded_videos = video_processor(
video_inputs[0],
return_tensors="pt",
input_data_format="channels_last",
image_mean=0,
image_std=1,
)[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
# Test batched
encoded_videos = video_processor(
video_inputs,
return_tensors="pt",
input_data_format="channels_last",
image_mean=0,
image_std=1,
)[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(video_inputs)
self.assertEqual(list(encoded_videos.shape), expected_output_video_shape)
def test_call_sample_frames(self):
for video_processing_class in self.video_processor_list:
video_processing = video_processing_class(**self.video_processor_dict)
prev_num_frames = self.video_processor_tester.num_frames
self.video_processor_tester.num_frames = 8
video_inputs = self.video_processor_tester.prepare_video_inputs(
equal_resolution=False,
return_tensors="torch",
)
# Force set sampling to False. No sampling is expected even when `num_frames` exists
video_processing.do_sample_frames = False
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=3)[self.input_name]
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=3)[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape([video_inputs[0]])
expected_output_video_shape_batched = self.video_processor_tester.expected_output_video_shape(video_inputs)
self.assertListEqual(list(encoded_videos.shape), expected_output_video_shape)
self.assertListEqual(list(encoded_videos_batched.shape), expected_output_video_shape_batched)
# Set sampling to True. Video frames should be sampled with `num_frames` in the output
video_processing.do_sample_frames = True
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=4)[self.input_name]
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=4)[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(
[video_inputs[0]], num_frames=4
)
expected_output_video_shape_batched = self.video_processor_tester.expected_output_video_shape(
video_inputs, num_frames=4
)
self.assertListEqual(list(encoded_videos.shape), expected_output_video_shape)
self.assertListEqual(list(encoded_videos_batched.shape), expected_output_video_shape_batched)
metadata = [[{"duration": 2.0, "total_num_frames": 8, "fps": 4}]]
batched_metadata = metadata * len(video_inputs)
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", fps=3, video_metadata=metadata)[
self.input_name
]
encoded_videos_batched = video_processing(
video_inputs, return_tensors="pt", fps=3, video_metadata=batched_metadata
)[self.input_name]
expected_output_video_shape = self.video_processor_tester.expected_output_video_shape(
[video_inputs[0]], num_frames=6
)
expected_output_video_shape_batched = self.video_processor_tester.expected_output_video_shape(
video_inputs, num_frames=6
)
self.assertListEqual(list(encoded_videos.shape), expected_output_video_shape)
self.assertListEqual(list(encoded_videos_batched.shape), expected_output_video_shape_batched)
# We should raise error when asked to sample more frames than there are in input video
with self.assertRaises(ValueError):
encoded_videos = video_processing(video_inputs[0], return_tensors="pt", num_frames=10)[self.input_name]
encoded_videos_batched = video_processing(video_inputs, return_tensors="pt", num_frames=10)[
self.input_name
]
# Assign back the actual num frames in tester
self.video_processor_tester.num_frames = prev_num_frames