Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled
173 lines
7.3 KiB
Python
173 lines
7.3 KiB
Python
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import json
|
|
import unittest
|
|
|
|
import torch
|
|
|
|
from transformers.testing_utils import require_torch, require_vision
|
|
from transformers.utils import is_vision_available
|
|
|
|
from ...test_processing_common import ProcessorTesterMixin, url_to_local_path
|
|
|
|
|
|
if is_vision_available():
|
|
from transformers import LlavaOnevisionProcessor
|
|
|
|
|
|
@require_vision
|
|
@require_torch
|
|
class LlavaOnevisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|
processor_class = LlavaOnevisionProcessor
|
|
|
|
@classmethod
|
|
def _setup_tokenizer(cls):
    """Build the small tokenizer used by the processor under test.

    The tokenizer class is resolved through
    ``cls._get_component_class_from_processor("tokenizer")`` and instantiated
    with an eight-entry vocabulary whose ids follow the order listed below.

    Returns:
        The tokenizer instance, with ``<image>`` and ``<video>`` registered as
        additional special tokens and ``[PAD]`` assigned as pad token when the
        tokenizer did not already define one.
    """
    # The position of a token in this tuple is its id in the vocabulary (0..7).
    ordered_tokens = ("<unk>", "<s>", "</s>", "[PAD]", "<image>", "<video>", "Hello", "world")
    token_to_id = dict(zip(ordered_tokens, range(len(ordered_tokens))))

    tok = cls._get_component_class_from_processor("tokenizer")(
        vocab=token_to_id,
        add_bos_token=True,
        add_eos_token=False,
    )
    tok.add_special_tokens({"additional_special_tokens": ["<image>", "<video>"]})

    # Only fill in a pad token when the tokenizer class did not set one itself.
    if tok.pad_token is None:
        tok.pad_token = "[PAD]"
    return tok
|
|
|
|
@classmethod
|
|
def _setup_image_processor(cls):
    """Return a default-constructed image processor for the tests.

    The class comes from
    ``cls._get_component_class_from_processor("image_processor", use_fast=False)``
    and is instantiated without arguments.
    """
    return cls._get_component_class_from_processor("image_processor", use_fast=False)()
|
|
|
|
@classmethod
|
|
def _setup_test_attributes(cls, processor):
    """Mirror the processor's ``image_token`` and ``video_token`` onto the test class.

    Args:
        processor: Object exposing ``image_token`` and ``video_token`` attributes.
    """
    # Same read-then-assign order as two explicit assignments: image first, video second.
    for attribute in ("image_token", "video_token"):
        setattr(cls, attribute, getattr(processor, attribute))
|
|
|
|
@staticmethod
|
|
def prepare_processor_dict():
    """Return the processor configuration shared by the tests in this class.

    Returns:
        A dict with three entries, in this order:

        - ``"chat_template"``: the template string below. Per its embedded
          comments it renders a message's images first, then its videos, then
          its text; assistant text is wrapped in ``generation`` tags.
        - ``"num_image_tokens"``: ``6``.
        - ``"vision_feature_select_strategy"``: ``"default"``.
    """
    chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + ' '}}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>' }}{% endfor %}{# Render all video then #}{% for content in message['content'] | selectattr('type', 'equalto', 'video') %}{{ '<video>' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ '\n' + content['text'] }}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ '\n' + content['text'] }}{% endgeneration %}{% endfor %}{% endif %}{{'<|im_end|>'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"  # fmt: skip
    return dict(
        chat_template=chat_template,
        num_image_tokens=6,
        vision_feature_select_strategy="default",
    )
|
|
|
|
# Copied from tests.models.llava.test_processing_llava.LlavaProcessorTest.test_get_num_vision_tokens
|
|
def test_get_num_vision_tokens(self):
    "Tests general functionality of the helper used internally in vLLM"
    # NOTE(review): the marker above this method says it is copied from the Llava processor
    # test; presumably a repo consistency check compares the two bodies — confirm before
    # changing any code here rather than at the source.

    processor = self.get_processor()

    # Three image sizes are passed in, so each per-image list in the result must have 3 entries.
    output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
    self.assertTrue("num_image_tokens" in output)
    self.assertEqual(len(output["num_image_tokens"]), 3)

    self.assertTrue("num_image_patches" in output)
    self.assertEqual(len(output["num_image_patches"]), 3)
|
|
|
|
# Copied from tests.models.llava.test_processing_llava.LlavaProcessorTest.test_chat_template_is_saved
|
|
def test_chat_template_is_saved(self):
    # NOTE(review): the marker above this method says it is copied from the Llava processor
    # test; presumably a repo consistency check compares the two bodies — confirm before
    # changing any code here rather than at the source.

    # Reload the processor from the directory at self.tmpdirname and round-trip its JSON form.
    processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
    processor_dict_loaded = json.loads(processor_loaded.to_json_string())
    # chat templates aren't serialized to json in processors
    self.assertFalse("chat_template" in processor_dict_loaded)

    # they have to be saved as separate file and loaded back from that file
    # so we check if the same template is loaded
    processor_dict = self.prepare_processor_dict()
    # .get(..., None) means a config without a template expects the loaded template to be None.
    self.assertTrue(processor_loaded.chat_template == processor_dict.get("chat_template", None))
|
|
|
|
def test_image_token_filling(self):
    # Counts how many image-token ids end up in `input_ids` after the processor handles a
    # prompt with one image entry, and compares that count with a fixed expected value.
    processor = self.processor_class.from_pretrained(self.tmpdirname)

    # Pin every setting the test overrides so the result does not depend on saved defaults.
    processor.patch_size = 14
    processor.vision_feature_select_strategy = "default"
    image_processor = processor.image_processor
    image_processor.crop_size = {"height": 336, "width": 336}
    image_processor.size = {"shortest_edge": 336}
    image_processor.image_grid_pinpoints = [[672, 336]]
    patches_per_side = image_processor.size["shortest_edge"] // processor.patch_size
    processor.num_image_tokens = patches_per_side**2

    # Important to check with non square image
    image = torch.randint(0, 2, (3, 503, 316))
    image_token_id = processor.image_token_id

    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "What is shown in this image?"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation)
    inputs = processor(text=[prompt], images=[image], return_tensors="pt")

    # NOTE(review): 1525 is kept as given; presumably it follows from the 503x316 input and
    # the settings pinned above — confirm against the processor's token computation.
    expected_image_tokens = 1525
    is_image_token = inputs["input_ids"] == image_token_id
    self.assertEqual(expected_image_tokens, is_image_token.sum().item())
|
|
|
|
@require_torch
|
|
def test_apply_chat_template_video_frame_sampling(self):
|
|
processor = self.get_processor()
|
|
|
|
messages = [
|
|
[
|
|
{
|
|
"role": "user",
|
|
"content": [
|
|
{
|
|
"type": "video",
|
|
"url": url_to_local_path(
|
|
"https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4"
|
|
),
|
|
},
|
|
{"type": "text", "text": "What is shown in this video?"},
|
|
],
|
|
},
|
|
]
|
|
]
|
|
|
|
num_frames = 3
|
|
out_dict_with_video = processor.apply_chat_template(
|
|
messages,
|
|
add_generation_prompt=True,
|
|
tokenize=True,
|
|
return_dict=True,
|
|
num_frames=num_frames,
|
|
return_tensors="pt",
|
|
)
|
|
self.assertTrue(self.videos_input_name in out_dict_with_video)
|
|
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
|
|
self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), num_frames)
|
|
|
|
# Choose an fps high enough to avoid rounding down to zero sampled frames on short dummy videos
|
|
fps = 4
|
|
out_dict_with_video = processor.apply_chat_template(
|
|
messages,
|
|
add_generation_prompt=True,
|
|
tokenize=True,
|
|
return_dict=True,
|
|
fps=fps,
|
|
return_tensors="pt",
|
|
)
|
|
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
|