Files
transformers/tests/models/llama/test_tokenization_llama.py
陈赣 06f1fd69a6
Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled
first commit
2026-06-05 16:53:03 +08:00

75 lines
4.6 KiB
Python

import unittest
from tests.test_tokenization_common import TokenizerTesterMixin
from transformers import AutoTokenizer
from transformers.models.llama.tokenization_llama import LlamaTokenizer
from transformers.testing_utils import (
require_tokenizers,
slow,
)
@require_tokenizers
class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    """Tokenization tests for `LlamaTokenizer`.

    Inherits from `TokenizerTesterMixin` (imported from
    `tests.test_tokenization_common`) and adds two Llama-specific tests:
    loading the tiktoken-based Llama 3 checkpoint, and a regression test for
    decode-time space cleanup on Llama 3.

    The class attributes below are presumably read by the mixin to configure
    its shared tests; verify against `TokenizerTesterMixin`.
    """

    from_pretrained_id = [
        "hf-internal-testing/llama-tokenizer",
        "meta-llama/Llama-2-7b-hf",
        "meta-llama/Meta-Llama-3-8B",
    ]
    tokenizer_class = LlamaTokenizer
    from_pretrained_kwargs = {}

    # Integration test data - expected outputs for the default input string
    # NOTE(review): several entries of `integration_expected_tokens` are the empty
    # string although the ids at the same positions differ (e.g. 29871, 30486,
    # 31010), and whitespace runs in `integration_expected_decoded_text` look
    # collapsed; characters may have been lost in transit. Confirm these literals
    # against the tokenizer's real output before relying on them.
    integration_expected_tokens = ["▁This", "▁is", "▁a", "▁test", "", "<0xF0>", "<0x9F>", "<0x98>", "<0x8A>", "<0x0A>", "I", "▁was", "▁born", "▁in", "", "9", "2", "0", "0", "0", ",", "▁and", "▁this", "▁is", "▁f", "als", "é", ".", "<0x0A>", "", "", "", "", "<0xE8>", "<0xB0>", "<0x9B>", "", "<0x0A>", "Hi", "", "▁Hello", "<0x0A>", "Hi", "▁▁", "▁Hello", "<0x0A>", "<0x0A>", "", "<0x0A>", "▁▁", "<0x0A>", "▁Hello", "<0x0A>", "<s>", "<0x0A>", "hi", "<s>", "there", "<0x0A>", "The", "▁following", "▁string", "▁should", "▁be", "▁properly", "▁encoded", ":", "▁Hello", ".", "<0x0A>", "But", "", "ird", "▁and", "", "", "", "▁▁▁", "ird", "▁▁▁", "", "<0x0A>", "H", "ey", "▁how", "▁are", "▁you", "▁doing"]  # fmt: skip
    integration_expected_token_ids = [910, 338, 263, 1243, 29871, 243, 162, 155, 141, 13, 29902, 471, 6345, 297, 29871, 29929, 29906, 29900, 29900, 29900, 29892, 322, 445, 338, 285, 1338, 29948, 29889, 13, 30486, 31704, 30210, 30848, 235, 179, 158, 30392, 13, 18567, 29871, 15043, 13, 18567, 259, 15043, 13, 13, 29871, 13, 259, 13, 15043, 13, 1, 13, 2918, 1, 12711, 13, 1576, 1494, 1347, 881, 367, 6284, 18511, 29901, 15043, 29889, 13, 6246, 29871, 1823, 322, 29871, 31010, 30691, 1678, 1823, 1678, 30718, 13, 29950, 1032, 920, 526, 366, 2599]  # fmt: skip
    integration_expected_decoded_text = "This is a test 😊\nI was born in 92000, and this is falsé.\n生活的真谛是\nHi Hello\nHi Hello\n\n \n \n Hello\n<s>\nhi<s>there\nThe following string should be properly encoded: Hello.\nBut ird and ปี ird ด\nHey how are you doing"

    @classmethod
    def setUpClass(cls):
        """Save a Llama tokenizer with a pad token into `cls.tmpdirname`.

        Runs the parent `setUpClass` first, then loads the
        "hf-internal-testing/llama-tokenizer" checkpoint, reuses its EOS token
        as the pad token, and writes the result to `cls.tmpdirname`
        (presumably created by the parent `setUpClass`; verify).
        """
        super().setUpClass()
        from_pretrained_id = "hf-internal-testing/llama-tokenizer"
        tokenizer = LlamaTokenizer.from_pretrained(from_pretrained_id)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.save_pretrained(cls.tmpdirname)

    def get_tokenizers(self, **kwargs):
        """Delegate to the parent `get_tokenizers`, defaulting `pad_token` to "<PAD>".

        A `pad_token` passed explicitly by the caller is left untouched.
        """
        kwargs.setdefault("pad_token", "<PAD>")
        return super().get_tokenizers(**kwargs)

    def test_load_tiktoken_tokenizer(self):
        """Test loading a Llama tokenizer from tiktoken.model file"""
        checkpoint = "hf-internal-testing/llama3-tokenizer-tiktoken"
        text = "This is a test"
        # The same encode/decode round-trip must hold whether the checkpoint is
        # loaded through the auto class or through LlamaTokenizer directly.
        for loader in (AutoTokenizer, LlamaTokenizer):
            with self.subTest(loader=loader.__name__):
                tokenizer = loader.from_pretrained(checkpoint)
                tokens = tokenizer.encode(text, add_special_tokens=False)
                decoded = tokenizer.decode(tokens, skip_special_tokens=True)
                self.assertEqual(decoded, text)

    @slow
    def test_llama3_bpe_skips_clean_up_tokenization_spaces(self):
        """Check that Llama 3 decoding keeps spaces around punctuation by default."""
        # Llama 3 ships with `clean_up_tokenization_spaces=True` in its config, but as a
        # BPE tokenizer it must skip the cleanup — otherwise legitimate spaces around
        # punctuation get stripped (e.g. "x != y" -> "x!= y"). Regression test for #44915.
        tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
        # Precondition: the shipped config sets the flag, which is what triggers the bug.
        self.assertTrue(tokenizer.clean_up_tokenization_spaces)

        # Each case pairs the input text with the corrupted form cleanup would produce.
        cases = [("x != y", "x!= y"), ("! ! !", "!!!"), ("a , b", "a, b")]

        # Default behaviour: decoding round-trips the text unchanged.
        for text, _ in cases:
            with self.subTest(text=text, cleanup_override=False):
                ids = tokenizer.encode(text, add_special_tokens=False)
                self.assertEqual(tokenizer.decode(ids), text)

        # Escape hatch: the override flag reintroduces the destructive cleanup.
        tokenizer.clean_up_tokenization_spaces_for_bpe_even_though_it_will_corrupt_output = True
        for text, corrupted in cases:
            with self.subTest(text=text, cleanup_override=True):
                ids = tokenizer.encode(text, add_special_tokens=False)
                self.assertEqual(tokenizer.decode(ids), corrupted)