Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled
75 lines
4.6 KiB
Python
75 lines
4.6 KiB
Python
import unittest
|
|
|
|
from tests.test_tokenization_common import TokenizerTesterMixin
|
|
from transformers import AutoTokenizer
|
|
from transformers.models.llama.tokenization_llama import LlamaTokenizer
|
|
from transformers.testing_utils import (
|
|
require_tokenizers,
|
|
slow,
|
|
)
|
|
|
|
|
|
@require_tokenizers
class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    """Tokenizer tests for `LlamaTokenizer`.

    The class-level attributes below configure the shared checks provided by
    `TokenizerTesterMixin`; the `test_*` methods add Llama-specific cases.
    """

    # Hub checkpoints exercised by the shared tests.
    from_pretrained_id = [
        "hf-internal-testing/llama-tokenizer",
        "meta-llama/Llama-2-7b-hf",
        "meta-llama/Meta-Llama-3-8B",
    ]
    tokenizer_class = LlamaTokenizer
    from_pretrained_kwargs = {}

    # Integration test data - expected outputs for the default input string
    integration_expected_tokens = ["▁This", "▁is", "▁a", "▁test", "▁", "<0xF0>", "<0x9F>", "<0x98>", "<0x8A>", "<0x0A>", "I", "▁was", "▁born", "▁in", "▁", "9", "2", "0", "0", "0", ",", "▁and", "▁this", "▁is", "▁f", "als", "é", ".", "<0x0A>", "生", "活", "的", "真", "<0xE8>", "<0xB0>", "<0x9B>", "是", "<0x0A>", "Hi", "▁", "▁Hello", "<0x0A>", "Hi", "▁▁", "▁Hello", "<0x0A>", "<0x0A>", "▁", "<0x0A>", "▁▁", "<0x0A>", "▁Hello", "<0x0A>", "<s>", "<0x0A>", "hi", "<s>", "there", "<0x0A>", "The", "▁following", "▁string", "▁should", "▁be", "▁properly", "▁encoded", ":", "▁Hello", ".", "<0x0A>", "But", "▁", "ird", "▁and", "▁", "ป", "ี", "▁▁▁", "ird", "▁▁▁", "ด", "<0x0A>", "H", "ey", "▁how", "▁are", "▁you", "▁doing"]  # fmt: skip
    integration_expected_token_ids = [910, 338, 263, 1243, 29871, 243, 162, 155, 141, 13, 29902, 471, 6345, 297, 29871, 29929, 29906, 29900, 29900, 29900, 29892, 322, 445, 338, 285, 1338, 29948, 29889, 13, 30486, 31704, 30210, 30848, 235, 179, 158, 30392, 13, 18567, 29871, 15043, 13, 18567, 259, 15043, 13, 13, 29871, 13, 259, 13, 15043, 13, 1, 13, 2918, 1, 12711, 13, 1576, 1494, 1347, 881, 367, 6284, 18511, 29901, 15043, 29889, 13, 6246, 29871, 1823, 322, 29871, 31010, 30691, 1678, 1823, 1678, 30718, 13, 29950, 1032, 920, 526, 366, 2599]  # fmt: skip
    # Runs of consecutive spaces are significant here: they mirror the "▁", "▁▁" and
    # "▁▁▁" entries of `integration_expected_tokens` above and must not be collapsed.
    integration_expected_decoded_text = "This is a test 😊\nI was born in 92000, and this is falsé.\n生活的真谛是\nHi  Hello\nHi   Hello\n\n \n  \n Hello\n<s>\nhi<s>there\nThe following string should be properly encoded: Hello.\nBut ird and ปี   ird   ด\nHey how are you doing"

    @classmethod
    def setUpClass(cls):
        """Save a Llama tokenizer whose pad token is its EOS token into `cls.tmpdirname`."""
        # NOTE(review): `cls.tmpdirname` is presumably created by the parent
        # `setUpClass` - confirm against `TokenizerTesterMixin`.
        super().setUpClass()

        # Named differently from the class-level `from_pretrained_id` list to avoid shadowing it.
        checkpoint = "hf-internal-testing/llama-tokenizer"

        tokenizer = LlamaTokenizer.from_pretrained(checkpoint)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.save_pretrained(cls.tmpdirname)

    def get_tokenizers(self, **kwargs):
        """Delegate to the parent implementation, defaulting `pad_token` to "<PAD>"."""
        kwargs.setdefault("pad_token", "<PAD>")
        return super().get_tokenizers(**kwargs)

    def test_load_tiktoken_tokenizer(self):
        """Test loading a Llama tokenizer from tiktoken.model file"""
        checkpoint = "hf-internal-testing/llama3-tokenizer-tiktoken"
        text = "This is a test"

        # The same encode/decode round trip must hold for both entry points.
        for loader in (AutoTokenizer, LlamaTokenizer):
            with self.subTest(loader=loader.__name__):
                tokenizer = loader.from_pretrained(checkpoint)
                tokens = tokenizer.encode(text, add_special_tokens=False)
                decoded = tokenizer.decode(tokens, skip_special_tokens=True)
                self.assertEqual(decoded, text)

    @slow
    def test_llama3_bpe_skips_clean_up_tokenization_spaces(self):
        """Check that decoding with the Llama 3 tokenizer keeps spaces around punctuation."""
        # Llama 3 ships with `clean_up_tokenization_spaces=True` in its config, but as a
        # BPE tokenizer it must skip the cleanup — otherwise legitimate spaces around
        # punctuation get stripped (e.g. "x != y" -> "x!= y"). Regression test for #44915.
        tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
        # Precondition: the shipped config sets the flag, which is what triggers the bug.
        self.assertTrue(tokenizer.clean_up_tokenization_spaces)

        # Each pair is (input text, text after the destructive cleanup).
        cases = [("x != y", "x!= y"), ("! ! !", "!!!"), ("a , b", "a, b")]
        for text, _ in cases:
            with self.subTest(text=text, cleanup_override=False):
                ids = tokenizer.encode(text, add_special_tokens=False)
                self.assertEqual(tokenizer.decode(ids), text)

        # Escape hatch: the override flag reintroduces the destructive cleanup.
        tokenizer.clean_up_tokenization_spaces_for_bpe_even_though_it_will_corrupt_output = True
        for text, corrupted in cases:
            with self.subTest(text=text, cleanup_override=True):
                ids = tokenizer.encode(text, add_special_tokens=False)
                self.assertEqual(tokenizer.decode(ids), corrupted)