Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled
24 lines
3.2 KiB
Python
24 lines
3.2 KiB
Python
import unittest
|
||
|
||
from transformers.models.blenderbot.tokenization_blenderbot import BlenderbotTokenizer
|
||
from transformers.testing_utils import require_tokenizers
|
||
|
||
from ...test_tokenization_common import TokenizerTesterMixin
|
||
|
||
|
||
@require_tokenizers
|
||
class BlenderbotTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||
from_pretrained_id = ["facebook/blenderbot-3B"]
|
||
tokenizer_class = BlenderbotTokenizer
|
||
|
||
integration_expected_tokens = ['ĠThis', 'Ġis', 'Ġa', 'Ġtest', 'ĠðŁĺ', 'Ĭ', 'Ċ', 'I', 'Ġwas', 'Ġborn', 'Ġin', 'Ġ9', '2', '000', ',', 'Ġand', 'Ġthis', 'Ġis', 'Ġf', 'als', 'é', '.', 'Ċ', 'ç', 'Ķ', 'Ł', 'æ', '´', '»', 'ç', 'ļ', 'Ħ', 'ç', 'ľ', 'Ł', 'è', '°', 'Ľ', 'æ', 'ĺ', '¯', 'Ċ', 'H', 'i', 'Ġ', 'ĠHello', 'Ċ', 'H', 'i', 'Ġ', 'Ġ', 'ĠHello', 'Ċ', 'Ċ', 'Ġ', 'Ċ', 'Ġ', 'Ġ', 'Ċ', 'ĠHello', 'Ċ', '<s>', 'Ġ', 'Ċ', 'hi', '<s>', 'Ġthere', 'Ċ', 'The', 'Ġfollowing', 'Ġstring', 'Ġshould', 'Ġbe', 'Ġproperly', 'Ġenc', 'od', 'ed', ':', 'ĠHello', '.', 'Ċ', 'B', 'ut', 'Ġ', 'ird', 'Ġand', 'Ġ', 'à', '¸', 'Ľ', 'à', '¸', 'µ', 'Ġ', 'Ġ', 'Ġ', 'ird', 'Ġ', 'Ġ', 'Ġ', 'à', '¸', 'Ķ', 'Ċ', 'H', 'ey', 'Ġhow', 'Ġare', 'Ġyou', 'Ġdoing'] # fmt: skip
|
||
integration_expected_token_ids = [678, 315, 265, 1689, 3417, 240, 206, 48, 372, 3647, 302, 1207, 25, 1694, 19, 298, 381, 315, 284, 1095, 3952, 21, 206, 171, 250, 261, 170, 120, 127, 171, 256, 234, 171, 258, 261, 172, 116, 257, 170, 254, 115, 206, 47, 80, 228, 6950, 206, 47, 80, 228, 228, 6950, 206, 206, 228, 206, 228, 228, 206, 6950, 206, 1, 228, 206, 7417, 1, 505, 206, 2839, 3504, 7884, 636, 310, 3867, 2525, 621, 296, 33, 6950, 21, 206, 41, 329, 228, 1221, 298, 228, 164, 124, 257, 164, 124, 121, 228, 228, 228, 1221, 228, 228, 228, 164, 124, 250, 206, 47, 3110, 544, 366, 304, 929] # fmt: skip
|
||
expected_tokens_from_ids = ['ĠThis', 'Ġis', 'Ġa', 'Ġtest', 'ĠðŁĺ', 'Ĭ', 'Ċ', 'I', 'Ġwas', 'Ġborn', 'Ġin', 'Ġ9', '2', '000', ',', 'Ġand', 'Ġthis', 'Ġis', 'Ġf', 'als', 'é', '.', 'Ċ', 'ç', 'Ķ', 'Ł', 'æ', '´', '»', 'ç', 'ļ', 'Ħ', 'ç', 'ľ', 'Ł', 'è', '°', 'Ľ', 'æ', 'ĺ', '¯', 'Ċ', 'H', 'i', 'Ġ', 'ĠHello', 'Ċ', 'H', 'i', 'Ġ', 'Ġ', 'ĠHello', 'Ċ', 'Ċ', 'Ġ', 'Ċ', 'Ġ', 'Ġ', 'Ċ', 'ĠHello', 'Ċ', '<s>', 'Ġ', 'Ċ', 'hi', '<s>', 'Ġthere', 'Ċ', 'The', 'Ġfollowing', 'Ġstring', 'Ġshould', 'Ġbe', 'Ġproperly', 'Ġenc', 'od', 'ed', ':', 'ĠHello', '.', 'Ċ', 'B', 'ut', 'Ġ', 'ird', 'Ġand', 'Ġ', 'à', '¸', 'Ľ', 'à', '¸', 'µ', 'Ġ', 'Ġ', 'Ġ', 'ird', 'Ġ', 'Ġ', 'Ġ', 'à', '¸', 'Ķ', 'Ċ', 'H', 'ey', 'Ġhow', 'Ġare', 'Ġyou', 'Ġdoing'] # fmt: skip
|
||
integration_expected_decoded_text = " This is a test 😊\nI was born in 92000, and this is falsé.\n生活的真谛是\nHi Hello\nHi Hello\n\n \n \n Hello\n<s> \nhi<s> there\nThe following string should be properly encoded: Hello.\nBut ird and ปี ird ด\nHey how are you doing"
|
||
|
||
def test_pretokenized_inputs(self, *args, **kwargs):
|
||
# It's very difficult to mix/test pretokenization with byte-level tokenizers
|
||
# The issue is that when you have a sequence with leading spaces, splitting it
|
||
# with .split() loses the leading spaces, so the tokenization results differ
|
||
pass
|