first commit
Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled

This commit is contained in:
陈赣
2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions

View File

View File

@@ -0,0 +1,484 @@
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import unittest
from transformers import XGLMConfig, is_torch_available
from transformers.testing_utils import (
Expectations,
cleanup,
is_torch_greater_or_equal,
require_torch,
require_torch_accelerator,
require_torch_fp16,
slow,
torch_device,
)
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from transformers import XGLMForCausalLM, XGLMModel, XGLMTokenizer
class XGLMModelTester:
def __init__(
self,
parent,
batch_size=14,
seq_length=7,
is_training=True,
use_input_mask=True,
use_labels=True,
vocab_size=99,
d_model=32,
num_hidden_layers=2,
num_attention_heads=4,
ffn_dim=37,
activation_function="gelu",
activation_dropout=0.1,
attention_dropout=0.1,
max_position_embeddings=512,
initializer_range=0.02,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = d_model
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.ffn_dim = ffn_dim
self.activation_function = activation_function
self.activation_dropout = activation_dropout
self.attention_dropout = attention_dropout
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.scope = None
self.bos_token_id = 0
self.eos_token_id = 2
self.pad_token_id = 1
def prepare_config_and_inputs(
self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False
):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(3)
input_mask = None
if self.use_input_mask:
input_mask = random_attention_mask([self.batch_size, self.seq_length])
config = self.get_config(gradient_checkpointing=gradient_checkpointing)
return (
config,
input_ids,
input_mask,
)
def get_config(
self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False
):
return XGLMConfig(
vocab_size=self.vocab_size,
d_model=self.hidden_size,
num_layers=self.num_hidden_layers,
attention_heads=self.num_attention_heads,
ffn_dim=self.ffn_dim,
activation_function=self.activation_function,
activation_dropout=self.activation_dropout,
attention_dropout=self.attention_dropout,
max_position_embeddings=self.max_position_embeddings,
initializer_range=self.initializer_range,
use_cache=True,
bos_token_id=self.bos_token_id,
eos_token_id=self.eos_token_id,
pad_token_id=self.pad_token_id,
gradient_checkpointing=gradient_checkpointing,
)
def create_and_check_xglm_model(self, config, input_ids, input_mask, *args):
model = XGLMModel(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids)
result = model(input_ids)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
self.parent.assertEqual(len(result.past_key_values), config.num_hidden_layers)
def create_and_check_xglm_model_past(self, config, input_ids, input_mask, *args):
model = XGLMModel(config=config)
model.to(torch_device)
model.eval()
# first forward pass
outputs = model(input_ids, use_cache=True)
outputs_no_past = model(input_ids, use_cache=False)
self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
output, past = outputs.to_tuple()
# create hypothetical next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
# append to next input_ids and token_type_ids
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
output_from_no_past = model(next_input_ids)["last_hidden_state"]
output_from_past = model(next_tokens, past_key_values=past)["last_hidden_state"]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
# test that outputs are equal for slice
self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
def create_and_check_xglm_model_attention_mask_past(self, config, input_ids, input_mask, *args):
model = XGLMModel(config=config)
model.to(torch_device)
model.eval()
# create attention mask
attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
half_seq_length = self.seq_length // 2
attn_mask[:, half_seq_length:] = 0
# first forward pass
output, past = model(input_ids, attention_mask=attn_mask).to_tuple()
# create hypothetical next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
# append to next input_ids and attn_mask
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
attn_mask = torch.cat(
[attn_mask, torch.zeros((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)],
dim=1,
)
# get two different outputs
output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
# test that outputs are equal for slice
self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
def create_and_check_xglm_model_past_large_inputs(self, config, input_ids, input_mask, *args):
model = XGLMModel(config=config)
model.to(torch_device)
model.eval()
# first forward pass
outputs = model(input_ids, attention_mask=input_mask, use_cache=True)
output, past = outputs.to_tuple()
# create hypothetical next token and extent to next_input_ids
next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
next_mask = ids_tensor((self.batch_size, 3), vocab_size=1)
# append to next input_ids
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"]
output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past)[
"last_hidden_state"
]
self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1])
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
# test that outputs are equal for slice
self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
def create_and_check_lm_head_model(self, config, input_ids, input_mask, *args):
model = XGLMForCausalLM(config)
model.to(torch_device)
model.eval()
result = model(input_ids, labels=input_ids)
self.parent.assertEqual(result.loss.shape, ())
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_forward_and_backwards(
self, config, input_ids, input_mask, *args, gradient_checkpointing=False
):
model = XGLMForCausalLM(config)
model.to(torch_device)
if gradient_checkpointing:
model.gradient_checkpointing_enable()
result = model(input_ids, labels=input_ids)
self.parent.assertEqual(result.loss.shape, ())
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
result.loss.backward()
def create_and_check_xglm_weight_initialization(self, config, *args):
model = XGLMModel(config)
model_std = model.config.initializer_range / math.sqrt(2 * model.config.num_hidden_layers)
for key in model.state_dict():
if "c_proj" in key and "weight" in key:
self.parent.assertLessEqual(abs(torch.std(model.state_dict()[key]) - model_std), 0.001)
self.parent.assertLessEqual(abs(torch.mean(model.state_dict()[key]) - 0.0), 0.01)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
input_ids,
input_mask,
) = config_and_inputs
inputs_dict = {
"input_ids": input_ids,
}
return config, inputs_dict
@require_torch
class XGLMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (XGLMModel, XGLMForCausalLM) if is_torch_available() else ()
pipeline_model_mapping = (
{"feature-extraction": XGLMModel, "text-generation": XGLMForCausalLM} if is_torch_available() else {}
)
test_missing_keys = False
def setUp(self):
self.model_tester = XGLMModelTester(self)
self.config_tester = ConfigTester(self, config_class=XGLMConfig, n_embd=37)
def test_config(self):
self.config_tester.run_common_tests()
def test_xglm_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xglm_model(*config_and_inputs)
def test_xglm_model_past(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xglm_model_past(*config_and_inputs)
def test_xglm_model_att_mask_past(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xglm_model_attention_mask_past(*config_and_inputs)
def test_xglm_model_past_large_inputs(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xglm_model_past_large_inputs(*config_and_inputs)
def test_xglm_lm_head_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_lm_head_model(*config_and_inputs)
def test_xglm_gradient_checkpointing(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs, gradient_checkpointing=True)
def test_xglm_weight_initialization(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xglm_weight_initialization(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
model_name = "facebook/xglm-564M"
model = XGLMModel.from_pretrained(model_name)
self.assertIsNotNone(model)
@unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.")
def test_model_parallelism(self):
super().test_model_parallelism()
@require_torch
class XGLMModelLanguageGenerationTest(unittest.TestCase):
def tearDown(self):
super().tearDown()
# clean-up as much as possible GPU memory occupied by PyTorch
cleanup(torch_device, gc_collect=True)
def _test_lm_generate_xglm_helper(
self,
gradient_checkpointing=False,
verify_outputs=True,
):
model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M")
if gradient_checkpointing:
model.gradient_checkpointing_enable()
else:
model.gradient_checkpointing_disable()
model.to(torch_device)
input_ids = torch.tensor([[2, 268, 9865]], dtype=torch.long, device=torch_device) # The dog
# </s> The dog is a very friendly dog. He is very affectionate and loves to play with other
expected_output_ids = [2, 268, 9865, 67, 11, 1988, 57252, 9865, 5, 984, 67, 1988, 213838, 1658, 53, 70446, 33, 6657, 278, 1581, 72616, 5, 984] # fmt: skip
output_ids = model.generate(input_ids, do_sample=False, num_beams=1)
if verify_outputs:
self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
@slow
def test_batch_generation(self):
model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M")
model.to(torch_device)
tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
tokenizer.padding_side = "left"
# use different length sentences to test batching
sentences = [
"This is an extremely long sentence that only exists to test the ability of the model to cope with "
"left-padding, such as in batched generation. The output for the sequence below should be the same "
"regardless of whether left padding is applied or not. When",
"Hello, my dog is a little",
]
inputs = tokenizer(sentences, return_tensors="pt", padding=True)
input_ids = inputs["input_ids"].to(torch_device)
outputs = model.generate(
input_ids=input_ids, attention_mask=inputs["attention_mask"].to(torch_device), max_new_tokens=12
)
inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
output_non_padded = model.generate(input_ids=inputs_non_padded, max_new_tokens=12)
inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
output_padded = model.generate(input_ids=inputs_padded, max_new_tokens=12)
batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)
# fmt: off
expected_output_sentences = Expectations(
{
("xpu", None): [
'This is an extremely long sentence that only exists to test the ability of the model to cope with left-padding, such as in batched generation. The output for the sequence below should be the same regardless of whether left padding is applied or not. When left padding is applied, the model will not be able',
'Hello, my dog is a little bit of a shy one, but he is very friendly'
],
("cuda", None): [
"This is an extremely long sentence that only exists to test the ability of the model to cope with left-padding, such as in batched generation. The output for the sequence below should be the same regardless of whether left padding is applied or not. When left padding is applied, the sequence will be a single",
"Hello, my dog is a little bit of a shy one, but he is very friendly",
],
}
)
# fmt: on
expected_output_sentence = expected_output_sentences.get_expectation()
self.assertListEqual(expected_output_sentence, batch_out_sentence)
self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])
@slow
def test_lm_generate_xglm(self):
self._test_lm_generate_xglm_helper()
@slow
def test_lm_generate_xglm_with_gradient_checkpointing(self):
self._test_lm_generate_xglm_helper(gradient_checkpointing=True)
@slow
def test_xglm_sample(self):
tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M")
torch.manual_seed(0)
tokenized = tokenizer("Today is a nice day and", return_tensors="pt")
input_ids = tokenized.input_ids
output_ids = model.generate(input_ids, do_sample=True, num_beams=1)
output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)
if is_torch_greater_or_equal("2.7.0"):
cuda_expectation = (
"Today is a nice day and the sun is shining. A nice day with warm rainy and windy weather today."
)
else:
cuda_expectation = "Today is a nice day and the water is still cold. We just stopped off for some fresh coffee. This place looks like a"
expected_output_strings = Expectations(
{
("xpu", None): "Today is a nice day and the sun is shining. A nice day with warm rainy and windy weather today.",
("rocm", (9, 5)): "Today is a nice day and the sun is shining. A nice day with warm rainy and windy weather today.",
("cuda", None): cuda_expectation,
}
) # fmt: skip
EXPECTED_OUTPUT_STR = expected_output_strings.get_expectation()
self.assertEqual(output_str, EXPECTED_OUTPUT_STR)
@require_torch_accelerator
@require_torch_fp16
def test_batched_nan_fp16(self):
model_name = "facebook/xglm-564M"
tokenizer = XGLMTokenizer.from_pretrained(model_name, use_fast=False, padding_side="left")
model = XGLMForCausalLM.from_pretrained(model_name, dtype=torch.float16, use_cache=True).to(torch_device)
model = model.eval()
batch = tokenizer(["Who are you?", "Joe Biden is the president of"], padding=True, return_tensors="pt")
input_ids = batch["input_ids"].to(torch_device)
attention_mask = batch["attention_mask"].to(torch_device)
with torch.no_grad():
outputs = model(input_ids, attention_mask=attention_mask)
self.assertFalse(
torch.isnan(outputs.logits[0]).any().item()
) # the first logits could contain NaNs if it fails
@slow
def test_loss_with_padding(self):
tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M")
model.to(torch_device)
tokenizer.padding_side = "right"
sequence = "Sequence"
tokenized_non_padded = tokenizer(sequence, return_tensors="pt")
tokenized_non_padded.to(torch_device)
labels_non_padded = tokenized_non_padded.input_ids.clone()
loss_non_padded = model(**tokenized_non_padded, labels=labels_non_padded).loss
tokenized_padded = tokenizer(sequence, padding="max_length", max_length=16, return_tensors="pt")
tokenized_padded.to(torch_device)
labels_padded = tokenized_padded.input_ids.clone()
labels_padded[labels_padded == tokenizer.pad_token_id] = -100
loss_padded = model(**tokenized_padded, labels=labels_padded).loss
torch.testing.assert_close(loss_non_padded, loss_padded, rtol=1e-3, atol=1e-3)

View File

@@ -0,0 +1,41 @@
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from tests.test_tokenization_common import TokenizerTesterMixin
from transformers.models.xglm.tokenization_xglm import XGLMTokenizer
from transformers.testing_utils import require_tokenizers
@require_tokenizers
class XGLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_id = ["facebook/xglm-564M"]
tokenizer_class = XGLMTokenizer
integration_expected_tokens = ['▁This', '▁is', '▁a', '▁test', '', '😊', '▁I', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and', '▁this', '▁is', '▁fals', 'é', '.', '', '生活的', '', '', '', '▁Hi', '▁Hello', '▁Hi', '▁Hello', '▁Hello', '', '<s>', '▁hi', '<s>', '▁there', '▁The', '▁following', '▁string', '▁should', '▁be', '▁properly', '▁en', 'code', 'd', ':', '▁Hello', '.', '▁But', '▁ir', 'd', '▁and', '▁ปี', '▁ir', 'd', '▁ด', '▁Hey', '▁how', '▁are', '▁you', '▁doing'] # fmt: skip
integration_expected_token_ids = [1018, 67, 11, 3194, 6, 61533, 44, 254, 23572, 22, 465, 13323, 4, 53, 319, 67, 84785, 185, 5, 6, 63782, 2530, 3, 322, 2751, 31227, 2751, 31227, 31227, 6, 0, 1075, 0, 1193, 268, 12894, 44036, 2817, 113, 77749, 29, 21257, 72, 13, 31227, 5, 2079, 246, 72, 53, 10845, 246, 72, 30937, 20933, 1271, 256, 206, 7667] # fmt: skip
expected_tokens_from_ids = ['▁This', '▁is', '▁a', '▁test', '', '😊', '▁I', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and', '▁this', '▁is', '▁fals', 'é', '.', '', '生活的', '', '<unk>', '', '▁Hi', '▁Hello', '▁Hi', '▁Hello', '▁Hello', '', '<s>', '▁hi', '<s>', '▁there', '▁The', '▁following', '▁string', '▁should', '▁be', '▁properly', '▁en', 'code', 'd', ':', '▁Hello', '.', '▁But', '▁ir', 'd', '▁and', '▁ปี', '▁ir', 'd', '▁ด', '▁Hey', '▁how', '▁are', '▁you', '▁doing'] # fmt: skip
integration_expected_decoded_text = "This is a test 😊 I was born in 92000, and this is falsé. 生活的真<unk>是 Hi Hello Hi Hello Hello <s> hi<s> there The following string should be properly encoded: Hello. But ird and ปี ird ด Hey how are you doing"
@classmethod
def setUpClass(cls):
super().setUpClass()
from_pretrained_id = "facebook/xglm-564M"
tokenizer = XGLMTokenizer.from_pretrained(from_pretrained_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained(cls.tmpdirname)
cls.tokenizers = [tokenizer]