first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/tests/trainer/distributed/scripts/fsdp_generate.py
+++ b/tests/trainer/distributed/scripts/fsdp_generate.py
@@ -0,0 +1,125 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Worker script for FSDP generation tests.
+
+Launched via ``torchrun`` from ``test_trainer_distributed_fsdp.py``.
+"""
+
+import argparse
+import functools
+from collections.abc import Callable
+from typing import Any
+
+import torch
+import torch.distributed
+from torch.distributed._composable.fsdp import fully_shard, register_fsdp_forward_method
+from torch.distributed.device_mesh import init_device_mesh
+from torch.distributed.fsdp import FullyShardedDataParallel
+from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.models.gpt2.modeling_gpt2 import GPT2Block
+from transformers.testing_utils import backend_device_count, backend_torch_accelerator_module, torch_device
+
+
+data = 4 * [
+    "Hello world!",
+    "The quick brown fox jumps over the lazy dog.",
+]
+
+
+def manage_process_group(func: Callable[..., Any]) -> Callable[..., Any]:
+    """Manage the creation and destruction of the distributed process group for the wrapped function."""
+
+    def wrapped(*args: Any, **kwargs: Any) -> Any:
+        device_count = backend_device_count(torch_device)
+        torch.distributed.init_process_group(world_size=device_count)
+        try:
+            return func(*args, **kwargs)
+        finally:
+            torch.distributed.destroy_process_group()
+
+    return wrapped
+
+
+@manage_process_group
+def fsdp_generate():
+    torch_accelerator_module = backend_torch_accelerator_module(torch_device)
+    torch_accelerator_module.set_device(device := torch.device(rank := torch.distributed.get_rank()))
+
+    model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(device)
+
+    fsdp_model = FullyShardedDataParallel(
+        model,
+        auto_wrap_policy=functools.partial(transformer_auto_wrap_policy, transformer_layer_cls={GPT2Block}),
+        limit_all_gathers=True,
+        use_orig_params=True,
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
+    batch = tokenizer(data[rank], return_tensors="pt", return_attention_mask=True).to(device)
+
+    with FullyShardedDataParallel.summon_full_params(fsdp_model):
+        _ = fsdp_model.module.generate(
+            input_ids=batch["input_ids"],
+            attention_mask=batch["attention_mask"],
+            max_length=30,
+        )
+
+
+@manage_process_group
+def fsdp2_generate():
+    torch_accelerator_module = backend_torch_accelerator_module(torch_device)
+    torch_accelerator_module.set_device(device := torch.device(rank := torch.distributed.get_rank()))
+
+    model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(device)
+
+    mesh = init_device_mesh(device.type, (torch.distributed.get_world_size(),))
+    for submodule in model.modules():
+        if isinstance(submodule, GPT2Block):
+            fully_shard(submodule, mesh=mesh)
+    fully_shard(model, mesh=mesh)
+
+    register_fsdp_forward_method(model, "generate")
+
+    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
+    batch = tokenizer(data[rank], return_tensors="pt", return_attention_mask=True).to(device)
+
+    _ = model.generate(
+        input_ids=batch["input_ids"],
+        attention_mask=batch["attention_mask"],
+        max_length=30,
+    )
+
+
+if __name__ == "__main__":
+
+    class CLIArgs(argparse.Namespace):
+        fsdp: bool
+        fsdp2: bool
+
+    parser = argparse.ArgumentParser()
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument("--fsdp", action="store_true")
+    group.add_argument("--fsdp2", action="store_true")
+    args = parser.parse_args(namespace=CLIArgs())
+
+    if args.fsdp:
+        fsdp_generate()
+    elif args.fsdp2:
+        fsdp2_generate()
+    else:
+        raise ValueError("Missing test selection")