first commit

2026-06-05 16:53:03 +08:00
commit 06f1fd69a6
6047 changed files with 1895387 additions and 0 deletions
--- a/benchmark_v2/benchmark_scripts/continuous_batching_overall.py
+++ b/benchmark_v2/benchmark_scripts/continuous_batching_overall.py
@@ -0,0 +1,443 @@
+"""
+Continuous batching overall benchmark suite.
+
+Runs CB in-process across many configurations (GSM8K and IFEval prompts, plus
+synthetic random-token data) and can compare throughput against a previously-saved run.
+"""
+
+import argparse
+import gc
+import json
+import time
+import types
+from collections.abc import Callable
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Any
+
+import torch
+from lighteval.models.model_output import ModelResponse
+from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig
+from lighteval.tasks.prompt_manager import PromptManager
+from lighteval.tasks.registry import Registry
+from lighteval.tasks.requests import Doc
+from tabulate import tabulate
+
+from transformers import AutoModelForCausalLM, AutoTokenizer, ContinuousBatchingConfig, GenerationConfig
+
+
+# Defaults
+# Directory where `BenchmarkResults.save` writes, and `BenchmarkResults.load_most_recent` looks up, JSON result files.
+RESULTS_DIR = Path(__file__).parent.parent / "benchmark_results/cb_overall/"
+
+
+def _fmt(val: Any, spec: str = "", missing: str = "X") -> str:
+    """Render `val` with the format spec `spec`; a `None` value yields the `missing` placeholder instead."""
+    if val is None:
+        return missing
+    return format(val, spec)
+
+
+def _build_gsm8k_platinum_module() -> types.ModuleType:
+    """Build an in-memory module whose `TASKS_TABLE` declares the gsm8k_platinum task.
+
+    The module is meant to be passed to lighteval's Registry through `custom_tasks=`. The task's metrics are copied
+    from the registry's `gsm8k` task config.
+    """
+
+    def gsm8k_platinum_prompt(line, task_name=None):
+        # One Doc per dataset row: the question forms the query, the reference answer is the single gold choice.
+        question, answer = line["question"], line["answer"]
+        return Doc(
+            task_name=task_name,
+            query=f"Question: {question}\nAnswer:",
+            choices=[f" {answer}"],
+            gold_index=0,
+        )
+
+    gsm8k_config = Registry().load_all_task_configs()["gsm8k"]
+    platinum_config = LightevalTaskConfig(
+        name="gsm8k_platinum",
+        prompt_function=gsm8k_platinum_prompt,
+        hf_repo="madrylab/gsm8k-platinum",
+        hf_subset="main",
+        evaluation_splits=("test",),
+        few_shots_split="test",
+        few_shots_select="random_sampling",
+        generation_size=256,
+        stop_sequence=["Question:"],
+        metrics=list(gsm8k_config.metrics),
+    )
+
+    module = types.ModuleType("_gsm8k_platinum_inline")
+    module.TASKS_TABLE = [platinum_config]
+    return module
+
+
+def _build_lighteval_inputs_scorer(
+    tokenizer: AutoTokenizer,
+    *,
+    task_spec: str,
+    task_name: str,
+    use_chat_template: bool,
+    custom_tasks: Any = None,
+    primary_metric: str | None = None,
+    stop_sequences: tuple[str, ...] = (),
+) -> tuple[list[list[int]], Callable[[Any], float]]:
+    """Load a lighteval task, tokenize its prompts and return them with a scoring function.
+
+    Args:
+        tokenizer: Tokenizer used to encode the prompts and to decode generated tokens when scoring.
+        task_spec: Task specification handed to `Registry(tasks=...)`.
+        task_name: Key used to look up the task's configs in the registry; the first metric of the first config
+            is the one used for scoring.
+        use_chat_template: Forwarded to `PromptManager`; special tokens are added at tokenization time only when
+            this is False.
+        custom_tasks: Optional custom-task module forwarded to the Registry when truthy.
+        primary_metric: Key to read when the metric returns a dict of sub-metrics.
+        stop_sequences: Decoded generations are cut at the first occurrence of each of these strings.
+
+    Returns:
+        A tuple `(inputs, score)`: the token ids of every prompt, and a function mapping the generation outputs to
+        the mean per-sample metric value.
+    """
+    registry_kwargs = {"custom_tasks": custom_tasks} if custom_tasks else {}
+    registry = Registry(tasks=task_spec, **registry_kwargs)
+    metric = registry.task_to_configs[task_name][0].metrics[0]
+    tasks_dict = registry.load_tasks()
+    LightevalTask.load_datasets(tasks_dict, 1)
+    first_task = next(iter(tasks_dict.values()))
+    docs = first_task.get_docs()
+
+    prompt_manager = PromptManager(use_chat_template=use_chat_template, tokenizer=tokenizer, system_prompt=None)
+    prompts = [prompt_manager.prepare_prompt(doc) for doc in docs]
+    inputs = tokenizer(prompts, add_special_tokens=not use_chat_template)["input_ids"]
+
+    def _cut_at_stops(text: str) -> str:
+        # Keep only what precedes the first occurrence of each stop sequence, applied in order.
+        for stop in stop_sequences:
+            text = text.split(stop, 1)[0]
+        return text
+
+    def score(outputs) -> float:
+        # NOTE(review): docs and outputs are paired by position — this assumes `outputs` iterates in the same
+        # order as the prompts were submitted; confirm against the producer of `outputs`.
+        per_sample = []
+        for doc, out in zip(docs, outputs.values()):
+            decoded = tokenizer.decode(out.generated_tokens, skip_special_tokens=True)
+            response = ModelResponse(text=[_cut_at_stops(decoded)])
+            value = metric.sample_level_fn.compute(doc, response)
+            # Grouped metrics return a dict keyed by sub-metric — pick the primary one.
+            if isinstance(value, dict):
+                value = value[primary_metric]
+            per_sample.append(value)
+        return sum(per_sample) / len(per_sample)
+
+    return inputs, score
+
+
+# Data helpers
+def get_tokenized_gsm8k(
+    tokenizer: AutoTokenizer, n_fewshot: int = 8
+) -> tuple[list[list[int]], Callable[[Any], float]]:
+    """Return tokenized GSM8K-Platinum prompts (`n_fewshot`-shot, no chat template) together with their scorer.
+
+    The task is the inline gsm8k_platinum custom task; generations are cut at "Question:" before scoring.
+    """
+    task_name = "gsm8k_platinum"
+    platinum_module = _build_gsm8k_platinum_module()
+    inputs, score_fn = _build_lighteval_inputs_scorer(
+        tokenizer,
+        task_spec=f"{task_name}|{n_fewshot}",
+        task_name=task_name,
+        use_chat_template=False,
+        custom_tasks=platinum_module,
+        stop_sequences=("Question:",),
+    )
+    return inputs, score_fn
+
+
+def get_tokenized_ifeval(tokenizer: AutoTokenizer) -> tuple[list[list[int]], Callable[[Any], float]]:
+    """Return tokenized IFEval prompts (0-shot, chat template applied) together with their scorer.
+
+    When the task metric returns a dict of sub-metrics, the scorer reads its `prompt_level_strict_acc` entry.
+    """
+    scorer_options = {
+        "task_spec": "ifeval|0",
+        "task_name": "ifeval",
+        "use_chat_template": True,
+        "primary_metric": "prompt_level_strict_acc",
+    }
+    return _build_lighteval_inputs_scorer(tokenizer, **scorer_options)
+
+
+def get_random_data(batch_size: int, num_tokens: int, vocab_size: int = 16000, seed: int = 0) -> list[list[int]]:
+    """Random token sequences of fixed length, for raw throughput tests.
+
+    Args:
+        batch_size: Number of sequences to generate.
+        num_tokens: Length of every sequence.
+        vocab_size: Exclusive upper bound of the sampled token ids (ids are drawn from `[0, vocab_size)`).
+        seed: Seed of the dedicated generator. The default of 0 reproduces the data of earlier runs, so saved
+            baselines stay comparable.
+
+    Returns:
+        `batch_size` lists of `num_tokens` token ids; identical for identical arguments.
+    """
+    # A private generator keeps the data reproducible without touching the global torch RNG state.
+    rng = torch.Generator().manual_seed(seed)
+    return [torch.randint(0, vocab_size, (num_tokens,), generator=rng).tolist() for _ in range(batch_size)]
+
+
+# Benchmark entries and collection
+@dataclass
+class BenchmarkEntry:
+    """Single CB run: what was fed in, which configs were used, and the resulting metrics.
+
+    The first six fields describe the run and are always set; the remaining ones default to None and are filled in
+    by `BenchmarkResults.add_benchmark` once (and if) the corresponding measurement is available.
+    """
+
+    # Display name of the run; also the key used to match entries when comparing against a baseline.
+    label: str
+    # Number of input sequences.
+    num_samples: int
+    # Mean length, in tokens, of the input sequences.
+    avg_input_tokens: float
+    # Generation length limit applied to the run.
+    max_new_tokens: int
+    # JSON-friendly summaries (scalar fields only) of the continuous batching and generation configs.
+    cb_config: dict[str, Any]
+    gen_config: dict[str, Any]
+    # Duration of the generation, in seconds.
+    time_seconds: float | None = None
+    # Total number of generated tokens across all outputs.
+    num_tokens: int | None = None
+    # num_tokens / time_seconds.
+    throughput_tok_per_sec: float | None = None
+    # Peak CUDA memory allocated during the run, in GiB (bytes / 1024**3).
+    peak_memory_gb: float | None = None
+    # Value returned by the optional scoring function.
+    accuracy: float | None = None
+    # Message describing the exception raised during the run, if any.
+    error: str | None = None
+
+
+def _config_summary(cfg: Any) -> dict[str, Any]:
+    """Return the scalar fields (int, float, str, bool or None) of a config object as a JSON-friendly dict.
+
+    Fields come from `cfg.to_dict()` when the object has that method, otherwise from `cfg.__dict__`.
+    """
+    if hasattr(cfg, "to_dict"):
+        raw = cfg.to_dict()
+    else:
+        raw = cfg.__dict__
+
+    summary = {}
+    for key, value in raw.items():
+        # bool is a subclass of int, so it is covered by the int check.
+        if value is None or isinstance(value, (int, float, str)):
+            summary[key] = value
+    return summary
+
+
+class BenchmarkResults:
+    """Holds all CB benchmark runs and the settings of the model they execute against.
+
+    Each `add_benchmark` call loads a fresh model, runs one continuous-batching generation and appends a
+    `BenchmarkEntry` to `entries`. Runs can be printed, saved as JSON under `RESULTS_DIR`, reloaded and compared.
+    """
+
+    def __init__(self, model_id: str, attn_impl: str, tp_size: int = 1):
+        self.model_id = model_id
+        self.attn_impl = attn_impl
+        self.tp_size = tp_size
+        self.entries: list[BenchmarkEntry] = []
+
+    def cleanup(self) -> None:
+        """Garbage-collect, release cached CUDA memory and reset the CUDA peak-memory statistics."""
+        # Collect first: memory held by unreachable tensors can only leave the CUDA cache once they are freed.
+        gc.collect()
+        torch.cuda.empty_cache()
+        torch.cuda.reset_peak_memory_stats()
+
+    def _get_model(self) -> Any:
+        """Load the model in eval mode, after a cleanup pass."""
+        self.cleanup()
+        # tp_plan and device_map are mutually exclusive — TP uses its own placement.
+        placement = {"tp_plan": "auto"} if self.tp_size > 1 else {"device_map": 0}
+        model = AutoModelForCausalLM.from_pretrained(self.model_id, attn_implementation=self.attn_impl, **placement)
+        return model.eval()
+
+    def add_benchmark(
+        self,
+        data: list[list[int]],
+        max_new_tokens: int,
+        cb_config: ContinuousBatchingConfig,
+        gen_config: GenerationConfig | None = None,
+        label: str | None = None,
+        score_fn: Callable[[Any], float] | None = None,
+    ) -> BenchmarkEntry:
+        """Run one CB benchmark and record time, tokens, and peak memory.
+
+        Args:
+            data: Tokenized input sequences.
+            max_new_tokens: Generation length limit; overrides the value in `gen_config`.
+            cb_config: Continuous batching configuration for this run.
+            gen_config: Generation configuration; a default one is created when None. The object passed in is
+                left untouched.
+            label: Name of the entry; defaults to `bench_<index>`.
+            score_fn: Optional function mapping the generation outputs to an accuracy value.
+
+        Returns:
+            The recorded entry, also appended to `entries`. If generation or scoring raises, the exception is
+            stored in `entry.error` and the metrics not yet computed stay None.
+        """
+        import copy
+
+        # Work on a private copy so the max_new_tokens override does not leak into the caller's config.
+        gen_config = GenerationConfig() if gen_config is None else copy.deepcopy(gen_config)
+        gen_config.max_new_tokens = max_new_tokens
+
+        model = self._get_model()
+
+        avg_input = sum(len(x) for x in data) / max(len(data), 1)
+        entry = BenchmarkEntry(
+            label=label or f"bench_{len(self.entries)}",
+            num_samples=len(data),
+            avg_input_tokens=avg_input,
+            max_new_tokens=max_new_tokens,
+            cb_config=_config_summary(cb_config),
+            gen_config=_config_summary(gen_config),
+        )
+
+        print(f"\n[{entry.label}] samples={entry.num_samples} avg_in={avg_input:.1f} max_new={max_new_tokens}")
+
+        # Reset the peak-memory statistics right before generating so the reported peak belongs to this run.
+        self.cleanup()
+
+        outputs = None
+        try:
+            outputs = model.generate_batch(
+                inputs=data,
+                generation_config=gen_config,
+                continuous_batching_config=cb_config,
+                progress_bar=False,
+            )
+            # Time spans from the earliest request creation to the latest end of a request's lifespan.
+            gen_start = min(out.created_time for out in outputs.values())
+            gen_end = max(out.lifespan[1] for out in outputs.values())
+            gen_time = gen_end - gen_start
+            num_tokens = sum(len(out.generated_tokens) for out in outputs.values())
+
+            entry.time_seconds = gen_time
+            entry.num_tokens = num_tokens
+            entry.throughput_tok_per_sec = num_tokens / gen_time if gen_time > 0 else 0.0
+            entry.peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)
+            if score_fn is not None:
+                entry.accuracy = score_fn(outputs)
+            print(
+                f"   {gen_time:.2f}s, {num_tokens} tokens, "
+                f"{entry.throughput_tok_per_sec:.2f} tok/s, peak {entry.peak_memory_gb:.2f} GB"
+                + (f", acc {entry.accuracy:.3f}" if entry.accuracy is not None else "")
+            )
+        except Exception as e:
+            # Best-effort by design: record the failure so the remaining benchmarks still run. The exception type
+            # is included because str(e) alone can be empty or ambiguous.
+            entry.error = f"{type(e).__name__}: {e}"
+            print(f"   ERROR: {entry.error}")
+
+        self.entries.append(entry)
+        # Drop the last references to the model and its outputs so the cleanup below can actually release them.
+        del model, outputs
+        self.cleanup()
+        return entry
+
+    # Persistence
+    def save(self, name: str) -> Path:
+        """Save all entries to `RESULTS_DIR/<name>__<unix timestamp>.json` and return that path."""
+        RESULTS_DIR.mkdir(parents=True, exist_ok=True)
+        filename = RESULTS_DIR / f"{name}__{int(time.time())}.json"
+        payload = {
+            "model_id": self.model_id,
+            "attn_impl": self.attn_impl,
+            "tp_size": self.tp_size,
+            "entries": [asdict(e) for e in self.entries],
+        }
+        filename.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+        print(f"\nResults saved to {filename}")
+        return filename
+
+    @classmethod
+    def load_most_recent(cls, name: str) -> "BenchmarkResults":
+        """Load the most recent run saved under `name`.
+
+        Only files named exactly `<name>__<digits>.json` are considered, and the one with the largest timestamp
+        wins. Matching on the parsed stem (rather than a `<name>__*` glob sorted as text) keeps runs saved under
+        a longer name such as `<name>__other` from being picked up.
+
+        Raises:
+            FileNotFoundError: If no saved run matches `name`.
+        """
+        prefix = f"{name}__"
+        candidates: list[tuple[int, Path]] = []
+        for path in RESULTS_DIR.glob("*.json"):
+            stamp = path.stem[len(prefix) :]
+            if path.stem.startswith(prefix) and stamp.isdecimal():
+                candidates.append((int(stamp), path))
+        if not candidates:
+            raise FileNotFoundError(f"No baseline with name '{name}' in {RESULTS_DIR}")
+        _, latest = max(candidates, key=lambda candidate: candidate[0])
+
+        data = json.loads(latest.read_text(encoding="utf-8"))
+        instance = cls(
+            model_id=data.get("model_id"),
+            attn_impl=data.get("attn_impl"),
+            # Files written before tp_size was persisted do not have the key.
+            tp_size=data.get("tp_size", 1),
+        )
+        instance.entries = [BenchmarkEntry(**e) for e in data["entries"]]
+        print(f"Loaded baseline from {latest}")
+        return instance
+
+    # Display
+    def print_summary(self) -> None:
+        """Print one table row per entry; metrics that were not recorded show a placeholder."""
+        rows = [
+            {
+                "label": e.label,
+                "samples": e.num_samples,
+                "avg_in": f"{e.avg_input_tokens:.1f}",
+                "max_new": e.max_new_tokens,
+                "time (s)": _fmt(e.time_seconds, ".2f"),
+                "tokens": _fmt(e.num_tokens, "d"),
+                "tok/s": _fmt(e.throughput_tok_per_sec, ".2f", "ERROR"),
+                "mem (GB)": _fmt(e.peak_memory_gb, ".2f"),
+                "acc": _fmt(e.accuracy, ".3f", "-"),
+            }
+            for e in self.entries
+        ]
+        print("\n" + tabulate(rows, headers="keys", tablefmt="github"))
+
+    def compare_to(self, baseline: "BenchmarkResults") -> None:
+        """Print a side-by-side throughput comparison against a baseline run.
+
+        Entries are matched by label; labels missing from the baseline show "N/A".
+        """
+        base_tps = {e.label: e.throughput_tok_per_sec for e in baseline.entries}
+
+        def diff(cur: float | None, base: float | None) -> str:
+            # `not base` covers both a missing baseline and a zero throughput (which would divide by zero).
+            if cur is None or not base:
+                return "N/A"
+            return f"{(cur - base) / base * 100:+.1f}%"
+
+        rows = [
+            {
+                "label": e.label,
+                "baseline (tok/s)": _fmt(base_tps.get(e.label), ".2f", "N/A"),
+                "current (tok/s)": _fmt(e.throughput_tok_per_sec, ".2f", e.error or "N/A"),
+                "diff": diff(e.throughput_tok_per_sec, base_tps.get(e.label)),
+            }
+            for e in self.entries
+        ]
+        print(f"\nComparison against baseline (model={baseline.model_id}):")
+        print(tabulate(rows, headers="keys", tablefmt="github"))
+
+
+# Main
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--name", type=str, default=None, help="Name of the benchmark run (for saving).")
+    parser.add_argument("--compare-to", type=str, default=None, help="Name of a previous run to compare against.")
+    parser.add_argument("--model-id", type=str, default="meta-llama/Llama-3.1-8B-Instruct")
+    parser.add_argument("--attn", type=str, default="kernels-community/flash-attn3")
+    parser.add_argument("--tp-size", type=int, default=1, help="Tensor parallel size (1 = no TP).")
+    parser.add_argument(
+        "--rollouts-lengths",
+        "-rl",
+        type=int,
+        nargs="+",
+        default=None,
+        help="If this is specified, only the rollouts benchmarks run, with the given sizes (in tokens).",
+    )
+    cli_args = parser.parse_args()
+
+    results = BenchmarkResults(model_id=cli_args.model_id, attn_impl=cli_args.attn, tp_size=cli_args.tp_size)
+    tokenizer = AutoTokenizer.from_pretrained(cli_args.model_id, padding_side="left")
+
+    # Explicit rollout lengths restrict the suite to the rollouts benchmarks; otherwise everything runs.
+    rollouts_only = cli_args.rollouts_lengths is not None
+    rollout_sizes = cli_args.rollouts_lengths if rollouts_only else [1024, 2048, 4096, 8192, 16384]
+
+    if not rollouts_only:
+        # GSM8K benchmarks (256 max new tokens) — gsm8k_platinum dataset, 8-shot, lighteval extractive_match
+        gsm8k_data, gsm8k_score_fn = get_tokenized_gsm8k(tokenizer)
+
+        ## No options
+        results.add_benchmark(
+            data=gsm8k_data,
+            max_new_tokens=256,
+            cb_config=ContinuousBatchingConfig(),
+            gen_config=GenerationConfig(eos_token_id=-1),
+            label="gsm8k_default",
+            score_fn=gsm8k_score_fn,
+        )
+
+        ## With sampling. Recommended chat sampling (T=0.6, top_p=0.9), low enough that math reasoning isn't derailed
+        results.add_benchmark(
+            data=gsm8k_data,
+            max_new_tokens=256,
+            cb_config=ContinuousBatchingConfig(),
+            gen_config=GenerationConfig(eos_token_id=-1, do_sample=True, temperature=0.6, top_p=0.9),
+            label="gsm8k_sampling",
+            score_fn=gsm8k_score_fn,
+        )
+
+        ## With compile
+        results.add_benchmark(
+            data=gsm8k_data,
+            max_new_tokens=256,
+            cb_config=ContinuousBatchingConfig(use_default_compile_configs=True),
+            gen_config=GenerationConfig(eos_token_id=-1),
+            label="gsm8k_compile",
+            score_fn=gsm8k_score_fn,
+        )
+
+        ## No decode fast path
+        results.add_benchmark(
+            data=gsm8k_data,
+            max_new_tokens=256,
+            cb_config=ContinuousBatchingConfig(max_blocks_per_request=0),
+            gen_config=GenerationConfig(eos_token_id=-1),
+            label="gsm8k_no_fast_decode",
+            score_fn=gsm8k_score_fn,
+        )
+
+        ## Bare-bones CB config
+        results.add_benchmark(
+            data=gsm8k_data,
+            max_new_tokens=256,
+            cb_config=ContinuousBatchingConfig(
+                max_blocks_per_request=0, use_async_batching=False, use_cuda_graph=False
+            ),
+            gen_config=GenerationConfig(eos_token_id=-1),
+            label="gsm8k_bare_bones",
+            score_fn=gsm8k_score_fn,
+        )
+
+        # IFEval: 0-shot chat prompts; uses real EOS so instruction-following metrics see the model's natural stop.
+        ifeval_data, ifeval_score_fn = get_tokenized_ifeval(tokenizer)
+        results.add_benchmark(
+            data=ifeval_data,
+            max_new_tokens=1280,
+            cb_config=ContinuousBatchingConfig(),
+            label="ifeval_default",
+            score_fn=ifeval_score_fn,
+        )
+
+        # Raw benchmarks (various options)
+
+        ## Few blocks — tight cache pressure
+        results.add_benchmark(
+            data=get_random_data(batch_size=20, num_tokens=256),
+            max_new_tokens=256,
+            cb_config=ContinuousBatchingConfig(num_blocks=16),
+            gen_config=GenerationConfig(eos_token_id=-1),
+            label="few_blocks",
+        )
+
+        ## Multiple return sequences (sampling + parallel decoding)
+        results.add_benchmark(
+            data=get_random_data(batch_size=50, num_tokens=256),
+            max_new_tokens=256,
+            cb_config=ContinuousBatchingConfig(),
+            gen_config=GenerationConfig(eos_token_id=-1, do_sample=True, num_return_sequences=8),
+            label="multi_return_seq",
+        )
+
+    ## RL rollouts: small batch, growing generation lengths
+    for length in rollout_sizes:
+        results.add_benchmark(
+            data=get_random_data(batch_size=32, num_tokens=256),
+            max_new_tokens=length,
+            cb_config=ContinuousBatchingConfig(use_default_compile_configs=True),
+            gen_config=GenerationConfig(eos_token_id=-1),
+            label=f"rollouts_{length}",
+        )
+
+    # Post processing and display. Only on rank 0 in TP runs to avoid duplicate output / file writes.
+    is_rank_zero = not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
+    if is_rank_zero:
+        results.print_summary()
+        # Compare before saving, so that a run saved under the same name is not compared against itself.
+        if cli_args.compare_to:
+            # A missing baseline must not abort the script here: the fresh results would be lost before saving.
+            try:
+                baseline = BenchmarkResults.load_most_recent(cli_args.compare_to)
+            except FileNotFoundError as err:
+                print(f"\nSkipping comparison: {err}")
+            else:
+                results.compare_to(baseline=baseline)
+        if cli_args.name:
+            results.save(cli_args.name)