update at 2026-06-04 14:09:16

2026-06-04 14:09:16 +08:00
parent 41bd03123c
commit 4603914e85
11 changed files with 692 additions and 68 deletions
--- a/.claude/worktrees/agent-a02a887cdc0c41851
+++ b/.claude/worktrees/agent-a02a887cdc0c41851
--- a/app/detector.py
+++ b/app/detector.py
@@ -36,7 +36,79 @@ class DetrVehicleDetector:
        inputs = {key: value.to(self.device) for key, value in inputs.items()}
        outputs = self.model(**inputs)
-        # DETR 后处理需要原图尺寸，PIL size 是 (宽, 高)，这里转成 (高, 宽)。
+        return self._detections_from_outputs(image, outputs)
    @torch.no_grad()
    def inspect_tokens(self, frame_rgb: Any) -> dict[str, Any]:
        """Trace one RGB frame through the DETR pipeline and report what it produced.

        The frame is converted to a PIL image, preprocessed, pushed through the
        backbone and the input projection to obtain the flattened visual tokens,
        and then through the full model to obtain detections.

        Args:
            frame_rgb: Frame accepted by ``Image.fromarray``.

        Returns:
            A dict with the shapes of the intermediate tensors, the token grid
            size, a sample of at most 48 tokens and the vehicle detections.
            Each sampled token only carries its first 8 dimensions, and its
            ``magnitude`` is the norm of those 8 values, not of the full token.
        """
        image = Image.fromarray(frame_rgb)
        encoded = self.processor(images=image, return_tensors="pt")
        inputs = {name: tensor.to(self.device) for name, tensor in encoded.items()}
        pixel_values = inputs["pixel_values"]
        pixel_mask = inputs["pixel_mask"]

        # Walk the first stages by hand so the intermediate tensors are visible.
        detr = self.model.model
        features, object_queries_list = detr.backbone(pixel_values, pixel_mask)
        feature_map, _mask = features[-1]
        projected_feature_map = detr.input_projection(feature_map)
        tokens = projected_feature_map.flatten(2).permute(0, 2, 1)
        # Reported below as "position_encoding_shape".
        # NOTE(review): presumably these are the positional embeddings - confirm.
        object_queries = object_queries_list[-1].flatten(2).permute(0, 2, 1)

        # The complete model is run separately to get logits, boxes and hidden states.
        outputs = self.model(**inputs, output_hidden_states=True)
        results = self.processor.post_process_object_detection(
            outputs,
            # PIL reports (width, height); post-processing is given (height, width).
            target_sizes=torch.tensor([image.size[::-1]], device=self.device),
            threshold=self.confidence,
        )[0]

        token_rows, token_cols = (int(projected_feature_map.shape[axis]) for axis in (2, 3))
        total_tokens = int(tokens.shape[1])
        sample_count = min(48, total_tokens)
        sample_tokens = tokens[0, :sample_count, :8].detach().cpu()
        token_sequence = [
            {
                "index": index,
                "row": index // token_cols,
                "col": index % token_cols,
                "values": [round(float(value), 4) for value in vector.tolist()],
                "magnitude": round(float(vector.norm()), 4),
            }
            for index, vector in enumerate(sample_tokens)
        ]

        id2label = self.model.config.id2label
        detections = []
        for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
            label_name = id2label[label.item()]
            if label_name in self.vehicle_labels:
                x1, y1, x2, y2 = box.detach().cpu().numpy().astype(int).tolist()
                detections.append(
                    {
                        "label": label_name,
                        "score": round(float(score.detach().cpu()), 4),
                        "box": [x1, y1, x2, y2],
                    }
                )

        def shape_or_empty(tensor: Any) -> list[int]:
            # Optional model outputs are reported as [] when absent.
            return list(tensor.shape) if tensor is not None else []

        width, height = image.size[0], image.size[1]
        return {
            "image_size": {"width": width, "height": height},
            "pixel_values_shape": list(pixel_values.shape),
            "pixel_mask_shape": list(pixel_mask.shape),
            "feature_map_shape": list(feature_map.shape),
            "projected_feature_map_shape": list(projected_feature_map.shape),
            "visual_tokens_shape": list(tokens.shape),
            "position_encoding_shape": list(object_queries.shape),
            "encoder_last_hidden_state_shape": shape_or_empty(getattr(outputs, "encoder_last_hidden_state", None)),
            "decoder_last_hidden_state_shape": shape_or_empty(getattr(outputs, "last_hidden_state", None)),
            "logits_shape": list(outputs.logits.shape),
            "pred_boxes_shape": list(outputs.pred_boxes.shape),
            "token_grid": {"rows": token_rows, "cols": token_cols, "total": total_tokens, "shown": sample_count},
            "token_sequence": token_sequence,
            "detections": detections,
        }
    def _detections_from_outputs(self, image: Image.Image, outputs: Any) -> list[dict[str, Any]]:
        target_sizes = torch.tensor([image.size[::-1]], device=self.device)
        results = self.processor.post_process_object_detection(
            outputs,
@@ -60,3 +132,4 @@ class DetrVehicleDetector:
            )
        return detections
--- a/app/device_manager.py
+++ b/app/device_manager.py
@@ -33,11 +33,16 @@ class DeviceManager:
        if device_num not in {device.device_num for device in self.devices}:
            raise ValueError("设备不在 devicelist.env 中")
        with self.lock:
            old_device_num = self.current_device_num
            self.current_device_num = device_num
            self.current_url = ""
            self.timings = {}
            self.updated_at = time.time()
            self.version += 1
            print(
                f"[device-switch] manager set old={old_device_num} new={device_num} version={self.version}",
                flush=True,
            )
            return self.version
    def resolve_stream_url(self) -> str:
@@ -49,16 +54,38 @@ class DeviceManager:
                return self.fallback_url
            raise RuntimeError("devicelist.env 中没有可用设备号")
        print(f"[device-switch] resolve start device={device_num} version={version}", flush=True)
        try:
            result = self.api_client.get_stream_url_details(device_num)
        except Exception as exc:
            print(
                f"[device-switch] resolve failed device={device_num} version={version} error={exc}",
                flush=True,
            )
            raise
        with self.lock:
            # 避免旧摄像头的慢接口响应覆盖用户刚切换的新选择。
            if version != self.version or device_num != self.current_device_num:
                print(
                    f"[device-switch] resolve stale device={device_num} version={version} current={self.current_device_num} current_version={self.version}",
                    flush=True,
                )
                return self.current_url
            self.current_url = result.url
            self.timings = dict(result.timings)
            self.updated_at = time.time()
        print(f"[device-switch] resolve success device={device_num} version={version}", flush=True)
        return result.url
    def resolve_stream_url_for(self, device_num: str) -> str:
        """Fetch the stream URL of a specific configured device.

        This method only reads ``self.devices`` and calls the API client; it
        does not modify the currently selected device or the cached URL.

        Args:
            device_num: Device number that must appear in ``self.devices``.

        Returns:
            The ``url`` attribute of the API client's result.

        Raises:
            ValueError: If ``device_num`` is not one of the configured devices.
        """
        is_known = any(device.device_num == device_num for device in self.devices)
        if not is_known:
            raise ValueError("设备不在 devicelist.env 中")
        return self.api_client.get_stream_url_details(device_num).url
    def get_video_grid_devices(self, limit: int = 4) -> list[Device]:
        """Return the first ``limit`` configured devices for the video grid.

        Args:
            limit: Maximum number of devices to return. Values below zero are
                treated as zero; a plain slice would otherwise interpret a
                negative limit as "everything except the last N devices".

        Returns:
            A new list with at most ``limit`` devices, in configuration order.
        """
        return self.devices[: max(limit, 0)]
    def get_snapshot(self) -> dict[str, Any]:
        with self.lock:
            return {
--- a/app/main.py
+++ b/app/main.py
@@ -45,6 +45,18 @@ worker = StreamWorker(
    resize_width=settings.resize_width,
 )
 # Devices shown in the multi-camera grid (default limit of get_video_grid_devices()).
 video_grid_devices = device_manager.get_video_grid_devices()
 # One StreamWorker per grid device, keyed by its device number.
 video_grid_workers = {
    device.device_num: StreamWorker(
        # device_num is bound as a lambda default so every worker keeps its own
        # device; a plain closure over `device` would see the comprehension's
        # last value when the URL is resolved later.
        stream_url=lambda device_num=device.device_num: device_manager.resolve_stream_url_for(device_num),
        # All grid workers share the single module-level detector instance.
        detector=detector,
        frame_skip=settings.frame_skip,
        jpeg_quality=settings.jpeg_quality,
        resize_width=settings.resize_width,
    )
    for device in video_grid_devices
 }
 app = FastAPI(title="DETR 动态打标")
 app.mount("/static", StaticFiles(directory="app/static"), name="static")
 templates = Jinja2Templates(directory="app/templates")
@@ -57,11 +69,15 @@ def display_model_name(model_name: str) -> str:
@app.on_event("startup")
 def startup() -> None:
    worker.start()
    for grid_worker in video_grid_workers.values():
        grid_worker.start()
@app.on_event("shutdown")
 def shutdown() -> None:
    worker.stop()
    for grid_worker in video_grid_workers.values():
        grid_worker.stop()
@app.get("/", response_class=HTMLResponse)
@@ -73,15 +89,67 @@ def index(request: Request) -> HTMLResponse:
            "model": display_model_name(settings.detr_model),
            "device": detector.device_name,
            "stream_url": f"设备号：{device_manager.get_snapshot()['current_device_num']}",
            "video_grid_devices": video_grid_devices,
        },
    )
@app.get("/tokenizer", response_class=HTMLResponse)
 def tokenizer(request: Request) -> HTMLResponse:
    return templates.TemplateResponse(
        "tokenizer.html",
        {
            "request": request,
            "model": display_model_name(settings.detr_model),
            "device": detector.device_name,
        },
    )
@app.get("/tokenizer/state")
 def tokenizer_state() -> JSONResponse:
    snapshot = worker.get_snapshot()
    frame = worker.get_frame_rgb()
    if frame is None:
        return JSONResponse(
            {
                "ready": False,
                "frame_id": snapshot["frame_id"],
                "connected": snapshot["connected"],
                "error": snapshot["error"] or "等待视频帧",
            }
        )
    data = detector.inspect_tokens(frame)
    data.update(
        {
            "ready": True,
            "frame_id": snapshot["frame_id"],
            "updated_at": snapshot["updated_at"],
            "connected": snapshot["connected"],
            "error": snapshot["error"],
        }
    )
    return JSONResponse(data)
@app.get("/video")
 def video() -> StreamingResponse:
    return stream_video(worker)
@app.get("/video/{device_num}")
 def video_device(device_num: str) -> StreamingResponse:
    grid_worker = video_grid_workers.get(device_num)
    if grid_worker is None:
        raise HTTPException(status_code=404, detail="设备不在视频网格中")
    return stream_video(grid_worker)
 def stream_video(stream_worker: StreamWorker) -> StreamingResponse:
    async def generate():
        while True:
-            frame = worker.get_jpeg()
+            frame = stream_worker.get_jpeg()
            if frame is None:
                await asyncio.sleep(0.1)
                continue
@@ -137,11 +205,13 @@ def status() -> JSONResponse:
@app.post("/devices/{device_num}")
 def switch_device(device_num: str) -> JSONResponse:
    try:
-        device_manager.set_current_device(device_num)
+        version = device_manager.set_current_device(device_num)
    except ValueError as exc:
        print(f"[device-switch] invalid device={device_num}", flush=True)
        raise HTTPException(status_code=404, detail=str(exc)) from exc
    worker.reconnect()
-    return JSONResponse({"current_device_num": device_num})
+    print(f"[device-switch] accepted device={device_num} version={version}", flush=True)
    return JSONResponse({"current_device_num": device_num, "version": version})
@app.websocket("/ws/detections")
--- a/app/static/app.js
+++ b/app/static/app.js
@@ -13,7 +13,10 @@ const timingFrameEl = document.querySelector("#timing-frame");
 let selectedDevice = "";
 let pendingDevice = "";
 let queuedDevice = "";
 let switching = false;
 let devicesSignature = "";
 let lastWsSignature = "";
 function setConnection(online, text) {
  connection.textContent = text;
@@ -84,16 +87,54 @@ function renderDetections(detections) {
    .join("");
 }
-async function switchDevice(deviceNum) {
+async function performSwitch(deviceNum) {
  // Sends the POST that switches the active camera and, on success, reloads
  // the main video element and every grid video element.
  // While the request is in flight `switching` is true, so switchDevice()
  // queues further requests instead of starting a second one.
  switching = true;
  pendingDevice = deviceNum;
  devicesSignature = "";
  setConnection(false, "切换中");
  console.log("[device-switch] start", { deviceNum });
  try {
    const response = await fetch(`/devices/${encodeURIComponent(deviceNum)}`, { method: "POST" });
    if (!response.ok) {
      throw new Error("切换摄像头失败");
    }
    const result = await response.json();
    const video = document.querySelector("#video");
    if (video) {
      // A timestamp query is appended; presumably to make the browser reopen the stream.
      video.src = `/video?t=${Date.now()}`;
    }
    document.querySelectorAll(".grid-video").forEach((item) => {
      // data-src holds the element's base stream URL without the timestamp.
      item.src = `${item.dataset.src}?t=${Date.now()}`;
    });
    console.log("[device-switch] requested", { deviceNum, version: result.version });
  } catch (error) {
    // Any failure (network, non-2xx, bad JSON) clears the pending state.
    pendingDevice = "";
    devicesSignature = "";
    setConnection(false, "切换失败");
    console.error("[device-switch] failed", { deviceNum, error });
  } finally {
    switching = false;
    // If a different device was queued while this request ran, chain into it.
    // NOTE(review): a `return` inside `finally` overrides whatever try/catch
    // completed with; here it makes the returned promise follow the queued switch.
    if (queuedDevice && queuedDevice !== deviceNum) {
      const nextDevice = queuedDevice;
      queuedDevice = "";
      return performSwitch(nextDevice);
    }
    queuedDevice = "";
  }
 }
 function switchDevice(deviceNum) {
  // Entry point for a camera change: start the switch right away when idle,
  // otherwise remember only the most recent request and resolve immediately.
  if (!switching) {
    return performSwitch(deviceNum);
  }
  queuedDevice = deviceNum;
  pendingDevice = deviceNum;
  devicesSignature = "";
  setConnection(false, "等待切换");
  console.log("[device-switch] queued", { deviceNum });
  return Promise.resolve();
 }
 function connectWebSocket() {
@@ -109,6 +150,18 @@ function connectWebSocket() {
    errorEl.textContent = data.error || (data.connected ? "正常" : "未连接");
    sourceEl.textContent = data.source || "-";
    setConnection(Boolean(data.connected), data.connected ? "已连接" : "重连中");
    const wsSignature = `${data.current_device_num}|${data.connected}|${data.frame_id}|${data.error || ""}`;
    if (wsSignature !== lastWsSignature) {
      lastWsSignature = wsSignature;
      console.log("[device-switch] ws", {
        currentDeviceNum: data.current_device_num,
        pendingDevice,
        queuedDevice,
        connected: data.connected,
        frameId: data.frame_id,
        error: data.error,
      });
    }
    if (pendingDevice && data.current_device_num === pendingDevice) {
      pendingDevice = "";
      deviceSelect.disabled = false;
--- a/app/static/style.css
+++ b/app/static/style.css
@@ -73,6 +73,30 @@ p {
  color: var(--red);
 }
 .topbar-actions {
  display: flex;
  align-items: center;
  gap: 12px;
 }
 .button-link {
  display: inline-flex;
  align-items: center;
  justify-content: center;
  min-height: 36px;
  padding: 8px 14px;
  border: 1px solid var(--line);
  border-radius: 999px;
  color: var(--text);
  text-decoration: none;
  background: var(--panel);
 }
 .button-link:hover {
  border-color: var(--green);
  color: var(--green);
 }
 .layout {
  display: grid;
  grid-template-columns: minmax(0, 1fr) 360px;
@@ -92,33 +116,6 @@ p {
  overflow: hidden;
 }
 .pipeline {
  display: flex;
  align-items: center;
  gap: 10px;
  padding: 14px;
  border-bottom: 1px solid var(--line);
  overflow-x: auto;
 }
 .stage {
  flex: 0 0 auto;
  padding: 9px 12px;
  border: 1px solid var(--line);
  border-radius: 10px;
  color: var(--muted);
  background: var(--panel-2);
 }
 .stage.active {
  border-color: rgba(46, 232, 135, 0.5);
  color: var(--green);
 }
 .arrow {
  color: var(--muted);
 }
 .video-wrap {
  display: grid;
  place-items: center;
@@ -126,7 +123,8 @@ p {
  background: #05070b;
 }
-#video {
+#video,
 .grid-video {
  display: block;
  width: 100%;
  height: auto;
@@ -134,6 +132,36 @@ p {
  object-fit: contain;
 }
 .video-grid {
  display: grid;
  grid-template-columns: repeat(2, minmax(0, 1fr));
  gap: 14px;
  padding: 14px;
  background: #05070b;
 }
 .video-grid-item {
  overflow: hidden;
  border: 1px solid var(--line);
  border-radius: 14px;
  background: var(--panel-2);
 }
 .video-grid-title {
  padding: 10px 12px;
  border-bottom: 1px solid var(--line);
  color: var(--muted);
  font-size: 13px;
 }
 .video-grid-wrap {
  min-height: 240px;
 }
 .grid-video {
  max-height: calc((100vh - 260px) / 2);
 }
 .side-card {
  display: flex;
  flex-direction: column;
@@ -166,13 +194,19 @@ p {
  background: var(--panel-2);
 }
 .detections-panel {
  padding: 16px;
  border-top: 1px solid var(--line);
 }
 .detections {
-  display: flex;
+  display: grid;
-  flex-direction: column;
+  grid-template-columns: repeat(4, minmax(0, 1fr));
  gap: 10px;
 }
 .detections.empty {
  display: block;
  color: var(--muted);
 }
@@ -197,6 +231,133 @@ p {
  font-size: 12px;
 }
 .tokenizer-page .tokenizer-layout {
  display: grid;
  grid-template-columns: minmax(0, 1fr) minmax(0, 2fr);
  grid-template-areas: "flow side";
  align-items: start;
  width: 100%;
  gap: 18px;
  padding: 18px;
 }
 .tokenizer-page .tokenizer-side {
  display: grid;
  grid-area: side;
  min-width: 0;
  gap: 18px;
 }
 .tokenizer-page .tokenizer-flow-card {
  grid-area: flow;
  min-width: 0;
  min-height: calc(100vh - 122px);
 }
 .tokenizer-page .tokenizer-side .detections {
  grid-template-columns: repeat(3, minmax(0, 1fr));
 }
 .tokenizer-card {
  border: 1px solid var(--line);
  border-radius: 18px;
  padding: 18px;
  background: rgba(21, 27, 38, 0.9);
  box-shadow: 0 18px 40px rgba(0, 0, 0, 0.28);
 }
 .pipeline-steps {
  display: grid;
  gap: 10px;
 }
 .pipeline-step {
  display: grid;
  grid-template-columns: 34px minmax(0, 1fr);
  gap: 10px;
  align-items: start;
  padding: 12px;
  border: 1px solid var(--line);
  border-radius: 12px;
  background: var(--panel-2);
 }
 .step-index {
  display: grid;
  place-items: center;
  width: 28px;
  height: 28px;
  border-radius: 999px;
  color: #06100b;
  font-weight: 700;
  background: var(--green);
 }
 .step-title {
  margin-bottom: 5px;
  font-weight: 700;
 }
 .step-value,
 .token-summary,
 .selected-token {
  color: var(--muted);
  font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
  font-size: 13px;
  word-break: break-all;
 }
 .token-sequence {
  display: grid;
  grid-template-columns: repeat(12, minmax(0, 1fr));
  gap: 8px;
  margin-top: 14px;
 }
 .token-cell {
  min-height: 50px;
  border: 1px solid var(--line);
  border-radius: 10px;
  color: var(--text);
  cursor: pointer;
  background: var(--panel-2);
 }
 .token-cell span,
 .token-cell small {
  display: block;
 }
 .token-cell small {
  margin-top: 3px;
  color: var(--muted);
 }
 .token-cell.selected,
 .token-cell:hover {
  border-color: var(--green);
  color: var(--green);
 }
 .token-detail-title {
  margin-bottom: 10px;
  color: var(--green);
  font-weight: 700;
 }
 .token-vector {
  padding: 12px;
  border: 1px solid var(--line);
  border-radius: 12px;
  background: var(--panel-2);
 }
@media (max-width: 1280px) {
  .detections {
    grid-template-columns: repeat(4, minmax(0, 1fr));
  }
 }
@media (max-width: 980px) {
  .layout {
    grid-template-columns: 1fr;
@@ -206,4 +367,18 @@ p {
    align-items: flex-start;
    flex-direction: column;
  }
  .detections {
    grid-template-columns: repeat(3, minmax(0, 1fr));
  }
 }
@media (max-width: 640px) {
  .video-grid {
    grid-template-columns: 1fr;
  }
  .detections {
    grid-template-columns: 1fr;
  }
 }
--- a/app/static/tokenizer.js
+++ b/app/static/tokenizer.js
@@ -0,0 +1,131 @@
 // DOM handles for the tokenizer page, looked up once when the script loads.
 const statusEl = document.querySelector("#tokenizer-status");
 const pipelineEl = document.querySelector("#pipeline-steps");
 const tokenSummaryEl = document.querySelector("#token-summary");
 const tokenSequenceEl = document.querySelector("#token-sequence");
 const selectedTokenEl = document.querySelector("#selected-token");
 const detectionsEl = document.querySelector("#tokenizer-detections");
 // Index of the token shown in the detail panel; null until one is chosen or rendered.
 let selectedTokenIndex = null;
 function formatShape(shape) {
  // Formats a shape array as "[a, b, c]"; a missing or empty shape becomes "-".
  return shape?.length ? `[${shape.join(", ")}]` : "-";
 }
 function setStatus(ready, text) {
  // Updates the status badge text and flips its online/offline classes.
  const { classList } = statusEl;
  statusEl.textContent = text;
  classList.toggle("online", ready);
  classList.toggle("offline", !ready);
 }
 function renderPipeline(data) {
  // Renders the 11 pipeline steps with the shapes reported by /tokenizer/state.
  // The first two steps show the same frame size, so it is formatted once.
  const frameSize = `${data.image_size?.width ?? "-"} × ${data.image_size?.height ?? "-"}`;
  const steps = [
    ["OpenCV RGB 帧", frameSize],
    ["PIL Image", frameSize],
    ["DetrImageProcessor", `pixel_values ${formatShape(data.pixel_values_shape)} / pixel_mask ${formatShape(data.pixel_mask_shape)}`],
    ["ResNet-50 backbone", `feature map ${formatShape(data.feature_map_shape)}`],
    ["1×1 convolution", `projected ${formatShape(data.projected_feature_map_shape)}`],
    ["视觉 token embedding", `由 projected feature map flatten 得到 ${formatShape(data.visual_tokens_shape)}`],
    ["位置 embedding", `二维位置 embedding ${formatShape(data.position_encoding_shape)}`],
    ["Transformer Encoder", formatShape(data.encoder_last_hidden_state_shape)],
    ["Object query embedding + Decoder", `object query embedding 解码后 ${formatShape(data.decoder_last_hidden_state_shape)}`],
    ["类别 logits + boxes", `logits ${formatShape(data.logits_shape)} / boxes ${formatShape(data.pred_boxes_shape)}`],
    ["post_process_object_detection", `检测结果 ${data.detections?.length ?? 0} 个`],
  ];
  // One numbered card per step; numbering is 1-based.
  const toStepHtml = ([title, value], index) => `
      <div class="pipeline-step">
        <div class="step-index">${index + 1}</div>
        <div>
          <div class="step-title">${title}</div>
          <div class="step-value">${value}</div>
        </div>
      </div>
    `;
  pipelineEl.innerHTML = steps.map(toStepHtml).join("");
 }
 function renderTokens(data) {
  // Renders the token summary line, the detail panel and the clickable token grid.
  const grid = data.token_grid || {};
  tokenSummaryEl.textContent = `帧号 ${data.frame_id ?? "-"} · token 网格 ${grid.rows ?? "-"} × ${grid.cols ?? "-"}，总数 ${grid.total ?? "-"}，展示前 ${grid.shown ?? 0} 个 token，每个显示前 8 维采样。`;
  // Render the detail panel first: it settles selectedTokenIndex (falling back
  // to the first token), so the grid built below highlights the same token
  // that the panel shows, including on the very first render.
  renderSelectedToken(data);
  tokenSequenceEl.innerHTML = (data.token_sequence || [])
    .map((token) => `
      <button class="token-cell ${token.index === selectedTokenIndex ? "selected" : ""}" data-index="${token.index}">
        <span>#${token.index}</span>
        <small>(${token.row}, ${token.col})</small>
      </button>
    `)
    .join("");
  tokenSequenceEl.querySelectorAll(".token-cell").forEach((button) => {
    button.addEventListener("click", () => {
      selectedTokenIndex = Number(button.dataset.index);
      // A single re-render refreshes both the highlight and the detail panel.
      renderTokens(data);
    });
  });
 }
 function renderSelectedToken(data) {
  // Shows the sampled vector of the selected token, falling back to the first
  // token when the selection is not part of the current sequence.
  const tokens = data.token_sequence || [];
  const match = tokens.find(({ index }) => index === selectedTokenIndex);
  const token = match || tokens[0];
  if (!token) {
    selectedTokenEl.textContent = "暂无 token。";
    return;
  }
  // Remember which token is displayed so later renders keep the same one.
  selectedTokenIndex = token.index;
  const vectorText = token.values.map((value) => Number(value).toFixed(4)).join(", ");
  selectedTokenEl.innerHTML = `
    <div class="token-detail-title">Token #${token.index} · 网格位置 (${token.row}, ${token.col}) · L2 ${token.magnitude}</div>
    <div class="token-vector">[${vectorText}, ...]</div>
  `;
 }
 function renderDetections(detections) {
  // Renders one card per detection, or a placeholder when the list is empty.
  const hasDetections = detections.length > 0;
  detectionsEl.className = hasDetections ? "detections" : "detections empty";
  if (!hasDetections) {
    detectionsEl.textContent = "暂无目标";
    return;
  }
  // The score is shown as a percentage with one decimal place.
  const toItemHtml = (det) => `
      <div class="det-item">
        <div class="det-title">
          <span>${det.label}</span>
          <span>${(det.score * 100).toFixed(1)}%</span>
        </div>
        <div class="det-box">box: [${det.box.join(", ")}]</div>
      </div>
    `;
  detectionsEl.innerHTML = detections.map(toItemHtml).join("");
 }
 async function refreshTokenizer() {
  // Polls /tokenizer/state and re-renders the page. The next poll is scheduled
  // from `finally`, so requests never overlap and polling survives failures.
  try {
    const response = await fetch(`/tokenizer/state?t=${Date.now()}`);
    // An HTTP error may still carry a JSON body; without this check it would
    // be parsed and reported as "waiting for a frame" instead of a failure.
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }
    const data = await response.json();
    if (!data.ready) {
      setStatus(false, data.error || "等待帧");
      tokenSummaryEl.textContent = data.error || "等待视频帧";
      return;
    }
    setStatus(Boolean(data.connected), data.connected ? "动态更新中" : "未连接");
    renderPipeline(data);
    renderTokens(data);
    renderDetections(data.detections || []);
  } catch (error) {
    setStatus(false, "更新失败");
    tokenSummaryEl.textContent = `更新失败：${error}`;
  } finally {
    setTimeout(refreshTokenizer, 1200);
  }
 }
 // Start polling as soon as the script loads.
 refreshTokenizer();
--- a/app/stream_worker.py
+++ b/app/stream_worker.py
@@ -26,6 +26,7 @@ class StreamWorker:
        self.lock = threading.Lock()
        self.latest_jpeg: bytes | None = None
        self.latest_frame_rgb: Any | None = None
        self.latest_detections: list[dict[str, Any]] = []
        self.frame_id = 0
        self.updated_at = 0.0
@@ -56,21 +57,30 @@ class StreamWorker:
    def reconnect(self) -> None:
        """Request a reconnect and clear all published frame state.

        Under the lock this bumps ``reconnect_version``, raises the
        ``reconnect_requested`` flag, marks the worker as disconnected with a
        "switching source" message, and resets the cached frame, detections,
        frame counter, fps and timing values. The new version is logged after
        the lock has been released.
        """
        with self.lock:
            self.reconnect_version += 1
            version = self.reconnect_version
            self.reconnect_requested = True
            self.connected = False
            self.error = "正在切换视频源"
            # Drop everything that was published for the previous source.
            self.latest_jpeg = self.latest_frame_rgb = None
            self.latest_detections = []
            self.frame_id = 0
            self.fps = self.resolve_ms = self.open_ms = self.first_frame_ms = 0.0
        print(f"[device-switch] worker reconnect requested version={version}", flush=True)
    def get_jpeg(self) -> bytes | None:
        """Return the most recently published JPEG bytes, or None if there are none."""
        with self.lock:
            jpeg = self.latest_jpeg
        return jpeg
    def get_frame_rgb(self) -> Any | None:
        """Return a copy of the latest RGB frame, or None when no frame is stored.

        The copy is taken while the lock is held, so the caller gets its own
        object rather than the one stored on the worker.
        """
        with self.lock:
            frame = self.latest_frame_rgb
            return None if frame is None else frame.copy()
    def get_snapshot(self) -> dict[str, Any]:
        with self.lock:
            return {
@@ -100,16 +110,29 @@ class StreamWorker:
                run_version = self.reconnect_version
                self.reconnect_requested = False
            if should_reconnect:
                print(f"[device-switch] worker reconnect handling version={run_version}", flush=True)
                # 切换摄像头时必须释放旧连接，否则 OpenCV 会继续阻塞读旧流。
                if cap is not None:
                    cap.release()
                    cap = None
                    print(f"[device-switch] worker released old capture version={run_version}", flush=True)
            if cap is None or not cap.isOpened():
                started = time.monotonic()
                try:
                    stream_url = self.stream_url() if callable(self.stream_url) else self.stream_url
                except Exception as exc:
                    resolve_ms = round((time.monotonic() - started) * 1000, 2)
                    with self.lock:
                        self.resolve_ms = resolve_ms
                        self.open_ms = 0.0
                        self.first_frame_ms = 0.0
                    self._set_connection_state(False, f"获取播放地址失败：{exc}，2 秒后重试")
                    time.sleep(2)
                    continue
                resolve_ms = round((time.monotonic() - started) * 1000, 2)
                started = time.monotonic()
                print(f"[device-switch] worker open start version={run_version}", flush=True)
                cap = cv2.VideoCapture(stream_url)
                open_ms = round((time.monotonic() - started) * 1000, 2)
                with self.lock:
@@ -117,11 +140,13 @@ class StreamWorker:
                    self.resolve_ms = resolve_ms
                    self.first_frame_ms = 0.0
                if not cap.isOpened():
                    print(f"[device-switch] worker open failed version={run_version} open_ms={open_ms}", flush=True)
                    self._set_connection_state(False, "无法打开视频流，2 秒后重试")
                    cap.release()
                    cap = None
                    time.sleep(2)
                    continue
                print(f"[device-switch] worker open success version={run_version} open_ms={open_ms}", flush=True)
                self._set_connection_state(True, "已连接")
            started = time.monotonic()
@@ -143,9 +168,9 @@ class StreamWorker:
            frame = self._resize(frame)
            self.frame_id += 1
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            if self.frame_id % self.frame_skip == 0:
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                last_detections = self.detector.detect(frame_rgb)
            annotated = self._draw(frame, last_detections)
@@ -173,6 +198,7 @@ class StreamWorker:
            with self.lock:
                self.latest_jpeg = jpeg.tobytes()
                self.latest_frame_rgb = frame_rgb.copy()
                self.latest_detections = list(last_detections)
                self.updated_at = time.time()
                self.connected = True
--- a/app/templates/index.html
+++ b/app/templates/index.html
@@ -9,26 +9,30 @@
  <body>
    <header class="topbar">
      <div>
-        <h1>DETR 动态打标</h1>
+        <!-- <h1>DETR 动态打标</h1> -->
-        <p>使用 Python、OpenCV、PyTorch、Transformers DETR 和 FastAPI，Mac mini m2 上运行。</p>
+        <p>DETR动态打标：Python、OpenCV、PyTorch、Transformers DETR 和 FastAPI。</p>
      </div>
      <div class="topbar-actions">
        <a class="button-link" href="/tokenizer">tokenizer</a>
        <div class="badge" id="connection">连接中</div>
      </div>
    </header>
    <main class="layout">
      <section class="video-card">
-        <div class="pipeline">
+        <div class="video-grid">
-          <div class="stage active">源节点</div>
+          {% for device_item in video_grid_devices %}
-          <div class="arrow">→</div>
+            <article class="video-grid-item">
-          <div class="stage active">DETR 推理</div>
+              <div class="video-grid-title">{{ device_item.name }} · {{ device_item.device_num }}</div>
-          <div class="arrow">→</div>
+              <div class="video-wrap video-grid-wrap">
-          <div class="stage active">OSD 打标</div>
+                <img class="grid-video" src="/video/{{ device_item.device_num | urlencode }}" data-src="/video/{{ device_item.device_num | urlencode }}" alt="{{ device_item.name }} 视频流" />
          <div class="arrow">→</div>
          <div class="stage active">FastAPI 输出</div>
              </div>
-        <div class="video-wrap">
+            </article>
-          <img id="video" src="/video" alt="动态打标视频流" />
+          {% endfor %}
        </div>
        <section class="detections-panel">
          <div id="detections" class="detections empty">暂无目标</div>
        </section>
      </section>
      <aside class="side-card">
@@ -69,11 +73,6 @@
            <dd id="timing-frame">-</dd>
          </dl>
        </section>
        <section>
          <h2>检测结果</h2>
          <div id="detections" class="detections empty">暂无目标</div>
        </section>
      </aside>
    </main>
--- a/app/templates/tokenizer.html
+++ b/app/templates/tokenizer.html
@@ -0,0 +1,48 @@
 <!doctype html>
 <html lang="zh-CN">
  <head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <title>DETR Tokenizer 动态可视化</title>
    <link rel="stylesheet" href="/static/style.css?v=tokenizer-layout-1-3" />
  </head>
  <body class="tokenizer-page">
    <header class="topbar">
      <div>
        <!--<h1>DETR Tokenizer 动态可视化</h1>-->
        <p>实时展示当前视频帧从图像预处理、视觉特征、token 序列到检测输出的过程。</p>
      </div>
      <div class="topbar-actions">
        <a class="button-link" href="/">返回视频</a>
        <div class="badge" id="tokenizer-status">等待帧</div>
      </div>
    </header>
    <main class="tokenizer-layout">
      <section class="tokenizer-card tokenizer-flow-card">
        <h2>实时流程</h2>
        <div class="pipeline-steps" id="pipeline-steps"></div>
      </section>
      <aside class="tokenizer-side">
        <section class="tokenizer-card">
          <h2>选中 Token</h2>
          <div class="selected-token" id="selected-token">点击下方 token 查看向量采样。</div>
        </section>
        <section class="tokenizer-card">
          <h2>Token 序列</h2>
          <div class="token-summary" id="token-summary">等待视频帧</div>
          <div class="token-sequence" id="token-sequence"></div>
        </section>
        <section class="tokenizer-card">
          <h2>检测输出</h2>
          <div id="tokenizer-detections" class="detections empty">暂无目标</div>
        </section>
      </aside>
    </main>
    <script src="/static/tokenizer.js"></script>
  </body>
 </html>
--- a/tokenizer.md
+++ b/tokenizer.md
@@ -1,3 +1,4 @@
 # DETR 的视觉 token 化过程说明
 本文基于当前项目代码 `app/detector.py` 中的实现说明 DETR 的“token 化”过程。
@@ -58,19 +59,39 @@ projected feature map: [batch, 256, H', W']
  ↓
 flatten 空间维度 H' × W'
  ↓
-visual tokens: [batch, H'×W', 256]
+visual token embeddings: [batch, H'×W', 256]
  ↓
-加入二维位置编码
+加入二维位置 embedding
  ↓
 Transformer Encoder
  ↓
-Object Queries + Transformer Decoder
+Object query embeddings + Transformer Decoder
  ↓
 类别 logits + 边界框 boxes
  ↓
 post_process_object_detection 还原到原图坐标
 ```
 ## Embedding 在 1-11 个环节中的位置
 在这个 DETR 流程里，embedding 不是单独只有一步，而是出现在 3 个关键环节：
 | 页面步骤 | 名称 | embedding 含义 |
 | --- | --- | --- |
 | 第 6 步 | visual token embedding | `projected feature map` 经过 flatten 后，每个空间网格点变成一个 256 维视觉 token embedding。 |
 | 第 7 步 | position embedding | 给每个视觉 token 加入二维位置 embedding，让 Transformer 知道 token 原本在图像中的位置。 |
 | 第 9 步 | object query embedding | DETR 使用一组可学习的 object query embeddings 进入 Decoder，每个 query 最终预测一个候选目标。 |
 所以如果问“embedding 在 1-11 哪个环节”，最核心的是：
 ```text
 第 6 步：产生视觉 token embedding
 第 7 步：加入位置 embedding
 第 9 步：object query embedding 进入 Decoder
 ```
 第 6 步是图像内容 embedding，第 7 步是空间位置 embedding，第 9 步是检测目标查询 embedding。注意：这里的步骤编号对应 tokenizer 页面上的 11 步流程，与本文各小节标题中的“第 N 步”编号并不相同。
 ## 第 1 步：图像预处理
 代码：
@@ -212,7 +233,7 @@ x = x.permute(2, 0, 1)    # [h*w, batch, 256]
 两种写法本质相同，只是 Transformer 接口期望的维度顺序不同。
-## 第 5 步：加入二维位置编码
+## 第 5 步：加入二维位置 embedding
 Transformer 本身不理解图像中的二维空间位置。
@@ -264,11 +285,11 @@ Encoder 会通过 self-attention 建模图像中不同区域之间的关系。
 - 道路区域可以影响车辆判断。
 - 远处小目标可以和周围上下文一起被理解。
-## 第 7 步：Object Queries 和 Transformer Decoder
+## 第 7 步：Object query embedding 和 Transformer Decoder
 DETR 与传统检测器不同，它不是先生成大量 anchor box。
-它使用一组可学习的 object queries。常见数量是：
+它使用一组可学习的 object query embeddings。常见数量是：
 ```text
 100 个 object queries