From 4603914e85d8dd722d471c0973a28e55925b9bf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=88=E8=B5=A3?= Date: Thu, 4 Jun 2026 14:09:16 +0800 Subject: [PATCH] update at 2026-06-04 14:09:16 --- .claude/worktrees/agent-a02a887cdc0c41851 | 1 + app/detector.py | 75 ++++++- app/device_manager.py | 29 ++- app/main.py | 76 ++++++- app/static/app.js | 65 +++++- app/static/style.css | 235 +++++++++++++++++++--- app/static/tokenizer.js | 131 ++++++++++++ app/stream_worker.py | 30 ++- app/templates/index.html | 37 ++-- app/templates/tokenizer.html | 48 +++++ tokenizer.md | 33 ++- 11 files changed, 692 insertions(+), 68 deletions(-) create mode 160000 .claude/worktrees/agent-a02a887cdc0c41851 create mode 100644 app/static/tokenizer.js create mode 100644 app/templates/tokenizer.html diff --git a/.claude/worktrees/agent-a02a887cdc0c41851 b/.claude/worktrees/agent-a02a887cdc0c41851 new file mode 160000 index 0000000..41bd031 --- /dev/null +++ b/.claude/worktrees/agent-a02a887cdc0c41851 @@ -0,0 +1 @@ +Subproject commit 41bd03123c40744f76ca4c5fbf9ac05b98b98606 diff --git a/app/detector.py b/app/detector.py index b56ef48..876a6c7 100644 --- a/app/detector.py +++ b/app/detector.py @@ -36,7 +36,79 @@ class DetrVehicleDetector: inputs = {key: value.to(self.device) for key, value in inputs.items()} outputs = self.model(**inputs) - # DETR 后处理需要原图尺寸,PIL size 是 (宽, 高),这里转成 (高, 宽)。 + return self._detections_from_outputs(image, outputs) + + @torch.no_grad() + def inspect_tokens(self, frame_rgb: Any) -> dict[str, Any]: + image = Image.fromarray(frame_rgb) + inputs = self.processor(images=image, return_tensors="pt") + inputs = {key: value.to(self.device) for key, value in inputs.items()} + + features, object_queries_list = self.model.model.backbone(inputs["pixel_values"], inputs["pixel_mask"]) + feature_map, mask = features[-1] + projected_feature_map = self.model.model.input_projection(feature_map) + tokens = projected_feature_map.flatten(2).permute(0, 2, 1) + object_queries = object_queries_list[-1].flatten(2).permute(0, 2, 1) + + outputs = self.model(**inputs, output_hidden_states=True) + target_sizes = torch.tensor([image.size[::-1]], device=self.device) + results = self.processor.post_process_object_detection( + outputs, + target_sizes=target_sizes, + threshold=self.confidence, + )[0] + + token_rows = int(projected_feature_map.shape[2]) + token_cols = int(projected_feature_map.shape[3]) + sample_count = min(48, int(tokens.shape[1])) + sample_tokens = tokens[0, :sample_count, :8].detach().cpu() + token_sequence = [] + for index, vector in enumerate(sample_tokens): + token_sequence.append( + { + "index": index, + "row": index // token_cols, + "col": index % token_cols, + "values": [round(float(value), 4) for value in vector.tolist()], + "magnitude": round(float(vector.norm()), 4), + } + ) + + detections = [] + for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): + label_name = self.model.config.id2label[label.item()] + if label_name not in self.vehicle_labels: + continue + + x1, y1, x2, y2 = box.detach().cpu().numpy().astype(int).tolist() + detections.append( + { + "label": label_name, + "score": round(float(score.detach().cpu()), 4), + "box": [x1, y1, x2, y2], + } + ) + + encoder_last_hidden_state = getattr(outputs, "encoder_last_hidden_state", None) + last_hidden_state = getattr(outputs, "last_hidden_state", None) + return { + "image_size": {"width": image.size[0], "height": image.size[1]}, + "pixel_values_shape": list(inputs["pixel_values"].shape), + "pixel_mask_shape": list(inputs["pixel_mask"].shape), + "feature_map_shape": list(feature_map.shape), + "projected_feature_map_shape": list(projected_feature_map.shape), + "visual_tokens_shape": list(tokens.shape), + "position_encoding_shape": list(object_queries.shape), + "encoder_last_hidden_state_shape": list(encoder_last_hidden_state.shape) if encoder_last_hidden_state is not None else [], + "decoder_last_hidden_state_shape": list(last_hidden_state.shape) if last_hidden_state is not None else [], + "logits_shape": list(outputs.logits.shape), + "pred_boxes_shape": list(outputs.pred_boxes.shape), + "token_grid": {"rows": token_rows, "cols": token_cols, "total": int(tokens.shape[1]), "shown": sample_count}, + "token_sequence": token_sequence, + "detections": detections, + } + + def _detections_from_outputs(self, image: Image.Image, outputs: Any) -> list[dict[str, Any]]: target_sizes = torch.tensor([image.size[::-1]], device=self.device) results = self.processor.post_process_object_detection( outputs, @@ -60,3 +132,4 @@ class DetrVehicleDetector: ) return detections + diff --git a/app/device_manager.py b/app/device_manager.py index 63227d5..62e59fd 100644 --- a/app/device_manager.py +++ b/app/device_manager.py @@ -33,11 +33,16 @@ class DeviceManager: if device_num not in {device.device_num for device in self.devices}: raise ValueError("设备不在 devicelist.env 中") with self.lock: + old_device_num = self.current_device_num self.current_device_num = device_num self.current_url = "" self.timings = {} self.updated_at = time.time() self.version += 1 + print( + f"[device-switch] manager set old={old_device_num} new={device_num} version={self.version}", + flush=True, + ) return self.version def resolve_stream_url(self) -> str: @@ -49,16 +54,38 @@ class DeviceManager: return self.fallback_url raise RuntimeError("devicelist.env 中没有可用设备号") - result = self.api_client.get_stream_url_details(device_num) + print(f"[device-switch] resolve start device={device_num} version={version}", flush=True) + try: + result = self.api_client.get_stream_url_details(device_num) + except Exception as exc: + print( + f"[device-switch] resolve failed device={device_num} version={version} error={exc}", + flush=True, + ) + raise with self.lock: # 避免旧摄像头的慢接口响应覆盖用户刚切换的新选择。 if version != self.version or device_num != self.current_device_num: + print( + f"[device-switch] resolve stale device={device_num} version={version} current={self.current_device_num} current_version={self.version}", + flush=True, + ) return self.current_url self.current_url = result.url self.timings = dict(result.timings) self.updated_at = time.time() + print(f"[device-switch] resolve success device={device_num} version={version}", flush=True) return result.url + def resolve_stream_url_for(self, device_num: str) -> str: + if device_num not in {device.device_num for device in self.devices}: + raise ValueError("设备不在 devicelist.env 中") + result = self.api_client.get_stream_url_details(device_num) + return result.url + + def get_video_grid_devices(self, limit: int = 4) -> list[Device]: + return self.devices[:limit] + def get_snapshot(self) -> dict[str, Any]: with self.lock: return { diff --git a/app/main.py b/app/main.py index f4ee4ce..2277155 100644 --- a/app/main.py +++ b/app/main.py @@ -45,6 +45,18 @@ worker = StreamWorker( resize_width=settings.resize_width, ) +video_grid_devices = device_manager.get_video_grid_devices() +video_grid_workers = { + device.device_num: StreamWorker( + stream_url=lambda device_num=device.device_num: device_manager.resolve_stream_url_for(device_num), + detector=detector, + frame_skip=settings.frame_skip, + jpeg_quality=settings.jpeg_quality, + resize_width=settings.resize_width, + ) + for device in video_grid_devices +} + app = FastAPI(title="DETR 动态打标") app.mount("/static", StaticFiles(directory="app/static"), name="static") templates = Jinja2Templates(directory="app/templates") @@ -57,11 +69,15 @@ def display_model_name(model_name: str) -> str: @app.on_event("startup") def startup() -> None: worker.start() + for grid_worker in video_grid_workers.values(): + grid_worker.start() @app.on_event("shutdown") def shutdown() -> None: worker.stop() + for grid_worker in video_grid_workers.values(): + grid_worker.stop() @app.get("/", response_class=HTMLResponse) @@ -73,15 +89,67 @@ def index(request: Request) -> HTMLResponse: "model": display_model_name(settings.detr_model), "device": detector.device_name, "stream_url": f"设备号:{device_manager.get_snapshot()['current_device_num']}", + "video_grid_devices": video_grid_devices, }, ) +@app.get("/tokenizer", response_class=HTMLResponse) +def tokenizer(request: Request) -> HTMLResponse: + return templates.TemplateResponse( + "tokenizer.html", + { + "request": request, + "model": display_model_name(settings.detr_model), + "device": detector.device_name, + }, + ) + + +@app.get("/tokenizer/state") +def tokenizer_state() -> JSONResponse: + snapshot = worker.get_snapshot() + frame = worker.get_frame_rgb() + if frame is None: + return JSONResponse( + { + "ready": False, + "frame_id": snapshot["frame_id"], + "connected": snapshot["connected"], + "error": snapshot["error"] or "等待视频帧", + } + ) + + data = detector.inspect_tokens(frame) + data.update( + { + "ready": True, + "frame_id": snapshot["frame_id"], + "updated_at": snapshot["updated_at"], + "connected": snapshot["connected"], + "error": snapshot["error"], + } + ) + return JSONResponse(data) + + @app.get("/video") def video() -> StreamingResponse: + return stream_video(worker) + + +@app.get("/video/{device_num}") +def video_device(device_num: str) -> StreamingResponse: + grid_worker = video_grid_workers.get(device_num) + if grid_worker is None: + raise HTTPException(status_code=404, detail="设备不在视频网格中") + return stream_video(grid_worker) + + +def stream_video(stream_worker: StreamWorker) -> StreamingResponse: async def generate(): while True: - frame = worker.get_jpeg() + frame = stream_worker.get_jpeg() if frame is None: await asyncio.sleep(0.1) continue @@ -137,11 +205,13 @@ def status() -> JSONResponse: @app.post("/devices/{device_num}") def switch_device(device_num: str) -> JSONResponse: try: - device_manager.set_current_device(device_num) + version = device_manager.set_current_device(device_num) except ValueError as exc: + print(f"[device-switch] invalid device={device_num}", flush=True) raise HTTPException(status_code=404, detail=str(exc)) from exc worker.reconnect() - return JSONResponse({"current_device_num": device_num}) + print(f"[device-switch] accepted device={device_num} version={version}", flush=True) + return JSONResponse({"current_device_num": device_num, "version": version}) @app.websocket("/ws/detections") diff --git a/app/static/app.js b/app/static/app.js index dc7d757..a7f5195 100644 --- a/app/static/app.js +++ b/app/static/app.js @@ -13,7 +13,10 @@ const timingFrameEl = document.querySelector("#timing-frame"); let selectedDevice = ""; let pendingDevice = ""; +let queuedDevice = ""; +let switching = false; let devicesSignature = ""; +let lastWsSignature = ""; function setConnection(online, text) { connection.textContent = text; @@ -84,16 +87,54 @@ function renderDetections(detections) { .join(""); } -async function switchDevice(deviceNum) { +async function performSwitch(deviceNum) { + switching = true; pendingDevice = deviceNum; devicesSignature = ""; setConnection(false, "切换中"); - const response = await fetch(`/devices/${encodeURIComponent(deviceNum)}`, { method: "POST" }); - if (!response.ok) { - throw new Error("切换摄像头失败"); + console.log("[device-switch] start", { deviceNum }); + + try { + const response = await fetch(`/devices/${encodeURIComponent(deviceNum)}`, { method: "POST" }); + if (!response.ok) { + throw new Error("切换摄像头失败"); + } + const result = await response.json(); + const video = document.querySelector("#video"); + if (video) { + video.src = `/video?t=${Date.now()}`; + } + document.querySelectorAll(".grid-video").forEach((item) => { + item.src = `${item.dataset.src}?t=${Date.now()}`; + }); + console.log("[device-switch] requested", { deviceNum, version: result.version }); + } catch (error) { + pendingDevice = ""; + devicesSignature = ""; + setConnection(false, "切换失败"); + console.error("[device-switch] failed", { deviceNum, error }); + } finally { + switching = false; + if (queuedDevice && queuedDevice !== deviceNum) { + const nextDevice = queuedDevice; + queuedDevice = ""; + return performSwitch(nextDevice); + } + queuedDevice = ""; } - const video = document.querySelector("#video"); - video.src = `/video?t=${Date.now()}`; +} + +function switchDevice(deviceNum) { + if (switching) { + queuedDevice = deviceNum; + pendingDevice = deviceNum; + devicesSignature = ""; + setConnection(false, "等待切换"); + console.log("[device-switch] queued", { deviceNum }); + return Promise.resolve(); + } + + return performSwitch(deviceNum); } function connectWebSocket() { @@ -109,6 +150,18 @@ function connectWebSocket() { errorEl.textContent = data.error || (data.connected ? "正常" : "未连接"); sourceEl.textContent = data.source || "-"; setConnection(Boolean(data.connected), data.connected ? "已连接" : "重连中"); + const wsSignature = `${data.current_device_num}|${data.connected}|${data.frame_id}|${data.error || ""}`; + if (wsSignature !== lastWsSignature) { + lastWsSignature = wsSignature; + console.log("[device-switch] ws", { + currentDeviceNum: data.current_device_num, + pendingDevice, + queuedDevice, + connected: data.connected, + frameId: data.frame_id, + error: data.error, + }); + } if (pendingDevice && data.current_device_num === pendingDevice) { pendingDevice = ""; deviceSelect.disabled = false; diff --git a/app/static/style.css b/app/static/style.css index 55aa9b6..c19cd26 100644 --- a/app/static/style.css +++ b/app/static/style.css @@ -73,6 +73,30 @@ p { color: var(--red); } +.topbar-actions { + display: flex; + align-items: center; + gap: 12px; +} + +.button-link { + display: inline-flex; + align-items: center; + justify-content: center; + min-height: 36px; + padding: 8px 14px; + border: 1px solid var(--line); + border-radius: 999px; + color: var(--text); + text-decoration: none; + background: var(--panel); +} + +.button-link:hover { + border-color: var(--green); + color: var(--green); +} + .layout { display: grid; grid-template-columns: minmax(0, 1fr) 360px; @@ -92,33 +116,6 @@ p { overflow: hidden; } -.pipeline { - display: flex; - align-items: center; - gap: 10px; - padding: 14px; - border-bottom: 1px solid var(--line); - overflow-x: auto; -} - -.stage { - flex: 0 0 auto; - padding: 9px 12px; - border: 1px solid var(--line); - border-radius: 10px; - color: var(--muted); - background: var(--panel-2); -} - -.stage.active { - border-color: rgba(46, 232, 135, 0.5); - color: var(--green); -} - -.arrow { - color: var(--muted); -} - .video-wrap { display: grid; place-items: center; @@ -126,7 +123,8 @@ p { background: #05070b; } -#video { +#video, +.grid-video { display: block; width: 100%; height: auto; @@ -134,6 +132,36 @@ p { object-fit: contain; } +.video-grid { + display: grid; + grid-template-columns: repeat(2, minmax(0, 1fr)); + gap: 14px; + padding: 14px; + background: #05070b; +} + +.video-grid-item { + overflow: hidden; + border: 1px solid var(--line); + border-radius: 14px; + background: var(--panel-2); +} + +.video-grid-title { + padding: 10px 12px; + border-bottom: 1px solid var(--line); + color: var(--muted); + font-size: 13px; +} + +.video-grid-wrap { + min-height: 240px; +} + +.grid-video { + max-height: calc((100vh - 260px) / 2); +} + .side-card { display: flex; flex-direction: column; @@ -166,13 +194,19 @@ p { background: var(--panel-2); } +.detections-panel { + padding: 16px; + border-top: 1px solid var(--line); +} + .detections { - display: flex; - flex-direction: column; + display: grid; + grid-template-columns: repeat(4, minmax(0, 1fr)); gap: 10px; } .detections.empty { + display: block; color: var(--muted); } @@ -197,6 +231,133 @@ p { font-size: 12px; } +.tokenizer-page .tokenizer-layout { + display: grid; + grid-template-columns: minmax(0, 1fr) minmax(0, 2fr); + grid-template-areas: "flow side"; + align-items: start; + width: 100%; + gap: 18px; + padding: 18px; +} + +.tokenizer-page .tokenizer-side { + display: grid; + grid-area: side; + min-width: 0; + gap: 18px; +} + +.tokenizer-page .tokenizer-flow-card { + grid-area: flow; + min-width: 0; + min-height: calc(100vh - 122px); +} + +.tokenizer-page .tokenizer-side .detections { + grid-template-columns: repeat(3, minmax(0, 1fr)); +} + +.tokenizer-card { + border: 1px solid var(--line); + border-radius: 18px; + padding: 18px; + background: rgba(21, 27, 38, 0.9); + box-shadow: 0 18px 40px rgba(0, 0, 0, 0.28); +} + +.pipeline-steps { + display: grid; + gap: 10px; +} + +.pipeline-step { + display: grid; + grid-template-columns: 34px minmax(0, 1fr); + gap: 10px; + align-items: start; + padding: 12px; + border: 1px solid var(--line); + border-radius: 12px; + background: var(--panel-2); +} + +.step-index { + display: grid; + place-items: center; + width: 28px; + height: 28px; + border-radius: 999px; + color: #06100b; + font-weight: 700; + background: var(--green); +} + +.step-title { + margin-bottom: 5px; + font-weight: 700; +} + +.step-value, +.token-summary, +.selected-token { + color: var(--muted); + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; + font-size: 13px; + word-break: break-all; +} + +.token-sequence { + display: grid; + grid-template-columns: repeat(12, minmax(0, 1fr)); + gap: 8px; + margin-top: 14px; +} + +.token-cell { + min-height: 50px; + border: 1px solid var(--line); + border-radius: 10px; + color: var(--text); + cursor: pointer; + background: var(--panel-2); +} + +.token-cell span, +.token-cell small { + display: block; +} + +.token-cell small { + margin-top: 3px; + color: var(--muted); +} + +.token-cell.selected, +.token-cell:hover { + border-color: var(--green); + color: var(--green); +} + +.token-detail-title { + margin-bottom: 10px; + color: var(--green); + font-weight: 700; +} + +.token-vector { + padding: 12px; + border: 1px solid var(--line); + border-radius: 12px; + background: var(--panel-2); +} + +@media (max-width: 1280px) { + .detections { + grid-template-columns: repeat(4, minmax(0, 1fr)); + } +} + @media (max-width: 980px) { .layout { grid-template-columns: 1fr; @@ -206,4 +367,18 @@ p { align-items: flex-start; flex-direction: column; } + + .detections { + grid-template-columns: repeat(3, minmax(0, 1fr)); + } +} + +@media (max-width: 640px) { + .video-grid { + grid-template-columns: 1fr; + } + + .detections { + grid-template-columns: 1fr; + } } diff --git a/app/static/tokenizer.js b/app/static/tokenizer.js new file mode 100644 index 0000000..f53ff83 --- /dev/null +++ b/app/static/tokenizer.js @@ -0,0 +1,131 @@ +const statusEl = document.querySelector("#tokenizer-status"); +const pipelineEl = document.querySelector("#pipeline-steps"); +const tokenSummaryEl = document.querySelector("#token-summary"); +const tokenSequenceEl = document.querySelector("#token-sequence"); +const selectedTokenEl = document.querySelector("#selected-token"); +const detectionsEl = document.querySelector("#tokenizer-detections"); + +let selectedTokenIndex = null; + +function formatShape(shape) { + if (!shape || !shape.length) { + return "-"; + } + return `[${shape.join(", ")}]`; +} + +function setStatus(ready, text) { + statusEl.textContent = text; + statusEl.classList.toggle("online", ready); + statusEl.classList.toggle("offline", !ready); +} + +function renderPipeline(data) { + const steps = [ + ["OpenCV RGB 帧", `${data.image_size?.width ?? "-"} × ${data.image_size?.height ?? "-"}`], + ["PIL Image", `${data.image_size?.width ?? "-"} × ${data.image_size?.height ?? "-"}`], + ["DetrImageProcessor", `pixel_values ${formatShape(data.pixel_values_shape)} / pixel_mask ${formatShape(data.pixel_mask_shape)}`], + ["ResNet-50 backbone", `feature map ${formatShape(data.feature_map_shape)}`], + ["1×1 convolution", `projected ${formatShape(data.projected_feature_map_shape)}`], + ["视觉 token embedding", `由 projected feature map flatten 得到 ${formatShape(data.visual_tokens_shape)}`], + ["位置 embedding", `二维位置 embedding ${formatShape(data.position_encoding_shape)}`], + ["Transformer Encoder", formatShape(data.encoder_last_hidden_state_shape)], + ["Object query embedding + Decoder", `object query embedding 解码后 ${formatShape(data.decoder_last_hidden_state_shape)}`], + ["类别 logits + boxes", `logits ${formatShape(data.logits_shape)} / boxes ${formatShape(data.pred_boxes_shape)}`], + ["post_process_object_detection", `检测结果 ${data.detections?.length ?? 0} 个`], + ]; + + pipelineEl.innerHTML = steps + .map(([title, value], index) => ` +
+
${index + 1}
+
+
${title}
+
${value}
+
+
+ `) + .join(""); +} + +function renderTokens(data) { + const grid = data.token_grid || {}; + tokenSummaryEl.textContent = `帧号 ${data.frame_id ?? "-"} · token 网格 ${grid.rows ?? "-"} × ${grid.cols ?? "-"},总数 ${grid.total ?? "-"},展示前 ${grid.shown ?? 0} 个 token,每个显示前 8 维采样。`; + tokenSequenceEl.innerHTML = (data.token_sequence || []) + .map((token) => ` + + `) + .join(""); + + tokenSequenceEl.querySelectorAll(".token-cell").forEach((button) => { + button.addEventListener("click", () => { + selectedTokenIndex = Number(button.dataset.index); + renderSelectedToken(data); + renderTokens(data); + }); + }); + + renderSelectedToken(data); +} + +function renderSelectedToken(data) { + const tokens = data.token_sequence || []; + const token = tokens.find((item) => item.index === selectedTokenIndex) || tokens[0]; + if (!token) { + selectedTokenEl.textContent = "暂无 token。"; + return; + } + selectedTokenIndex = token.index; + selectedTokenEl.innerHTML = ` +
Token #${token.index} · 网格位置 (${token.row}, ${token.col}) · L2 ${token.magnitude}
+
[${token.values.map((value) => Number(value).toFixed(4)).join(", ")}, ...]
+ `; +} + +function renderDetections(detections) { + if (!detections.length) { + detectionsEl.className = "detections empty"; + detectionsEl.textContent = "暂无目标"; + return; + } + + detectionsEl.className = "detections"; + detectionsEl.innerHTML = detections + .map((det) => ` +
+
+ ${det.label} + ${(det.score * 100).toFixed(1)}% +
+
box: [${det.box.join(", ")}]
+
+ `) + .join(""); +} + +async function refreshTokenizer() { + try { + const response = await fetch(`/tokenizer/state?t=${Date.now()}`); + const data = await response.json(); + if (!data.ready) { + setStatus(false, data.error || "等待帧"); + tokenSummaryEl.textContent = data.error || "等待视频帧"; + return; + } + + setStatus(Boolean(data.connected), data.connected ? "动态更新中" : "未连接"); + renderPipeline(data); + renderTokens(data); + renderDetections(data.detections || []); + } catch (error) { + setStatus(false, "更新失败"); + tokenSummaryEl.textContent = `更新失败:${error}`; + } finally { + setTimeout(refreshTokenizer, 1200); + } +} + +refreshTokenizer(); diff --git a/app/stream_worker.py b/app/stream_worker.py index 1e83401..ed4e56b 100644 --- a/app/stream_worker.py +++ b/app/stream_worker.py @@ -26,6 +26,7 @@ class StreamWorker: self.lock = threading.Lock() self.latest_jpeg: bytes | None = None + self.latest_frame_rgb: Any | None = None self.latest_detections: list[dict[str, Any]] = [] self.frame_id = 0 self.updated_at = 0.0 @@ -56,21 +57,30 @@ class StreamWorker: def reconnect(self) -> None: with self.lock: self.latest_jpeg = None + self.latest_frame_rgb = None self.latest_detections = [] self.frame_id = 0 self.fps = 0.0 self.reconnect_requested = True self.reconnect_version += 1 + version = self.reconnect_version self.connected = False self.error = "正在切换视频源" self.resolve_ms = 0.0 self.open_ms = 0.0 self.first_frame_ms = 0.0 + print(f"[device-switch] worker reconnect requested version={version}", flush=True) def get_jpeg(self) -> bytes | None: with self.lock: return self.latest_jpeg + def get_frame_rgb(self) -> Any | None: + with self.lock: + if self.latest_frame_rgb is None: + return None + return self.latest_frame_rgb.copy() + def get_snapshot(self) -> dict[str, Any]: with self.lock: return { @@ -100,16 +110,29 @@ class StreamWorker: run_version = self.reconnect_version self.reconnect_requested = False if should_reconnect: + print(f"[device-switch] worker reconnect handling version={run_version}", flush=True) # 切换摄像头时必须释放旧连接,否则 OpenCV 会继续阻塞读旧流。 if cap is not None: cap.release() cap = None + print(f"[device-switch] worker released old capture version={run_version}", flush=True) if cap is None or not cap.isOpened(): started = time.monotonic() - stream_url = self.stream_url() if callable(self.stream_url) else self.stream_url + try: + stream_url = self.stream_url() if callable(self.stream_url) else self.stream_url + except Exception as exc: + resolve_ms = round((time.monotonic() - started) * 1000, 2) + with self.lock: + self.resolve_ms = resolve_ms + self.open_ms = 0.0 + self.first_frame_ms = 0.0 + self._set_connection_state(False, f"获取播放地址失败:{exc},2 秒后重试") + time.sleep(2) + continue resolve_ms = round((time.monotonic() - started) * 1000, 2) started = time.monotonic() + print(f"[device-switch] worker open start version={run_version}", flush=True) cap = cv2.VideoCapture(stream_url) open_ms = round((time.monotonic() - started) * 1000, 2) with self.lock: @@ -117,11 +140,13 @@ class StreamWorker: self.resolve_ms = resolve_ms self.first_frame_ms = 0.0 if not cap.isOpened(): + print(f"[device-switch] worker open failed version={run_version} open_ms={open_ms}", flush=True) self._set_connection_state(False, "无法打开视频流,2 秒后重试") cap.release() cap = None time.sleep(2) continue + print(f"[device-switch] worker open success version={run_version} open_ms={open_ms}", flush=True) self._set_connection_state(True, "已连接") started = time.monotonic() @@ -143,9 +168,9 @@ class StreamWorker: frame = self._resize(frame) self.frame_id += 1 + frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) if self.frame_id % self.frame_skip == 0: - frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) last_detections = self.detector.detect(frame_rgb) annotated = self._draw(frame, last_detections) @@ -173,6 +198,7 @@ class StreamWorker: with self.lock: self.latest_jpeg = jpeg.tobytes() + self.latest_frame_rgb = frame_rgb.copy() self.latest_detections = list(last_detections) self.updated_at = time.time() self.connected = True diff --git a/app/templates/index.html b/app/templates/index.html index 247b5b4..25ae501 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -9,26 +9,30 @@
-

DETR 动态打标

-

使用 Python、OpenCV、PyTorch、Transformers DETR 和 FastAPI,Mac mini m2 上运行。

+ +

DETR动态打标:Python、OpenCV、PyTorch、Transformers DETR 和 FastAPI。

+
+
+ tokenizer +
连接中
-
连接中
-
-
源节点
-
-
DETR 推理
-
-
OSD 打标
-
-
FastAPI 输出
-
-
- 动态打标视频流 +
+ {% for device_item in video_grid_devices %} +
+
{{ device_item.name }} · {{ device_item.device_num }}
+
+ {{ device_item.name }} 视频流 +
+
+ {% endfor %}
+
+
暂无目标
+
diff --git a/app/templates/tokenizer.html b/app/templates/tokenizer.html new file mode 100644 index 0000000..5f53455 --- /dev/null +++ b/app/templates/tokenizer.html @@ -0,0 +1,48 @@ + + + + + + DETR Tokenizer 动态可视化 + + + +
+
+ +

实时展示当前视频帧从图像预处理、视觉特征、token 序列到检测输出的过程。

+
+
+ 返回视频 +
等待帧
+
+
+ +
+
+

实时流程

+
+
+ + +
+ + + + diff --git a/tokenizer.md b/tokenizer.md index 611af19..b8a803c 100644 --- a/tokenizer.md +++ b/tokenizer.md @@ -1,3 +1,4 @@ + # DETR 的视觉 token 化过程说明 本文基于当前项目代码 `app/detector.py` 中的实现说明 DETR 的“token 化”过程。 @@ -58,19 +59,39 @@ projected feature map: [batch, 256, H', W'] ↓ flatten 空间维度 H' × W' ↓ -visual tokens: [batch, H'×W', 256] +visual token embeddings: [batch, H'×W', 256] ↓ -加入二维位置编码 +加入二维位置 embedding ↓ Transformer Encoder ↓ -Object Queries + Transformer Decoder +Object query embeddings + Transformer Decoder ↓ 类别 logits + 边界框 boxes ↓ post_process_object_detection 还原到原图坐标 ``` +## Embedding 在 1-11 个环节中的位置 + +在这个 DETR 流程里,embedding 不是单独只有一步,而是出现在 3 个关键环节: + +| 页面步骤 | 名称 | embedding 含义 | +| --- | --- | --- | +| 第 6 步 | visual token embedding | `projected feature map` 经过 flatten 后,每个空间网格点变成一个 256 维视觉 token embedding。 | +| 第 7 步 | position embedding | 给每个视觉 token 加入二维位置 embedding,让 Transformer 知道 token 原本在图像中的位置。 | +| 第 9 步 | object query embedding | DETR 使用一组可学习的 object query embeddings 进入 Decoder,每个 query 最终预测一个候选目标。 | + +所以如果问“embedding 在 1-11 哪个环节”,最核心的是: + +```text +第 6 步:产生视觉 token embedding +第 7 步:加入位置 embedding +第 9 步:object query embedding 进入 Decoder +``` + +第 6 步是图像内容 embedding,第 7 步是空间位置 embedding,第 9 步是检测目标查询 embedding。 + ## 第 1 步:图像预处理 代码: @@ -212,7 +233,7 @@ x = x.permute(2, 0, 1) # [h*w, batch, 256] 两种写法本质相同,只是 Transformer 接口期望的维度顺序不同。 -## 第 5 步:加入二维位置编码 +## 第 5 步:加入二维位置 embedding Transformer 本身不理解图像中的二维空间位置。 @@ -264,11 +285,11 @@ Encoder 会通过 self-attention 建模图像中不同区域之间的关系。 - 道路区域可以影响车辆判断。 - 远处小目标可以和周围上下文一起被理解。 -## 第 7 步:Object Queries 和 Transformer Decoder +## 第 7 步:Object query embedding 和 Transformer Decoder DETR 与传统检测器不同,它不是先生成大量 anchor box。 -它使用一组可学习的 object queries。常见数量是: +它使用一组可学习的 object query embeddings。常见数量是: ```text 100 个 object queries