实时展示当前视频帧从图像预处理、视觉特征、token 序列到检测输出的过程。
+diff --git a/.claude/worktrees/agent-a02a887cdc0c41851 b/.claude/worktrees/agent-a02a887cdc0c41851 new file mode 160000 index 0000000..41bd031 --- /dev/null +++ b/.claude/worktrees/agent-a02a887cdc0c41851 @@ -0,0 +1 @@ +Subproject commit 41bd03123c40744f76ca4c5fbf9ac05b98b98606 diff --git a/app/detector.py b/app/detector.py index b56ef48..876a6c7 100644 --- a/app/detector.py +++ b/app/detector.py @@ -36,7 +36,79 @@ class DetrVehicleDetector: inputs = {key: value.to(self.device) for key, value in inputs.items()} outputs = self.model(**inputs) - # DETR 后处理需要原图尺寸,PIL size 是 (宽, 高),这里转成 (高, 宽)。 + return self._detections_from_outputs(image, outputs) + + @torch.no_grad() + def inspect_tokens(self, frame_rgb: Any) -> dict[str, Any]: + image = Image.fromarray(frame_rgb) + inputs = self.processor(images=image, return_tensors="pt") + inputs = {key: value.to(self.device) for key, value in inputs.items()} + + features, object_queries_list = self.model.model.backbone(inputs["pixel_values"], inputs["pixel_mask"]) + feature_map, mask = features[-1] + projected_feature_map = self.model.model.input_projection(feature_map) + tokens = projected_feature_map.flatten(2).permute(0, 2, 1) + object_queries = object_queries_list[-1].flatten(2).permute(0, 2, 1) + + outputs = self.model(**inputs, output_hidden_states=True) + target_sizes = torch.tensor([image.size[::-1]], device=self.device) + results = self.processor.post_process_object_detection( + outputs, + target_sizes=target_sizes, + threshold=self.confidence, + )[0] + + token_rows = int(projected_feature_map.shape[2]) + token_cols = int(projected_feature_map.shape[3]) + sample_count = min(48, int(tokens.shape[1])) + sample_tokens = tokens[0, :sample_count, :8].detach().cpu() + token_sequence = [] + for index, vector in enumerate(sample_tokens): + token_sequence.append( + { + "index": index, + "row": index // token_cols, + "col": index % token_cols, + "values": [round(float(value), 4) for value in vector.tolist()], + "magnitude": round(float(vector.norm()), 4), + } + ) + + detections = [] + for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): + label_name = self.model.config.id2label[label.item()] + if label_name not in self.vehicle_labels: + continue + + x1, y1, x2, y2 = box.detach().cpu().numpy().astype(int).tolist() + detections.append( + { + "label": label_name, + "score": round(float(score.detach().cpu()), 4), + "box": [x1, y1, x2, y2], + } + ) + + encoder_last_hidden_state = getattr(outputs, "encoder_last_hidden_state", None) + last_hidden_state = getattr(outputs, "last_hidden_state", None) + return { + "image_size": {"width": image.size[0], "height": image.size[1]}, + "pixel_values_shape": list(inputs["pixel_values"].shape), + "pixel_mask_shape": list(inputs["pixel_mask"].shape), + "feature_map_shape": list(feature_map.shape), + "projected_feature_map_shape": list(projected_feature_map.shape), + "visual_tokens_shape": list(tokens.shape), + "position_encoding_shape": list(object_queries.shape), + "encoder_last_hidden_state_shape": list(encoder_last_hidden_state.shape) if encoder_last_hidden_state is not None else [], + "decoder_last_hidden_state_shape": list(last_hidden_state.shape) if last_hidden_state is not None else [], + "logits_shape": list(outputs.logits.shape), + "pred_boxes_shape": list(outputs.pred_boxes.shape), + "token_grid": {"rows": token_rows, "cols": token_cols, "total": int(tokens.shape[1]), "shown": sample_count}, + "token_sequence": token_sequence, + "detections": detections, + } + + def _detections_from_outputs(self, image: Image.Image, outputs: Any) -> list[dict[str, Any]]: target_sizes = torch.tensor([image.size[::-1]], device=self.device) results = self.processor.post_process_object_detection( outputs, @@ -60,3 +132,4 @@ class DetrVehicleDetector: ) return detections + diff --git a/app/device_manager.py b/app/device_manager.py index 63227d5..62e59fd 100644 --- a/app/device_manager.py +++ b/app/device_manager.py @@ -33,11 +33,16 @@ class DeviceManager: if device_num not in {device.device_num for device in self.devices}: raise ValueError("设备不在 devicelist.env 中") with self.lock: + old_device_num = self.current_device_num self.current_device_num = device_num self.current_url = "" self.timings = {} self.updated_at = time.time() self.version += 1 + print( + f"[device-switch] manager set old={old_device_num} new={device_num} version={self.version}", + flush=True, + ) return self.version def resolve_stream_url(self) -> str: @@ -49,16 +54,38 @@ class DeviceManager: return self.fallback_url raise RuntimeError("devicelist.env 中没有可用设备号") - result = self.api_client.get_stream_url_details(device_num) + print(f"[device-switch] resolve start device={device_num} version={version}", flush=True) + try: + result = self.api_client.get_stream_url_details(device_num) + except Exception as exc: + print( + f"[device-switch] resolve failed device={device_num} version={version} error={exc}", + flush=True, + ) + raise with self.lock: # 避免旧摄像头的慢接口响应覆盖用户刚切换的新选择。 if version != self.version or device_num != self.current_device_num: + print( + f"[device-switch] resolve stale device={device_num} version={version} current={self.current_device_num} current_version={self.version}", + flush=True, + ) return self.current_url self.current_url = result.url self.timings = dict(result.timings) self.updated_at = time.time() + print(f"[device-switch] resolve success device={device_num} version={version}", flush=True) return result.url + def resolve_stream_url_for(self, device_num: str) -> str: + if device_num not in {device.device_num for device in self.devices}: + raise ValueError("设备不在 devicelist.env 中") + result = self.api_client.get_stream_url_details(device_num) + return result.url + + def get_video_grid_devices(self, limit: int = 4) -> list[Device]: + return self.devices[:limit] + def get_snapshot(self) -> dict[str, Any]: with self.lock: return { diff --git a/app/main.py b/app/main.py index f4ee4ce..2277155 100644 --- a/app/main.py +++ b/app/main.py @@ -45,6 +45,18 @@ worker = StreamWorker( resize_width=settings.resize_width, ) +video_grid_devices = device_manager.get_video_grid_devices() +video_grid_workers = { + device.device_num: StreamWorker( + stream_url=lambda device_num=device.device_num: device_manager.resolve_stream_url_for(device_num), + detector=detector, + frame_skip=settings.frame_skip, + jpeg_quality=settings.jpeg_quality, + resize_width=settings.resize_width, + ) + for device in video_grid_devices +} + app = FastAPI(title="DETR 动态打标") app.mount("/static", StaticFiles(directory="app/static"), name="static") templates = Jinja2Templates(directory="app/templates") @@ -57,11 +69,15 @@ def display_model_name(model_name: str) -> str: @app.on_event("startup") def startup() -> None: worker.start() + for grid_worker in video_grid_workers.values(): + grid_worker.start() @app.on_event("shutdown") def shutdown() -> None: worker.stop() + for grid_worker in video_grid_workers.values(): + grid_worker.stop() @app.get("/", response_class=HTMLResponse) @@ -73,15 +89,67 @@ def index(request: Request) -> HTMLResponse: "model": display_model_name(settings.detr_model), "device": detector.device_name, "stream_url": f"设备号:{device_manager.get_snapshot()['current_device_num']}", + "video_grid_devices": video_grid_devices, }, ) +@app.get("/tokenizer", response_class=HTMLResponse) +def tokenizer(request: Request) -> HTMLResponse: + return templates.TemplateResponse( + "tokenizer.html", + { + "request": request, + "model": display_model_name(settings.detr_model), + "device": detector.device_name, + }, + ) + + +@app.get("/tokenizer/state") +def tokenizer_state() -> JSONResponse: + snapshot = worker.get_snapshot() + frame = worker.get_frame_rgb() + if frame is None: + return JSONResponse( + { + "ready": False, + "frame_id": snapshot["frame_id"], + "connected": snapshot["connected"], + "error": snapshot["error"] or "等待视频帧", + } + ) + + data = detector.inspect_tokens(frame) + data.update( + { + "ready": True, + "frame_id": snapshot["frame_id"], + "updated_at": snapshot["updated_at"], + "connected": snapshot["connected"], + "error": snapshot["error"], + } + ) + return JSONResponse(data) + + @app.get("/video") def video() -> StreamingResponse: + return stream_video(worker) + + +@app.get("/video/{device_num}") +def video_device(device_num: str) -> StreamingResponse: + grid_worker = video_grid_workers.get(device_num) + if grid_worker is None: + raise HTTPException(status_code=404, detail="设备不在视频网格中") + return stream_video(grid_worker) + + +def stream_video(stream_worker: StreamWorker) -> StreamingResponse: async def generate(): while True: - frame = worker.get_jpeg() + frame = stream_worker.get_jpeg() if frame is None: await asyncio.sleep(0.1) continue @@ -137,11 +205,13 @@ def status() -> JSONResponse: @app.post("/devices/{device_num}") def switch_device(device_num: str) -> JSONResponse: try: - device_manager.set_current_device(device_num) + version = device_manager.set_current_device(device_num) except ValueError as exc: + print(f"[device-switch] invalid device={device_num}", flush=True) raise HTTPException(status_code=404, detail=str(exc)) from exc worker.reconnect() - return JSONResponse({"current_device_num": device_num}) + print(f"[device-switch] accepted device={device_num} version={version}", flush=True) + return JSONResponse({"current_device_num": device_num, "version": version}) @app.websocket("/ws/detections") diff --git a/app/static/app.js b/app/static/app.js index dc7d757..a7f5195 100644 --- a/app/static/app.js +++ b/app/static/app.js @@ -13,7 +13,10 @@ const timingFrameEl = document.querySelector("#timing-frame"); let selectedDevice = ""; let pendingDevice = ""; +let queuedDevice = ""; +let switching = false; let devicesSignature = ""; +let lastWsSignature = ""; function setConnection(online, text) { connection.textContent = text; @@ -84,16 +87,54 @@ function renderDetections(detections) { .join(""); } -async function switchDevice(deviceNum) { +async function performSwitch(deviceNum) { + switching = true; pendingDevice = deviceNum; devicesSignature = ""; setConnection(false, "切换中"); - const response = await fetch(`/devices/${encodeURIComponent(deviceNum)}`, { method: "POST" }); - if (!response.ok) { - throw new Error("切换摄像头失败"); + console.log("[device-switch] start", { deviceNum }); + + try { + const response = await fetch(`/devices/${encodeURIComponent(deviceNum)}`, { method: "POST" }); + if (!response.ok) { + throw new Error("切换摄像头失败"); + } + const result = await response.json(); + const video = document.querySelector("#video"); + if (video) { + video.src = `/video?t=${Date.now()}`; + } + document.querySelectorAll(".grid-video").forEach((item) => { + item.src = `${item.dataset.src}?t=${Date.now()}`; + }); + console.log("[device-switch] requested", { deviceNum, version: result.version }); + } catch (error) { + pendingDevice = ""; + devicesSignature = ""; + setConnection(false, "切换失败"); + console.error("[device-switch] failed", { deviceNum, error }); + } finally { + switching = false; + if (queuedDevice && queuedDevice !== deviceNum) { + const nextDevice = queuedDevice; + queuedDevice = ""; + return performSwitch(nextDevice); + } + queuedDevice = ""; } - const video = document.querySelector("#video"); - video.src = `/video?t=${Date.now()}`; +} + +function switchDevice(deviceNum) { + if (switching) { + queuedDevice = deviceNum; + pendingDevice = deviceNum; + devicesSignature = ""; + setConnection(false, "等待切换"); + console.log("[device-switch] queued", { deviceNum }); + return Promise.resolve(); + } + + return performSwitch(deviceNum); } function connectWebSocket() { @@ -109,6 +150,18 @@ function connectWebSocket() { errorEl.textContent = data.error || (data.connected ? "正常" : "未连接"); sourceEl.textContent = data.source || "-"; setConnection(Boolean(data.connected), data.connected ? "已连接" : "重连中"); + const wsSignature = `${data.current_device_num}|${data.connected}|${data.frame_id}|${data.error || ""}`; + if (wsSignature !== lastWsSignature) { + lastWsSignature = wsSignature; + console.log("[device-switch] ws", { + currentDeviceNum: data.current_device_num, + pendingDevice, + queuedDevice, + connected: data.connected, + frameId: data.frame_id, + error: data.error, + }); + } if (pendingDevice && data.current_device_num === pendingDevice) { pendingDevice = ""; deviceSelect.disabled = false; diff --git a/app/static/style.css b/app/static/style.css index 55aa9b6..c19cd26 100644 --- a/app/static/style.css +++ b/app/static/style.css @@ -73,6 +73,30 @@ p { color: var(--red); } +.topbar-actions { + display: flex; + align-items: center; + gap: 12px; +} + +.button-link { + display: inline-flex; + align-items: center; + justify-content: center; + min-height: 36px; + padding: 8px 14px; + border: 1px solid var(--line); + border-radius: 999px; + color: var(--text); + text-decoration: none; + background: var(--panel); +} + +.button-link:hover { + border-color: var(--green); + color: var(--green); +} + .layout { display: grid; grid-template-columns: minmax(0, 1fr) 360px; @@ -92,33 +116,6 @@ p { overflow: hidden; } -.pipeline { - display: flex; - align-items: center; - gap: 10px; - padding: 14px; - border-bottom: 1px solid var(--line); - overflow-x: auto; -} - -.stage { - flex: 0 0 auto; - padding: 9px 12px; - border: 1px solid var(--line); - border-radius: 10px; - color: var(--muted); - background: var(--panel-2); -} - -.stage.active { - border-color: rgba(46, 232, 135, 0.5); - color: var(--green); -} - -.arrow { - color: var(--muted); -} - .video-wrap { display: grid; place-items: center; @@ -126,7 +123,8 @@ p { background: #05070b; } -#video { +#video, +.grid-video { display: block; width: 100%; height: auto; @@ -134,6 +132,36 @@ p { object-fit: contain; } +.video-grid { + display: grid; + grid-template-columns: repeat(2, minmax(0, 1fr)); + gap: 14px; + padding: 14px; + background: #05070b; +} + +.video-grid-item { + overflow: hidden; + border: 1px solid var(--line); + border-radius: 14px; + background: var(--panel-2); +} + +.video-grid-title { + padding: 10px 12px; + border-bottom: 1px solid var(--line); + color: var(--muted); + font-size: 13px; +} + +.video-grid-wrap { + min-height: 240px; +} + +.grid-video { + max-height: calc((100vh - 260px) / 2); +} + .side-card { display: flex; flex-direction: column; @@ -166,13 +194,19 @@ p { background: var(--panel-2); } +.detections-panel { + padding: 16px; + border-top: 1px solid var(--line); +} + .detections { - display: flex; - flex-direction: column; + display: grid; + grid-template-columns: repeat(4, minmax(0, 1fr)); gap: 10px; } .detections.empty { + display: block; color: var(--muted); } @@ -197,6 +231,133 @@ p { font-size: 12px; } +.tokenizer-page .tokenizer-layout { + display: grid; + grid-template-columns: minmax(0, 1fr) minmax(0, 2fr); + grid-template-areas: "flow side"; + align-items: start; + width: 100%; + gap: 18px; + padding: 18px; +} + +.tokenizer-page .tokenizer-side { + display: grid; + grid-area: side; + min-width: 0; + gap: 18px; +} + +.tokenizer-page .tokenizer-flow-card { + grid-area: flow; + min-width: 0; + min-height: calc(100vh - 122px); +} + +.tokenizer-page .tokenizer-side .detections { + grid-template-columns: repeat(3, minmax(0, 1fr)); +} + +.tokenizer-card { + border: 1px solid var(--line); + border-radius: 18px; + padding: 18px; + background: rgba(21, 27, 38, 0.9); + box-shadow: 0 18px 40px rgba(0, 0, 0, 0.28); +} + +.pipeline-steps { + display: grid; + gap: 10px; +} + +.pipeline-step { + display: grid; + grid-template-columns: 34px minmax(0, 1fr); + gap: 10px; + align-items: start; + padding: 12px; + border: 1px solid var(--line); + border-radius: 12px; + background: var(--panel-2); +} + +.step-index { + display: grid; + place-items: center; + width: 28px; + height: 28px; + border-radius: 999px; + color: #06100b; + font-weight: 700; + background: var(--green); +} + +.step-title { + margin-bottom: 5px; + font-weight: 700; +} + +.step-value, +.token-summary, +.selected-token { + color: var(--muted); + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; + font-size: 13px; + word-break: break-all; +} + +.token-sequence { + display: grid; + grid-template-columns: repeat(12, minmax(0, 1fr)); + gap: 8px; + margin-top: 14px; +} + +.token-cell { + min-height: 50px; + border: 1px solid var(--line); + border-radius: 10px; + color: var(--text); + cursor: pointer; + background: var(--panel-2); +} + +.token-cell span, +.token-cell small { + display: block; +} + +.token-cell small { + margin-top: 3px; + color: var(--muted); +} + +.token-cell.selected, +.token-cell:hover { + border-color: var(--green); + color: var(--green); +} + +.token-detail-title { + margin-bottom: 10px; + color: var(--green); + font-weight: 700; +} + +.token-vector { + padding: 12px; + border: 1px solid var(--line); + border-radius: 12px; + background: var(--panel-2); +} + +@media (max-width: 1280px) { + .detections { + grid-template-columns: repeat(4, minmax(0, 1fr)); + } +} + @media (max-width: 980px) { .layout { grid-template-columns: 1fr; @@ -206,4 +367,18 @@ p { align-items: flex-start; flex-direction: column; } + + .detections { + grid-template-columns: repeat(3, minmax(0, 1fr)); + } +} + +@media (max-width: 640px) { + .video-grid { + grid-template-columns: 1fr; + } + + .detections { + grid-template-columns: 1fr; + } } diff --git a/app/static/tokenizer.js b/app/static/tokenizer.js new file mode 100644 index 0000000..f53ff83 --- /dev/null +++ b/app/static/tokenizer.js @@ -0,0 +1,131 @@ +const statusEl = document.querySelector("#tokenizer-status"); +const pipelineEl = document.querySelector("#pipeline-steps"); +const tokenSummaryEl = document.querySelector("#token-summary"); +const tokenSequenceEl = document.querySelector("#token-sequence"); +const selectedTokenEl = document.querySelector("#selected-token"); +const detectionsEl = document.querySelector("#tokenizer-detections"); + +let selectedTokenIndex = null; + +function formatShape(shape) { + if (!shape || !shape.length) { + return "-"; + } + return `[${shape.join(", ")}]`; +} + +function setStatus(ready, text) { + statusEl.textContent = text; + statusEl.classList.toggle("online", ready); + statusEl.classList.toggle("offline", !ready); +} + +function renderPipeline(data) { + const steps = [ + ["OpenCV RGB 帧", `${data.image_size?.width ?? "-"} × ${data.image_size?.height ?? "-"}`], + ["PIL Image", `${data.image_size?.width ?? "-"} × ${data.image_size?.height ?? "-"}`], + ["DetrImageProcessor", `pixel_values ${formatShape(data.pixel_values_shape)} / pixel_mask ${formatShape(data.pixel_mask_shape)}`], + ["ResNet-50 backbone", `feature map ${formatShape(data.feature_map_shape)}`], + ["1×1 convolution", `projected ${formatShape(data.projected_feature_map_shape)}`], + ["视觉 token embedding", `由 projected feature map flatten 得到 ${formatShape(data.visual_tokens_shape)}`], + ["位置 embedding", `二维位置 embedding ${formatShape(data.position_encoding_shape)}`], + ["Transformer Encoder", formatShape(data.encoder_last_hidden_state_shape)], + ["Object query embedding + Decoder", `object query embedding 解码后 ${formatShape(data.decoder_last_hidden_state_shape)}`], + ["类别 logits + boxes", `logits ${formatShape(data.logits_shape)} / boxes ${formatShape(data.pred_boxes_shape)}`], + ["post_process_object_detection", `检测结果 ${data.detections?.length ?? 0} 个`], + ]; + + pipelineEl.innerHTML = steps + .map(([title, value], index) => ` +
使用 Python、OpenCV、PyTorch、Transformers DETR 和 FastAPI,Mac mini m2 上运行。
+ +DETR动态打标:Python、OpenCV、PyTorch、Transformers DETR 和 FastAPI。
+实时展示当前视频帧从图像预处理、视觉特征、token 序列到检测输出的过程。
+