update at 2026-06-04 14:09:16
This commit is contained in:
1
.claude/worktrees/agent-a02a887cdc0c41851
Submodule
1
.claude/worktrees/agent-a02a887cdc0c41851
Submodule
Submodule .claude/worktrees/agent-a02a887cdc0c41851 added at 41bd03123c
@@ -36,7 +36,79 @@ class DetrVehicleDetector:
|
||||
inputs = {key: value.to(self.device) for key, value in inputs.items()}
|
||||
|
||||
outputs = self.model(**inputs)
|
||||
# DETR 后处理需要原图尺寸,PIL size 是 (宽, 高),这里转成 (高, 宽)。
|
||||
return self._detections_from_outputs(image, outputs)
|
||||
|
||||
@torch.no_grad()
def inspect_tokens(self, frame_rgb: Any) -> dict[str, Any]:
    """Run one frame through DETR and describe each intermediate stage.

    The backbone and the input projection are invoked directly so that the
    visual tokens can be inspected, then the full model is run to obtain
    the predictions.

    Args:
        frame_rgb: Image array accepted by ``Image.fromarray``.

    Returns:
        A dict with the image size, the shape of every intermediate tensor,
        a preview of the first visual tokens (at most 48 tokens, first 8
        channels each) and the detections whose label is listed in
        ``self.vehicle_labels``.
    """
    image = Image.fromarray(frame_rgb)
    encoded = self.processor(images=image, return_tensors="pt")
    inputs = {name: tensor.to(self.device) for name, tensor in encoded.items()}

    # Re-run the first stages by hand so their tensors can be inspected.
    detr = self.model.model
    features, object_queries_list = detr.backbone(inputs["pixel_values"], inputs["pixel_mask"])
    feature_map, _ = features[-1]
    projected_feature_map = detr.input_projection(feature_map)
    # Flatten the two spatial axes and move them in front of the channel axis.
    tokens = projected_feature_map.flatten(2).permute(0, 2, 1)
    object_queries = object_queries_list[-1].flatten(2).permute(0, 2, 1)

    # NOTE(review): the hidden-state tuples requested here do not appear to be
    # read below; confirm before dropping output_hidden_states=True.
    outputs = self.model(**inputs, output_hidden_states=True)
    # PIL reports (width, height); post-processing takes (height, width).
    target_sizes = torch.tensor([image.size[::-1]], device=self.device)
    results = self.processor.post_process_object_detection(
        outputs,
        target_sizes=target_sizes,
        threshold=self.confidence,
    )[0]

    token_rows = int(projected_feature_map.shape[2])
    token_cols = int(projected_feature_map.shape[3])
    total_tokens = int(tokens.shape[1])
    sample_count = min(48, total_tokens)
    sampled = tokens[0, :sample_count].detach().cpu()
    # The norm covers the whole embedding, not only the 8 previewed channels,
    # so the value shown as "L2" describes the token itself.
    magnitudes = sampled.norm(dim=-1).tolist()
    previews = sampled[:, :8].tolist()
    token_sequence = [
        {
            "index": index,
            "row": index // token_cols,
            "col": index % token_cols,
            "values": [round(float(value), 4) for value in preview],
            "magnitude": round(float(magnitude), 4),
        }
        for index, (preview, magnitude) in enumerate(zip(previews, magnitudes))
    ]

    id2label = self.model.config.id2label
    detections = []
    for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        label_name = id2label[label.item()]
        if label_name not in self.vehicle_labels:
            continue

        x1, y1, x2, y2 = box.detach().cpu().numpy().astype(int).tolist()
        detections.append(
            {
                "label": label_name,
                "score": round(float(score.detach().cpu()), 4),
                "box": [x1, y1, x2, y2],
            }
        )

    def shape_of(tensor: Any) -> list[int]:
        """Return the tensor's shape as a list, or [] when it is missing."""
        return [] if tensor is None else list(tensor.shape)

    return {
        "image_size": {"width": image.size[0], "height": image.size[1]},
        "pixel_values_shape": shape_of(inputs["pixel_values"]),
        "pixel_mask_shape": shape_of(inputs["pixel_mask"]),
        "feature_map_shape": shape_of(feature_map),
        "projected_feature_map_shape": shape_of(projected_feature_map),
        "visual_tokens_shape": shape_of(tokens),
        "position_encoding_shape": shape_of(object_queries),
        "encoder_last_hidden_state_shape": shape_of(getattr(outputs, "encoder_last_hidden_state", None)),
        "decoder_last_hidden_state_shape": shape_of(getattr(outputs, "last_hidden_state", None)),
        "logits_shape": shape_of(outputs.logits),
        "pred_boxes_shape": shape_of(outputs.pred_boxes),
        "token_grid": {"rows": token_rows, "cols": token_cols, "total": total_tokens, "shown": sample_count},
        "token_sequence": token_sequence,
        "detections": detections,
    }
|
||||
|
||||
def _detections_from_outputs(self, image: Image.Image, outputs: Any) -> list[dict[str, Any]]:
|
||||
target_sizes = torch.tensor([image.size[::-1]], device=self.device)
|
||||
results = self.processor.post_process_object_detection(
|
||||
outputs,
|
||||
@@ -60,3 +132,4 @@ class DetrVehicleDetector:
|
||||
)
|
||||
|
||||
return detections
|
||||
|
||||
|
||||
@@ -33,11 +33,16 @@ class DeviceManager:
|
||||
if device_num not in {device.device_num for device in self.devices}:
|
||||
raise ValueError("设备不在 devicelist.env 中")
|
||||
with self.lock:
|
||||
old_device_num = self.current_device_num
|
||||
self.current_device_num = device_num
|
||||
self.current_url = ""
|
||||
self.timings = {}
|
||||
self.updated_at = time.time()
|
||||
self.version += 1
|
||||
print(
|
||||
f"[device-switch] manager set old={old_device_num} new={device_num} version={self.version}",
|
||||
flush=True,
|
||||
)
|
||||
return self.version
|
||||
|
||||
def resolve_stream_url(self) -> str:
|
||||
@@ -49,16 +54,38 @@ class DeviceManager:
|
||||
return self.fallback_url
|
||||
raise RuntimeError("devicelist.env 中没有可用设备号")
|
||||
|
||||
print(f"[device-switch] resolve start device={device_num} version={version}", flush=True)
|
||||
try:
|
||||
result = self.api_client.get_stream_url_details(device_num)
|
||||
except Exception as exc:
|
||||
print(
|
||||
f"[device-switch] resolve failed device={device_num} version={version} error={exc}",
|
||||
flush=True,
|
||||
)
|
||||
raise
|
||||
with self.lock:
|
||||
# 避免旧摄像头的慢接口响应覆盖用户刚切换的新选择。
|
||||
if version != self.version or device_num != self.current_device_num:
|
||||
print(
|
||||
f"[device-switch] resolve stale device={device_num} version={version} current={self.current_device_num} current_version={self.version}",
|
||||
flush=True,
|
||||
)
|
||||
return self.current_url
|
||||
self.current_url = result.url
|
||||
self.timings = dict(result.timings)
|
||||
self.updated_at = time.time()
|
||||
print(f"[device-switch] resolve success device={device_num} version={version}", flush=True)
|
||||
return result.url
|
||||
|
||||
def resolve_stream_url_for(self, device_num: str) -> str:
    """Resolve the stream URL of one configured device.

    Only ``self.devices`` and ``self.api_client`` are used; the manager's
    current-device fields are neither read nor written.

    Args:
        device_num: Device number that must appear in ``self.devices``.

    Returns:
        The ``url`` attribute of the API client's stream details.

    Raises:
        ValueError: If ``device_num`` is not one of the configured devices.
    """
    is_unknown = all(device.device_num != device_num for device in self.devices)
    if is_unknown:
        raise ValueError("设备不在 devicelist.env 中")
    return self.api_client.get_stream_url_details(device_num).url
|
||||
|
||||
def get_video_grid_devices(self, limit: int = 4) -> list[Device]:
    """Return the first ``limit`` configured devices for the video grid.

    Args:
        limit: Maximum number of devices to return. Zero or a negative
            value yields an empty list.

    Returns:
        A new list holding at most ``limit`` entries of ``self.devices``,
        in their configured order.
    """
    # Clamp so a negative limit cannot turn into "all but the last N".
    return self.devices[: max(limit, 0)]
|
||||
|
||||
def get_snapshot(self) -> dict[str, Any]:
|
||||
with self.lock:
|
||||
return {
|
||||
|
||||
76
app/main.py
76
app/main.py
@@ -45,6 +45,18 @@ worker = StreamWorker(
|
||||
resize_width=settings.resize_width,
|
||||
)
|
||||
|
||||
video_grid_devices = device_manager.get_video_grid_devices()


def _grid_stream_url_resolver(device_num):
    """Return a zero-argument callable that resolves *device_num*'s stream URL."""

    def resolve():
        return device_manager.resolve_stream_url_for(device_num)

    return resolve


# One worker per grid device, keyed by device number.
# NOTE(review): every worker receives the same `detector` instance; confirm it
# is safe to call from several workers at once.
video_grid_workers = {
    device.device_num: StreamWorker(
        stream_url=_grid_stream_url_resolver(device.device_num),
        detector=detector,
        frame_skip=settings.frame_skip,
        jpeg_quality=settings.jpeg_quality,
        resize_width=settings.resize_width,
    )
    for device in video_grid_devices
}
|
||||
|
||||
app = FastAPI(title="DETR 动态打标")
|
||||
app.mount("/static", StaticFiles(directory="app/static"), name="static")
|
||||
templates = Jinja2Templates(directory="app/templates")
|
||||
@@ -57,11 +69,15 @@ def display_model_name(model_name: str) -> str:
|
||||
@app.on_event("startup")
def startup() -> None:
    """Start the main stream worker, then every video-grid worker."""
    # NOTE(review): newer FastAPI releases describe lifespan handlers as the
    # replacement for on_event; consider migrating startup/shutdown together.
    for stream_worker in (worker, *video_grid_workers.values()):
        stream_worker.start()
|
||||
|
||||
|
||||
@app.on_event("shutdown")
def shutdown() -> None:
    """Stop the main stream worker, then every video-grid worker."""
    for stream_worker in (worker, *video_grid_workers.values()):
        stream_worker.stop()
|
||||
|
||||
|
||||
@app.get("/", response_class=HTMLResponse)
|
||||
@@ -73,15 +89,67 @@ def index(request: Request) -> HTMLResponse:
|
||||
"model": display_model_name(settings.detr_model),
|
||||
"device": detector.device_name,
|
||||
"stream_url": f"设备号:{device_manager.get_snapshot()['current_device_num']}",
|
||||
"video_grid_devices": video_grid_devices,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
@app.get("/tokenizer", response_class=HTMLResponse)
def tokenizer(request: Request) -> HTMLResponse:
    """Render the tokenizer visualisation page."""
    context = {
        "request": request,
        "model": display_model_name(settings.detr_model),
        "device": detector.device_name,
    }
    return templates.TemplateResponse("tokenizer.html", context)
|
||||
|
||||
|
||||
@app.get("/tokenizer/state")
def tokenizer_state() -> JSONResponse:
    """Return the token inspection of the main worker's latest frame as JSON.

    When no frame is available yet the response carries ``ready: False``
    together with the worker's connection state and an error message.
    """
    # NOTE(review): the snapshot and the frame come from two separate calls,
    # so frame_id may describe a different frame than the one inspected.
    snapshot = worker.get_snapshot()
    frame = worker.get_frame_rgb()

    if frame is not None:
        payload = {
            **detector.inspect_tokens(frame),
            "ready": True,
            "frame_id": snapshot["frame_id"],
            "updated_at": snapshot["updated_at"],
            "connected": snapshot["connected"],
            "error": snapshot["error"],
        }
        return JSONResponse(payload)

    return JSONResponse(
        {
            "ready": False,
            "frame_id": snapshot["frame_id"],
            "connected": snapshot["connected"],
            "error": snapshot["error"] or "等待视频帧",
        }
    )
|
||||
|
||||
|
||||
@app.get("/video")
def video() -> StreamingResponse:
    """Stream the frames produced by the main worker."""
    main_stream = stream_video(worker)
    return main_stream
|
||||
|
||||
|
||||
@app.get("/video/{device_num}")
def video_device(device_num: str) -> StreamingResponse:
    """Stream the frames of one video-grid device.

    Raises:
        HTTPException: 404 when ``device_num`` has no worker in the grid.
    """
    try:
        grid_worker = video_grid_workers[device_num]
    except KeyError:
        raise HTTPException(status_code=404, detail="设备不在视频网格中") from None
    return stream_video(grid_worker)
|
||||
|
||||
|
||||
def stream_video(stream_worker: StreamWorker) -> StreamingResponse:
|
||||
async def generate():
|
||||
while True:
|
||||
frame = worker.get_jpeg()
|
||||
frame = stream_worker.get_jpeg()
|
||||
if frame is None:
|
||||
await asyncio.sleep(0.1)
|
||||
continue
|
||||
@@ -137,11 +205,13 @@ def status() -> JSONResponse:
|
||||
@app.post("/devices/{device_num}")
def switch_device(device_num: str) -> JSONResponse:
    """Select another device and ask the main worker to reconnect.

    Returns:
        JSON with the accepted device number and the device manager's new
        version counter.

    Raises:
        HTTPException: 404 when the device manager rejects the device
            number with ``ValueError``.
    """
    try:
        # Call exactly once: every call bumps the manager's version.
        version = device_manager.set_current_device(device_num)
    except ValueError as exc:
        print(f"[device-switch] invalid device={device_num}", flush=True)
        raise HTTPException(status_code=404, detail=str(exc)) from exc
    worker.reconnect()
    print(f"[device-switch] accepted device={device_num} version={version}", flush=True)
    return JSONResponse({"current_device_num": device_num, "version": version})
|
||||
|
||||
|
||||
@app.websocket("/ws/detections")
|
||||
|
||||
@@ -13,7 +13,10 @@ const timingFrameEl = document.querySelector("#timing-frame");
|
||||
|
||||
let selectedDevice = "";
|
||||
let pendingDevice = "";
|
||||
let queuedDevice = "";
|
||||
let switching = false;
|
||||
let devicesSignature = "";
|
||||
let lastWsSignature = "";
|
||||
|
||||
function setConnection(online, text) {
|
||||
connection.textContent = text;
|
||||
@@ -84,17 +87,55 @@ function renderDetections(detections) {
|
||||
.join("");
|
||||
}
|
||||
|
||||
/**
 * Ask the server to switch to `deviceNum` and reload the video elements.
 *
 * While the request is in flight `switching` is true, so switchDevice()
 * only records the latest requested device in `queuedDevice`. Once this
 * switch settles, a queued device that differs from `deviceNum` is
 * switched to next.
 *
 * @param {string} deviceNum Device number to switch to.
 * @returns {Promise<void>} Settles after this switch and any queued one.
 */
async function performSwitch(deviceNum) {
  switching = true;
  pendingDevice = deviceNum;
  devicesSignature = "";
  setConnection(false, "切换中");
  console.log("[device-switch] start", { deviceNum });

  try {
    const response = await fetch(`/devices/${encodeURIComponent(deviceNum)}`, { method: "POST" });
    if (!response.ok) {
      throw new Error("切换摄像头失败");
    }
    const result = await response.json();

    // A changing query string forces the browser to reopen each stream.
    const video = document.querySelector("#video");
    if (video) {
      video.src = `/video?t=${Date.now()}`;
    }
    document.querySelectorAll(".grid-video").forEach((item) => {
      item.src = `${item.dataset.src}?t=${Date.now()}`;
    });
    console.log("[device-switch] requested", { deviceNum, version: result.version });
  } catch (error) {
    pendingDevice = "";
    devicesSignature = "";
    setConnection(false, "切换失败");
    console.error("[device-switch] failed", { deviceNum, error });
  } finally {
    switching = false;
  }

  // Handled after the try statement rather than inside `finally`: a
  // `return` in `finally` would discard any exception still propagating.
  const nextDevice = queuedDevice;
  queuedDevice = "";
  if (nextDevice && nextDevice !== deviceNum) {
    return performSwitch(nextDevice);
  }
}
|
||||
|
||||
/**
 * Switch to `deviceNum` now, or queue it when a switch is already running.
 *
 * Only the most recently queued device is kept.
 *
 * @param {string} deviceNum Device number chosen by the user.
 * @returns {Promise<void>} Already resolved when the request was queued.
 */
function switchDevice(deviceNum) {
  if (!switching) {
    return performSwitch(deviceNum);
  }

  queuedDevice = deviceNum;
  pendingDevice = deviceNum;
  devicesSignature = "";
  setConnection(false, "等待切换");
  console.log("[device-switch] queued", { deviceNum });
  return Promise.resolve();
}
|
||||
|
||||
function connectWebSocket() {
|
||||
const protocol = window.location.protocol === "https:" ? "wss" : "ws";
|
||||
@@ -109,6 +150,18 @@ function connectWebSocket() {
|
||||
errorEl.textContent = data.error || (data.connected ? "正常" : "未连接");
|
||||
sourceEl.textContent = data.source || "-";
|
||||
setConnection(Boolean(data.connected), data.connected ? "已连接" : "重连中");
|
||||
const wsSignature = `${data.current_device_num}|${data.connected}|${data.frame_id}|${data.error || ""}`;
|
||||
if (wsSignature !== lastWsSignature) {
|
||||
lastWsSignature = wsSignature;
|
||||
console.log("[device-switch] ws", {
|
||||
currentDeviceNum: data.current_device_num,
|
||||
pendingDevice,
|
||||
queuedDevice,
|
||||
connected: data.connected,
|
||||
frameId: data.frame_id,
|
||||
error: data.error,
|
||||
});
|
||||
}
|
||||
if (pendingDevice && data.current_device_num === pendingDevice) {
|
||||
pendingDevice = "";
|
||||
deviceSelect.disabled = false;
|
||||
|
||||
@@ -73,6 +73,30 @@ p {
|
||||
color: var(--red);
|
||||
}
|
||||
|
||||
.topbar-actions {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 12px;
|
||||
}
|
||||
|
||||
.button-link {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
min-height: 36px;
|
||||
padding: 8px 14px;
|
||||
border: 1px solid var(--line);
|
||||
border-radius: 999px;
|
||||
color: var(--text);
|
||||
text-decoration: none;
|
||||
background: var(--panel);
|
||||
}
|
||||
|
||||
.button-link:hover {
|
||||
border-color: var(--green);
|
||||
color: var(--green);
|
||||
}
|
||||
|
||||
.layout {
|
||||
display: grid;
|
||||
grid-template-columns: minmax(0, 1fr) 360px;
|
||||
@@ -92,33 +116,6 @@ p {
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.pipeline {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 10px;
|
||||
padding: 14px;
|
||||
border-bottom: 1px solid var(--line);
|
||||
overflow-x: auto;
|
||||
}
|
||||
|
||||
.stage {
|
||||
flex: 0 0 auto;
|
||||
padding: 9px 12px;
|
||||
border: 1px solid var(--line);
|
||||
border-radius: 10px;
|
||||
color: var(--muted);
|
||||
background: var(--panel-2);
|
||||
}
|
||||
|
||||
.stage.active {
|
||||
border-color: rgba(46, 232, 135, 0.5);
|
||||
color: var(--green);
|
||||
}
|
||||
|
||||
.arrow {
|
||||
color: var(--muted);
|
||||
}
|
||||
|
||||
.video-wrap {
|
||||
display: grid;
|
||||
place-items: center;
|
||||
@@ -126,7 +123,8 @@ p {
|
||||
background: #05070b;
|
||||
}
|
||||
|
||||
#video {
|
||||
#video,
|
||||
.grid-video {
|
||||
display: block;
|
||||
width: 100%;
|
||||
height: auto;
|
||||
@@ -134,6 +132,36 @@ p {
|
||||
object-fit: contain;
|
||||
}
|
||||
|
||||
.video-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(2, minmax(0, 1fr));
|
||||
gap: 14px;
|
||||
padding: 14px;
|
||||
background: #05070b;
|
||||
}
|
||||
|
||||
.video-grid-item {
|
||||
overflow: hidden;
|
||||
border: 1px solid var(--line);
|
||||
border-radius: 14px;
|
||||
background: var(--panel-2);
|
||||
}
|
||||
|
||||
.video-grid-title {
|
||||
padding: 10px 12px;
|
||||
border-bottom: 1px solid var(--line);
|
||||
color: var(--muted);
|
||||
font-size: 13px;
|
||||
}
|
||||
|
||||
.video-grid-wrap {
|
||||
min-height: 240px;
|
||||
}
|
||||
|
||||
.grid-video {
|
||||
max-height: calc((100vh - 260px) / 2);
|
||||
}
|
||||
|
||||
.side-card {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
@@ -166,13 +194,19 @@ p {
|
||||
background: var(--panel-2);
|
||||
}
|
||||
|
||||
.detections-panel {
|
||||
padding: 16px;
|
||||
border-top: 1px solid var(--line);
|
||||
}
|
||||
|
||||
.detections {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
display: grid;
|
||||
grid-template-columns: repeat(4, minmax(0, 1fr));
|
||||
gap: 10px;
|
||||
}
|
||||
|
||||
.detections.empty {
|
||||
display: block;
|
||||
color: var(--muted);
|
||||
}
|
||||
|
||||
@@ -197,6 +231,133 @@ p {
|
||||
font-size: 12px;
|
||||
}
|
||||
|
||||
.tokenizer-page .tokenizer-layout {
|
||||
display: grid;
|
||||
grid-template-columns: minmax(0, 1fr) minmax(0, 2fr);
|
||||
grid-template-areas: "flow side";
|
||||
align-items: start;
|
||||
width: 100%;
|
||||
gap: 18px;
|
||||
padding: 18px;
|
||||
}
|
||||
|
||||
.tokenizer-page .tokenizer-side {
|
||||
display: grid;
|
||||
grid-area: side;
|
||||
min-width: 0;
|
||||
gap: 18px;
|
||||
}
|
||||
|
||||
.tokenizer-page .tokenizer-flow-card {
|
||||
grid-area: flow;
|
||||
min-width: 0;
|
||||
min-height: calc(100vh - 122px);
|
||||
}
|
||||
|
||||
.tokenizer-page .tokenizer-side .detections {
|
||||
grid-template-columns: repeat(3, minmax(0, 1fr));
|
||||
}
|
||||
|
||||
.tokenizer-card {
|
||||
border: 1px solid var(--line);
|
||||
border-radius: 18px;
|
||||
padding: 18px;
|
||||
background: rgba(21, 27, 38, 0.9);
|
||||
box-shadow: 0 18px 40px rgba(0, 0, 0, 0.28);
|
||||
}
|
||||
|
||||
.pipeline-steps {
|
||||
display: grid;
|
||||
gap: 10px;
|
||||
}
|
||||
|
||||
.pipeline-step {
|
||||
display: grid;
|
||||
grid-template-columns: 34px minmax(0, 1fr);
|
||||
gap: 10px;
|
||||
align-items: start;
|
||||
padding: 12px;
|
||||
border: 1px solid var(--line);
|
||||
border-radius: 12px;
|
||||
background: var(--panel-2);
|
||||
}
|
||||
|
||||
.step-index {
|
||||
display: grid;
|
||||
place-items: center;
|
||||
width: 28px;
|
||||
height: 28px;
|
||||
border-radius: 999px;
|
||||
color: #06100b;
|
||||
font-weight: 700;
|
||||
background: var(--green);
|
||||
}
|
||||
|
||||
.step-title {
|
||||
margin-bottom: 5px;
|
||||
font-weight: 700;
|
||||
}
|
||||
|
||||
.step-value,
|
||||
.token-summary,
|
||||
.selected-token {
|
||||
color: var(--muted);
|
||||
font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
|
||||
font-size: 13px;
|
||||
word-break: break-all;
|
||||
}
|
||||
|
||||
.token-sequence {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(12, minmax(0, 1fr));
|
||||
gap: 8px;
|
||||
margin-top: 14px;
|
||||
}
|
||||
|
||||
.token-cell {
|
||||
min-height: 50px;
|
||||
border: 1px solid var(--line);
|
||||
border-radius: 10px;
|
||||
color: var(--text);
|
||||
cursor: pointer;
|
||||
background: var(--panel-2);
|
||||
}
|
||||
|
||||
.token-cell span,
|
||||
.token-cell small {
|
||||
display: block;
|
||||
}
|
||||
|
||||
.token-cell small {
|
||||
margin-top: 3px;
|
||||
color: var(--muted);
|
||||
}
|
||||
|
||||
.token-cell.selected,
|
||||
.token-cell:hover {
|
||||
border-color: var(--green);
|
||||
color: var(--green);
|
||||
}
|
||||
|
||||
.token-detail-title {
|
||||
margin-bottom: 10px;
|
||||
color: var(--green);
|
||||
font-weight: 700;
|
||||
}
|
||||
|
||||
.token-vector {
|
||||
padding: 12px;
|
||||
border: 1px solid var(--line);
|
||||
border-radius: 12px;
|
||||
background: var(--panel-2);
|
||||
}
|
||||
|
||||
@media (max-width: 1280px) {
|
||||
.detections {
|
||||
grid-template-columns: repeat(4, minmax(0, 1fr));
|
||||
}
|
||||
}
|
||||
|
||||
@media (max-width: 980px) {
|
||||
.layout {
|
||||
grid-template-columns: 1fr;
|
||||
@@ -206,4 +367,18 @@ p {
|
||||
align-items: flex-start;
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.detections {
|
||||
grid-template-columns: repeat(3, minmax(0, 1fr));
|
||||
}
|
||||
}
|
||||
|
||||
@media (max-width: 640px) {
|
||||
.video-grid {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
|
||||
.detections {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
}
|
||||
|
||||
131
app/static/tokenizer.js
Normal file
131
app/static/tokenizer.js
Normal file
@@ -0,0 +1,131 @@
|
||||
const statusEl = document.querySelector("#tokenizer-status");
|
||||
const pipelineEl = document.querySelector("#pipeline-steps");
|
||||
const tokenSummaryEl = document.querySelector("#token-summary");
|
||||
const tokenSequenceEl = document.querySelector("#token-sequence");
|
||||
const selectedTokenEl = document.querySelector("#selected-token");
|
||||
const detectionsEl = document.querySelector("#tokenizer-detections");
|
||||
|
||||
let selectedTokenIndex = null;
|
||||
|
||||
/**
 * Format a tensor shape for display.
 *
 * @param {number[]|null|undefined} shape Dimension sizes.
 * @returns {string} "[a, b, c]", or "-" when the shape is missing or empty.
 */
function formatShape(shape) {
  return shape?.length ? `[${shape.join(", ")}]` : "-";
}
|
||||
|
||||
/**
 * Show `text` in the status badge and mark it online or offline.
 *
 * @param {boolean} ready True selects the "online" class, false "offline".
 * @param {string} text Badge caption.
 */
function setStatus(ready, text) {
  const { classList } = statusEl;
  statusEl.textContent = text;
  classList.toggle("online", ready);
  classList.toggle("offline", !ready);
}
|
||||
|
||||
/**
 * Render the numbered list of pipeline stages with their tensor shapes.
 *
 * @param {object} data Payload returned by /tokenizer/state.
 */
function renderPipeline(data) {
  const width = data.image_size?.width ?? "-";
  const height = data.image_size?.height ?? "-";
  const imageSize = `${width} × ${height}`;
  const detectionCount = data.detections?.length ?? 0;

  const stages = [
    { title: "OpenCV RGB 帧", value: imageSize },
    { title: "PIL Image", value: imageSize },
    {
      title: "DetrImageProcessor",
      value: `pixel_values ${formatShape(data.pixel_values_shape)} / pixel_mask ${formatShape(data.pixel_mask_shape)}`,
    },
    { title: "ResNet-50 backbone", value: `feature map ${formatShape(data.feature_map_shape)}` },
    { title: "1×1 convolution", value: `projected ${formatShape(data.projected_feature_map_shape)}` },
    {
      title: "视觉 token embedding",
      value: `由 projected feature map flatten 得到 ${formatShape(data.visual_tokens_shape)}`,
    },
    { title: "位置 embedding", value: `二维位置 embedding ${formatShape(data.position_encoding_shape)}` },
    { title: "Transformer Encoder", value: formatShape(data.encoder_last_hidden_state_shape) },
    {
      title: "Object query embedding + Decoder",
      value: `object query embedding 解码后 ${formatShape(data.decoder_last_hidden_state_shape)}`,
    },
    {
      title: "类别 logits + boxes",
      value: `logits ${formatShape(data.logits_shape)} / boxes ${formatShape(data.pred_boxes_shape)}`,
    },
    { title: "post_process_object_detection", value: `检测结果 ${detectionCount} 个` },
  ];

  const markup = [];
  stages.forEach((stage, position) => {
    markup.push(`
      <div class="pipeline-step">
        <div class="step-index">${position + 1}</div>
        <div>
          <div class="step-title">${stage.title}</div>
          <div class="step-value">${stage.value}</div>
        </div>
      </div>
    `);
  });
  pipelineEl.innerHTML = markup.join("");
}
|
||||
|
||||
/**
 * Render the token summary, the clickable token grid and the detail panel.
 *
 * @param {object} data Payload returned by /tokenizer/state.
 */
function renderTokens(data) {
  const grid = data.token_grid || {};
  tokenSummaryEl.textContent = `帧号 ${data.frame_id ?? "-"} · token 网格 ${grid.rows ?? "-"} × ${grid.cols ?? "-"},总数 ${grid.total ?? "-"},展示前 ${grid.shown ?? 0} 个 token,每个显示前 8 维采样。`;

  // Resolve the selection first: renderSelectedToken() falls back to the
  // first token and updates selectedTokenIndex, so the grid built below
  // highlights the cell that the detail panel is showing.
  renderSelectedToken(data);

  tokenSequenceEl.innerHTML = (data.token_sequence || [])
    .map((token) => `
      <button class="token-cell ${token.index === selectedTokenIndex ? "selected" : ""}" data-index="${token.index}">
        <span>#${token.index}</span>
        <small>(${token.row}, ${token.col})</small>
      </button>
    `)
    .join("");

  // One delegated handler, reassigned on every render so it always closes
  // over the latest payload.
  tokenSequenceEl.onclick = (event) => {
    const button = event.target.closest(".token-cell");
    if (!button || !tokenSequenceEl.contains(button)) {
      return;
    }
    selectedTokenIndex = Number(button.dataset.index);
    renderTokens(data);
  };
}
|
||||
|
||||
/**
 * Show the selected token's details, falling back to the first token.
 *
 * Updates `selectedTokenIndex` to the token actually displayed.
 *
 * @param {object} data Payload returned by /tokenizer/state.
 */
function renderSelectedToken(data) {
  const sequence = data.token_sequence || [];
  const [firstToken] = sequence;
  const match = sequence.find(({ index }) => index === selectedTokenIndex);
  const token = match || firstToken;

  if (!token) {
    selectedTokenEl.textContent = "暂无 token。";
    return;
  }

  selectedTokenIndex = token.index;
  const preview = token.values.map((value) => Number(value).toFixed(4)).join(", ");
  selectedTokenEl.innerHTML = `
    <div class="token-detail-title">Token #${token.index} · 网格位置 (${token.row}, ${token.col}) · L2 ${token.magnitude}</div>
    <div class="token-vector">[${preview}, ...]</div>
  `;
}
|
||||
|
||||
/**
 * Render the detection cards, or a placeholder when there are none.
 *
 * @param {Array<{label: string, score: number, box: number[]}>} detections
 */
function renderDetections(detections) {
  const hasDetections = detections.length > 0;
  detectionsEl.className = hasDetections ? "detections" : "detections empty";

  if (!hasDetections) {
    detectionsEl.textContent = "暂无目标";
    return;
  }

  const cards = [];
  for (const det of detections) {
    const percent = (det.score * 100).toFixed(1);
    cards.push(`
      <div class="det-item">
        <div class="det-title">
          <span>${det.label}</span>
          <span>${percent}%</span>
        </div>
        <div class="det-box">box: [${det.box.join(", ")}]</div>
      </div>
    `);
  }
  detectionsEl.innerHTML = cards.join("");
}
|
||||
|
||||
/**
 * Fetch /tokenizer/state, render it, and schedule the next refresh.
 *
 * The next poll is scheduled 1200 ms after the current one settles, whether
 * it succeeded or failed.
 */
async function refreshTokenizer() {
  try {
    const response = await fetch(`/tokenizer/state?t=${Date.now()}`);
    // Without this check an HTTP error with a JSON body would be reported
    // as "waiting for frame" because the body has no `ready` field.
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }
    const data = await response.json();
    if (!data.ready) {
      setStatus(false, data.error || "等待帧");
      tokenSummaryEl.textContent = data.error || "等待视频帧";
      return;
    }

    setStatus(Boolean(data.connected), data.connected ? "动态更新中" : "未连接");
    renderPipeline(data);
    renderTokens(data);
    renderDetections(data.detections || []);
  } catch (error) {
    setStatus(false, "更新失败");
    tokenSummaryEl.textContent = `更新失败:${error}`;
  } finally {
    setTimeout(refreshTokenizer, 1200);
  }
}
|
||||
|
||||
refreshTokenizer();
|
||||
@@ -26,6 +26,7 @@ class StreamWorker:
|
||||
|
||||
self.lock = threading.Lock()
|
||||
self.latest_jpeg: bytes | None = None
|
||||
self.latest_frame_rgb: Any | None = None
|
||||
self.latest_detections: list[dict[str, Any]] = []
|
||||
self.frame_id = 0
|
||||
self.updated_at = 0.0
|
||||
@@ -56,21 +57,30 @@ class StreamWorker:
|
||||
def reconnect(self) -> None:
    """Reset the published state and request a reconnect from the run loop."""
    with self.lock:
        # Drop everything produced from the previous source.
        self.latest_jpeg = self.latest_frame_rgb = None
        self.latest_detections = []
        self.frame_id = 0
        self.fps = 0.0
        self.resolve_ms = self.open_ms = self.first_frame_ms = 0.0

        # Publish the interim connection state.
        self.connected = False
        self.error = "正在切换视频源"

        # Raise the request flag and bump the version the run loop reads.
        self.reconnect_requested = True
        self.reconnect_version += 1
        version = self.reconnect_version
    print(f"[device-switch] worker reconnect requested version={version}", flush=True)
|
||||
|
||||
def get_jpeg(self) -> bytes | None:
|
||||
with self.lock:
|
||||
return self.latest_jpeg
|
||||
|
||||
def get_frame_rgb(self) -> Any | None:
|
||||
with self.lock:
|
||||
if self.latest_frame_rgb is None:
|
||||
return None
|
||||
return self.latest_frame_rgb.copy()
|
||||
|
||||
def get_snapshot(self) -> dict[str, Any]:
|
||||
with self.lock:
|
||||
return {
|
||||
@@ -100,16 +110,29 @@ class StreamWorker:
|
||||
run_version = self.reconnect_version
|
||||
self.reconnect_requested = False
|
||||
if should_reconnect:
|
||||
print(f"[device-switch] worker reconnect handling version={run_version}", flush=True)
|
||||
# 切换摄像头时必须释放旧连接,否则 OpenCV 会继续阻塞读旧流。
|
||||
if cap is not None:
|
||||
cap.release()
|
||||
cap = None
|
||||
print(f"[device-switch] worker released old capture version={run_version}", flush=True)
|
||||
|
||||
if cap is None or not cap.isOpened():
|
||||
started = time.monotonic()
|
||||
try:
|
||||
stream_url = self.stream_url() if callable(self.stream_url) else self.stream_url
|
||||
except Exception as exc:
|
||||
resolve_ms = round((time.monotonic() - started) * 1000, 2)
|
||||
with self.lock:
|
||||
self.resolve_ms = resolve_ms
|
||||
self.open_ms = 0.0
|
||||
self.first_frame_ms = 0.0
|
||||
self._set_connection_state(False, f"获取播放地址失败:{exc},2 秒后重试")
|
||||
time.sleep(2)
|
||||
continue
|
||||
resolve_ms = round((time.monotonic() - started) * 1000, 2)
|
||||
started = time.monotonic()
|
||||
print(f"[device-switch] worker open start version={run_version}", flush=True)
|
||||
cap = cv2.VideoCapture(stream_url)
|
||||
open_ms = round((time.monotonic() - started) * 1000, 2)
|
||||
with self.lock:
|
||||
@@ -117,11 +140,13 @@ class StreamWorker:
|
||||
self.resolve_ms = resolve_ms
|
||||
self.first_frame_ms = 0.0
|
||||
if not cap.isOpened():
|
||||
print(f"[device-switch] worker open failed version={run_version} open_ms={open_ms}", flush=True)
|
||||
self._set_connection_state(False, "无法打开视频流,2 秒后重试")
|
||||
cap.release()
|
||||
cap = None
|
||||
time.sleep(2)
|
||||
continue
|
||||
print(f"[device-switch] worker open success version={run_version} open_ms={open_ms}", flush=True)
|
||||
self._set_connection_state(True, "已连接")
|
||||
|
||||
started = time.monotonic()
|
||||
@@ -143,9 +168,9 @@ class StreamWorker:
|
||||
|
||||
frame = self._resize(frame)
|
||||
self.frame_id += 1
|
||||
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
||||
|
||||
if self.frame_id % self.frame_skip == 0:
|
||||
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
||||
last_detections = self.detector.detect(frame_rgb)
|
||||
|
||||
annotated = self._draw(frame, last_detections)
|
||||
@@ -173,6 +198,7 @@ class StreamWorker:
|
||||
|
||||
with self.lock:
|
||||
self.latest_jpeg = jpeg.tobytes()
|
||||
self.latest_frame_rgb = frame_rgb.copy()
|
||||
self.latest_detections = list(last_detections)
|
||||
self.updated_at = time.time()
|
||||
self.connected = True
|
||||
|
||||
@@ -9,26 +9,30 @@
|
||||
<body>
|
||||
<header class="topbar">
|
||||
<div>
|
||||
<h1>DETR 动态打标</h1>
|
||||
<p>使用 Python、OpenCV、PyTorch、Transformers DETR 和 FastAPI,Mac mini m2 上运行。</p>
|
||||
<!-- <h1>DETR 动态打标</h1> -->
|
||||
<p>DETR动态打标:Python、OpenCV、PyTorch、Transformers DETR 和 FastAPI。</p>
|
||||
</div>
|
||||
<div class="topbar-actions">
|
||||
<a class="button-link" href="/tokenizer">tokenizer</a>
|
||||
<div class="badge" id="connection">连接中</div>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<main class="layout">
|
||||
<section class="video-card">
|
||||
<div class="pipeline">
|
||||
<div class="stage active">源节点</div>
|
||||
<div class="arrow">→</div>
|
||||
<div class="stage active">DETR 推理</div>
|
||||
<div class="arrow">→</div>
|
||||
<div class="stage active">OSD 打标</div>
|
||||
<div class="arrow">→</div>
|
||||
<div class="stage active">FastAPI 输出</div>
|
||||
<div class="video-grid">
|
||||
{% for device_item in video_grid_devices %}
|
||||
<article class="video-grid-item">
|
||||
<div class="video-grid-title">{{ device_item.name }} · {{ device_item.device_num }}</div>
|
||||
<div class="video-wrap video-grid-wrap">
|
||||
<img class="grid-video" src="/video/{{ device_item.device_num | urlencode }}" data-src="/video/{{ device_item.device_num | urlencode }}" alt="{{ device_item.name }} 视频流" />
|
||||
</div>
|
||||
<div class="video-wrap">
|
||||
<img id="video" src="/video" alt="动态打标视频流" />
|
||||
</article>
|
||||
{% endfor %}
|
||||
</div>
|
||||
<section class="detections-panel">
|
||||
<div id="detections" class="detections empty">暂无目标</div>
|
||||
</section>
|
||||
</section>
|
||||
|
||||
<aside class="side-card">
|
||||
@@ -69,11 +73,6 @@
|
||||
<dd id="timing-frame">-</dd>
|
||||
</dl>
|
||||
</section>
|
||||
|
||||
<section>
|
||||
<h2>检测结果</h2>
|
||||
<div id="detections" class="detections empty">暂无目标</div>
|
||||
</section>
|
||||
</aside>
|
||||
</main>
|
||||
|
||||
|
||||
48
app/templates/tokenizer.html
Normal file
48
app/templates/tokenizer.html
Normal file
@@ -0,0 +1,48 @@
|
||||
<!doctype html>
|
||||
<html lang="zh-CN">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<title>DETR Tokenizer 动态可视化</title>
|
||||
<link rel="stylesheet" href="/static/style.css?v=tokenizer-layout-1-3" />
|
||||
</head>
|
||||
<body class="tokenizer-page">
|
||||
<header class="topbar">
|
||||
<div>
|
||||
<!--<h1>DETR Tokenizer 动态可视化</h1>-->
|
||||
<p>实时展示当前视频帧从图像预处理、视觉特征、token 序列到检测输出的过程。</p>
|
||||
</div>
|
||||
<div class="topbar-actions">
|
||||
<a class="button-link" href="/">返回视频</a>
|
||||
<div class="badge" id="tokenizer-status">等待帧</div>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<main class="tokenizer-layout">
|
||||
<section class="tokenizer-card tokenizer-flow-card">
|
||||
<h2>实时流程</h2>
|
||||
<div class="pipeline-steps" id="pipeline-steps"></div>
|
||||
</section>
|
||||
|
||||
<aside class="tokenizer-side">
|
||||
<section class="tokenizer-card">
|
||||
<h2>选中 Token</h2>
|
||||
<div class="selected-token" id="selected-token">点击下方 token 查看向量采样。</div>
|
||||
</section>
|
||||
|
||||
<section class="tokenizer-card">
|
||||
<h2>Token 序列</h2>
|
||||
<div class="token-summary" id="token-summary">等待视频帧</div>
|
||||
<div class="token-sequence" id="token-sequence"></div>
|
||||
</section>
|
||||
|
||||
<section class="tokenizer-card">
|
||||
<h2>检测输出</h2>
|
||||
<div id="tokenizer-detections" class="detections empty">暂无目标</div>
|
||||
</section>
|
||||
</aside>
|
||||
</main>
|
||||
|
||||
<script src="/static/tokenizer.js"></script>
|
||||
</body>
|
||||
</html>
|
||||
33
tokenizer.md
33
tokenizer.md
@@ -1,3 +1,4 @@
|
||||
|
||||
# DETR 的视觉 token 化过程说明
|
||||
|
||||
本文基于当前项目代码 `app/detector.py` 中的实现说明 DETR 的“token 化”过程。
|
||||
@@ -58,19 +59,39 @@ projected feature map: [batch, 256, H', W']
|
||||
↓
|
||||
flatten 空间维度 H' × W'
|
||||
↓
|
||||
visual tokens: [batch, H'×W', 256]
|
||||
visual token embeddings: [batch, H'×W', 256]
|
||||
↓
|
||||
加入二维位置编码
|
||||
加入二维位置 embedding
|
||||
↓
|
||||
Transformer Encoder
|
||||
↓
|
||||
Object Queries + Transformer Decoder
|
||||
Object query embeddings + Transformer Decoder
|
||||
↓
|
||||
类别 logits + 边界框 boxes
|
||||
↓
|
||||
post_process_object_detection 还原到原图坐标
|
||||
```
|
||||
|
||||
## Embedding 在 1-11 个环节中的位置
|
||||
|
||||
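在这个 DETR 流程里,embedding 不是单独只有一步,而是出现在 3 个关键环节。注意:下表中的步骤编号对应 `/tokenizer` 页面“实时流程”里的 1-11 步,而不是本文后面各小节的“第 N 步”编号(例如位置 embedding 在页面上是第 7 步,在本文小节中是第 5 步):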
|
||||
|
||||
| 页面步骤 | 名称 | embedding 含义 |
|
||||
| --- | --- | --- |
|
||||
| 第 6 步 | visual token embedding | `projected feature map` 经过 flatten 后,每个空间网格点变成一个 256 维视觉 token embedding。 |
|
||||
| 第 7 步 | position embedding | 给每个视觉 token 加入二维位置 embedding,让 Transformer 知道 token 原本在图像中的位置。 |
|
||||
| 第 9 步 | object query embedding | DETR 使用一组可学习的 object query embeddings 进入 Decoder,每个 query 最终预测一个候选目标。 |
|
||||
|
||||
所以如果问“embedding 在 1-11 哪个环节”,最核心的是:
|
||||
|
||||
```text
|
||||
第 6 步:产生视觉 token embedding
|
||||
第 7 步:加入位置 embedding
|
||||
第 9 步:object query embedding 进入 Decoder
|
||||
```
|
||||
|
||||
第 6 步是图像内容 embedding,第 7 步是空间位置 embedding,第 9 步是检测目标查询 embedding。
|
||||
|
||||
## 第 1 步:图像预处理
|
||||
|
||||
代码:
|
||||
@@ -212,7 +233,7 @@ x = x.permute(2, 0, 1) # [h*w, batch, 256]
|
||||
|
||||
两种写法本质相同,只是 Transformer 接口期望的维度顺序不同。
|
||||
|
||||
## 第 5 步:加入二维位置编码
|
||||
## 第 5 步:加入二维位置 embedding
|
||||
|
||||
Transformer 本身不理解图像中的二维空间位置。
|
||||
|
||||
@@ -264,11 +285,11 @@ Encoder 会通过 self-attention 建模图像中不同区域之间的关系。
|
||||
- 道路区域可以影响车辆判断。
|
||||
- 远处小目标可以和周围上下文一起被理解。
|
||||
|
||||
## 第 7 步:Object Queries 和 Transformer Decoder
|
||||
## 第 7 步:Object query embedding 和 Transformer Decoder
|
||||
|
||||
DETR 与传统检测器不同,它不是先生成大量 anchor box。
|
||||
|
||||
它使用一组可学习的 object queries。常见数量是:
|
||||
它使用一组可学习的 object query embeddings。常见数量是:
|
||||
|
||||
```text
|
||||
100 个 object queries
|
||||
|
||||
Reference in New Issue
Block a user