update at 2026-06-04 14:09:16
This commit is contained in:
1
.claude/worktrees/agent-a02a887cdc0c41851
Submodule
1
.claude/worktrees/agent-a02a887cdc0c41851
Submodule
Submodule .claude/worktrees/agent-a02a887cdc0c41851 added at 41bd03123c
@@ -36,7 +36,79 @@ class DetrVehicleDetector:
|
|||||||
inputs = {key: value.to(self.device) for key, value in inputs.items()}
|
inputs = {key: value.to(self.device) for key, value in inputs.items()}
|
||||||
|
|
||||||
outputs = self.model(**inputs)
|
outputs = self.model(**inputs)
|
||||||
# DETR 后处理需要原图尺寸,PIL size 是 (宽, 高),这里转成 (高, 宽)。
|
return self._detections_from_outputs(image, outputs)
|
||||||
|
|
||||||
|
@torch.no_grad()
def inspect_tokens(
    self,
    frame_rgb: Any,
    *,
    max_tokens: int = 48,
    sample_dims: int = 8,
) -> dict[str, Any]:
    """Run the model on one frame and describe every intermediate stage.

    The frame is converted to a PIL image, preprocessed, and pushed through
    the backbone and input projection by hand so their outputs can be
    reported; the full model is then run once more to obtain logits, boxes
    and hidden states.

    Args:
        frame_rgb: Array accepted by ``Image.fromarray``.
        max_tokens: Upper bound on how many visual tokens are sampled into
            ``token_sequence`` (values below zero are treated as zero).
        sample_dims: How many leading embedding dimensions are reported for
            each sampled token (values below zero are treated as zero).

    Returns:
        A dict of plain lists/ints/floats/strings: tensor shapes for each
        stage, the token grid description, the sampled tokens and the
        filtered detections.
    """
    image = Image.fromarray(frame_rgb)
    inputs = self.processor(images=image, return_tensors="pt")
    inputs = {key: value.to(self.device) for key, value in inputs.items()}

    # Backbone + 1x1 projection are invoked directly because the regular
    # forward pass does not hand these intermediate tensors back.
    detr = self.model.model
    features, object_queries_list = detr.backbone(inputs["pixel_values"], inputs["pixel_mask"])
    feature_map, _ = features[-1]  # the accompanying mask is not needed here
    projected_feature_map = detr.input_projection(feature_map)
    # (batch, channels, rows, cols) -> (batch, rows * cols, channels)
    tokens = projected_feature_map.flatten(2).permute(0, 2, 1)
    object_queries = object_queries_list[-1].flatten(2).permute(0, 2, 1)

    outputs = self.model(**inputs, output_hidden_states=True)
    # Post-processing wants (height, width); PIL's size is (width, height).
    target_sizes = torch.tensor([image.size[::-1]], device=self.device)
    results = self.processor.post_process_object_detection(
        outputs,
        target_sizes=target_sizes,
        threshold=self.confidence,
    )[0]

    token_rows = int(projected_feature_map.shape[2])
    token_cols = int(projected_feature_map.shape[3])
    sample_count = min(max(max_tokens, 0), int(tokens.shape[1]))
    sample_tokens = tokens[0, :sample_count, : max(sample_dims, 0)].detach().cpu()
    # NOTE(review): the magnitude is the L2 norm of the SAMPLED dimensions
    # only, not of the full token embedding - confirm that is what the UI
    # intends to show.
    magnitudes = sample_tokens.norm(dim=1).tolist()
    token_sequence = [
        {
            "index": index,
            "row": index // token_cols,
            "col": index % token_cols,
            "values": [round(value, 4) for value in values],
            "magnitude": round(magnitude, 4),
        }
        for index, (values, magnitude) in enumerate(zip(sample_tokens.tolist(), magnitudes))
    ]

    # Move each result tensor to the host once instead of once per detection.
    scores = results["scores"].detach().cpu().tolist()
    labels = results["labels"].detach().cpu().tolist()
    boxes = results["boxes"].detach().cpu().numpy().astype(int).tolist()
    id2label = self.model.config.id2label
    detections = []
    for score, label, box in zip(scores, labels, boxes):
        label_name = id2label[label]
        if label_name not in self.vehicle_labels:
            continue
        detections.append(
            {
                "label": label_name,
                "score": round(score, 4),
                "box": box,
            }
        )

    encoder_last_hidden_state = getattr(outputs, "encoder_last_hidden_state", None)
    last_hidden_state = getattr(outputs, "last_hidden_state", None)
    return {
        "image_size": {"width": image.size[0], "height": image.size[1]},
        "pixel_values_shape": list(inputs["pixel_values"].shape),
        "pixel_mask_shape": list(inputs["pixel_mask"].shape),
        "feature_map_shape": list(feature_map.shape),
        "projected_feature_map_shape": list(projected_feature_map.shape),
        "visual_tokens_shape": list(tokens.shape),
        "position_encoding_shape": list(object_queries.shape),
        "encoder_last_hidden_state_shape": (
            list(encoder_last_hidden_state.shape) if encoder_last_hidden_state is not None else []
        ),
        "decoder_last_hidden_state_shape": (
            list(last_hidden_state.shape) if last_hidden_state is not None else []
        ),
        "logits_shape": list(outputs.logits.shape),
        "pred_boxes_shape": list(outputs.pred_boxes.shape),
        "token_grid": {
            "rows": token_rows,
            "cols": token_cols,
            "total": int(tokens.shape[1]),
            "shown": sample_count,
        },
        "token_sequence": token_sequence,
        "detections": detections,
    }
|
||||||
|
|
||||||
|
def _detections_from_outputs(self, image: Image.Image, outputs: Any) -> list[dict[str, Any]]:
|
||||||
target_sizes = torch.tensor([image.size[::-1]], device=self.device)
|
target_sizes = torch.tensor([image.size[::-1]], device=self.device)
|
||||||
results = self.processor.post_process_object_detection(
|
results = self.processor.post_process_object_detection(
|
||||||
outputs,
|
outputs,
|
||||||
@@ -60,3 +132,4 @@ class DetrVehicleDetector:
|
|||||||
)
|
)
|
||||||
|
|
||||||
return detections
|
return detections
|
||||||
|
|
||||||
|
|||||||
@@ -33,11 +33,16 @@ class DeviceManager:
|
|||||||
if device_num not in {device.device_num for device in self.devices}:
|
if device_num not in {device.device_num for device in self.devices}:
|
||||||
raise ValueError("设备不在 devicelist.env 中")
|
raise ValueError("设备不在 devicelist.env 中")
|
||||||
with self.lock:
|
with self.lock:
|
||||||
|
old_device_num = self.current_device_num
|
||||||
self.current_device_num = device_num
|
self.current_device_num = device_num
|
||||||
self.current_url = ""
|
self.current_url = ""
|
||||||
self.timings = {}
|
self.timings = {}
|
||||||
self.updated_at = time.time()
|
self.updated_at = time.time()
|
||||||
self.version += 1
|
self.version += 1
|
||||||
|
print(
|
||||||
|
f"[device-switch] manager set old={old_device_num} new={device_num} version={self.version}",
|
||||||
|
flush=True,
|
||||||
|
)
|
||||||
return self.version
|
return self.version
|
||||||
|
|
||||||
def resolve_stream_url(self) -> str:
|
def resolve_stream_url(self) -> str:
|
||||||
@@ -49,16 +54,38 @@ class DeviceManager:
|
|||||||
return self.fallback_url
|
return self.fallback_url
|
||||||
raise RuntimeError("devicelist.env 中没有可用设备号")
|
raise RuntimeError("devicelist.env 中没有可用设备号")
|
||||||
|
|
||||||
|
print(f"[device-switch] resolve start device={device_num} version={version}", flush=True)
|
||||||
|
try:
|
||||||
result = self.api_client.get_stream_url_details(device_num)
|
result = self.api_client.get_stream_url_details(device_num)
|
||||||
|
except Exception as exc:
|
||||||
|
print(
|
||||||
|
f"[device-switch] resolve failed device={device_num} version={version} error={exc}",
|
||||||
|
flush=True,
|
||||||
|
)
|
||||||
|
raise
|
||||||
with self.lock:
|
with self.lock:
|
||||||
# 避免旧摄像头的慢接口响应覆盖用户刚切换的新选择。
|
# 避免旧摄像头的慢接口响应覆盖用户刚切换的新选择。
|
||||||
if version != self.version or device_num != self.current_device_num:
|
if version != self.version or device_num != self.current_device_num:
|
||||||
|
print(
|
||||||
|
f"[device-switch] resolve stale device={device_num} version={version} current={self.current_device_num} current_version={self.version}",
|
||||||
|
flush=True,
|
||||||
|
)
|
||||||
return self.current_url
|
return self.current_url
|
||||||
self.current_url = result.url
|
self.current_url = result.url
|
||||||
self.timings = dict(result.timings)
|
self.timings = dict(result.timings)
|
||||||
self.updated_at = time.time()
|
self.updated_at = time.time()
|
||||||
|
print(f"[device-switch] resolve success device={device_num} version={version}", flush=True)
|
||||||
return result.url
|
return result.url
|
||||||
|
|
||||||
|
def resolve_stream_url_for(self, device_num: str) -> str:
    """Look up the stream URL of one configured device.

    Unlike the current-device flow, nothing on the manager is written here;
    the URL is fetched from the API client and returned as-is.

    Raises:
        ValueError: if ``device_num`` is not one of ``self.devices``.
    """
    is_known = any(device.device_num == device_num for device in self.devices)
    if not is_known:
        raise ValueError("设备不在 devicelist.env 中")
    details = self.api_client.get_stream_url_details(device_num)
    return details.url
|
||||||
|
|
||||||
|
def get_video_grid_devices(self, limit: int = 4) -> list[Device]:
    """Return at most ``limit`` devices, in list order, for the video grid.

    Args:
        limit: Maximum number of devices to return. Zero or a negative
            value yields an empty list.

    Returns:
        A new list holding the first ``limit`` entries of ``self.devices``.
    """
    # A negative slice bound would count from the END of the list and hand
    # back all-but-the-last-N devices, so clamp it to zero first.
    return self.devices[: max(limit, 0)]
|
||||||
|
|
||||||
def get_snapshot(self) -> dict[str, Any]:
|
def get_snapshot(self) -> dict[str, Any]:
|
||||||
with self.lock:
|
with self.lock:
|
||||||
return {
|
return {
|
||||||
|
|||||||
76
app/main.py
76
app/main.py
@@ -45,6 +45,18 @@ worker = StreamWorker(
|
|||||||
resize_width=settings.resize_width,
|
resize_width=settings.resize_width,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Devices shown in the multi-camera grid, and one StreamWorker per device.
video_grid_devices = device_manager.get_video_grid_devices()

# Options every grid worker shares; only the URL resolver differs.
_grid_worker_options = {
    "detector": detector,
    "frame_skip": settings.frame_skip,
    "jpeg_quality": settings.jpeg_quality,
    "resize_width": settings.resize_width,
}

video_grid_workers = {
    grid_device.device_num: StreamWorker(
        # The default argument pins this device's number to its own lambda;
        # a plain closure would see only the last device of the loop.
        stream_url=lambda device_num=grid_device.device_num: device_manager.resolve_stream_url_for(device_num),
        **_grid_worker_options,
    )
    for grid_device in video_grid_devices
}
|
||||||
|
|
||||||
app = FastAPI(title="DETR 动态打标")
|
app = FastAPI(title="DETR 动态打标")
|
||||||
app.mount("/static", StaticFiles(directory="app/static"), name="static")
|
app.mount("/static", StaticFiles(directory="app/static"), name="static")
|
||||||
templates = Jinja2Templates(directory="app/templates")
|
templates = Jinja2Templates(directory="app/templates")
|
||||||
@@ -57,11 +69,15 @@ def display_model_name(model_name: str) -> str:
|
|||||||
@app.on_event("startup")
def startup() -> None:
    """Start the main stream worker first, then every video-grid worker."""
    for stream_worker in (worker, *video_grid_workers.values()):
        stream_worker.start()
|
||||||
|
|
||||||
|
|
||||||
@app.on_event("shutdown")
def shutdown() -> None:
    """Stop the main stream worker first, then every video-grid worker."""
    for stream_worker in (worker, *video_grid_workers.values()):
        stream_worker.stop()
|
||||||
|
|
||||||
|
|
||||||
@app.get("/", response_class=HTMLResponse)
|
@app.get("/", response_class=HTMLResponse)
|
||||||
@@ -73,15 +89,67 @@ def index(request: Request) -> HTMLResponse:
|
|||||||
"model": display_model_name(settings.detr_model),
|
"model": display_model_name(settings.detr_model),
|
||||||
"device": detector.device_name,
|
"device": detector.device_name,
|
||||||
"stream_url": f"设备号:{device_manager.get_snapshot()['current_device_num']}",
|
"stream_url": f"设备号:{device_manager.get_snapshot()['current_device_num']}",
|
||||||
|
"video_grid_devices": video_grid_devices,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/tokenizer", response_class=HTMLResponse)
def tokenizer(request: Request) -> HTMLResponse:
    """Render ``tokenizer.html`` with the model name and detector device."""
    context = {
        "request": request,
        "model": display_model_name(settings.detr_model),
        "device": detector.device_name,
    }
    return templates.TemplateResponse("tokenizer.html", context)
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/tokenizer/state")
def tokenizer_state() -> JSONResponse:
    """Return the token inspection of the main worker's latest frame as JSON.

    While no frame is available the payload carries ``ready: False`` plus the
    worker's connection state; otherwise it is the detector's inspection
    result extended with ``ready: True`` and the worker snapshot fields.
    """
    # NOTE(review): snapshot and frame come from two separate calls, so the
    # reported frame_id may not belong to the inspected frame - confirm
    # whether that matters for the page.
    snapshot = worker.get_snapshot()
    frame = worker.get_frame_rgb()

    if frame is None:
        waiting = {
            "ready": False,
            "frame_id": snapshot["frame_id"],
            "connected": snapshot["connected"],
            "error": snapshot["error"] or "等待视频帧",
        }
        return JSONResponse(waiting)

    payload = detector.inspect_tokens(frame)
    payload["ready"] = True
    for field in ("frame_id", "updated_at", "connected", "error"):
        payload[field] = snapshot[field]
    return JSONResponse(payload)
|
||||||
|
|
||||||
|
|
||||||
@app.get("/video")
def video() -> StreamingResponse:
    """Serve the stream that ``stream_video`` builds for the main worker."""
    main_worker = worker
    return stream_video(main_worker)
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/video/{device_num}")
def video_device(device_num: str) -> StreamingResponse:
    """Serve the stream of one video-grid worker.

    Raises:
        HTTPException: 404 when ``device_num`` has no grid worker.
    """
    selected = video_grid_workers.get(device_num)
    if selected is not None:
        return stream_video(selected)
    raise HTTPException(status_code=404, detail="设备不在视频网格中")
|
||||||
|
|
||||||
|
|
||||||
|
def stream_video(stream_worker: StreamWorker) -> StreamingResponse:
|
||||||
async def generate():
|
async def generate():
|
||||||
while True:
|
while True:
|
||||||
frame = worker.get_jpeg()
|
frame = stream_worker.get_jpeg()
|
||||||
if frame is None:
|
if frame is None:
|
||||||
await asyncio.sleep(0.1)
|
await asyncio.sleep(0.1)
|
||||||
continue
|
continue
|
||||||
@@ -137,11 +205,13 @@ def status() -> JSONResponse:
|
|||||||
@app.post("/devices/{device_num}")
def switch_device(device_num: str) -> JSONResponse:
    """Make ``device_num`` the current device and reconnect the main worker.

    Returns:
        JSON with the accepted device number and the manager version
        returned by ``set_current_device``.

    Raises:
        HTTPException: 404 when the device manager rejects the device.
    """
    try:
        version = device_manager.set_current_device(device_num)
    except ValueError as exc:
        print(f"[device-switch] invalid device={device_num}", flush=True)
        raise HTTPException(status_code=404, detail=str(exc)) from exc
    else:
        worker.reconnect()
        print(f"[device-switch] accepted device={device_num} version={version}", flush=True)

    body = {"current_device_num": device_num, "version": version}
    return JSONResponse(body)
|
||||||
|
|
||||||
|
|
||||||
@app.websocket("/ws/detections")
|
@app.websocket("/ws/detections")
|
||||||
|
|||||||
@@ -13,7 +13,10 @@ const timingFrameEl = document.querySelector("#timing-frame");
|
|||||||
|
|
||||||
let selectedDevice = "";
|
let selectedDevice = "";
|
||||||
let pendingDevice = "";
|
let pendingDevice = "";
|
||||||
|
let queuedDevice = "";
|
||||||
|
let switching = false;
|
||||||
let devicesSignature = "";
|
let devicesSignature = "";
|
||||||
|
let lastWsSignature = "";
|
||||||
|
|
||||||
function setConnection(online, text) {
|
function setConnection(online, text) {
|
||||||
connection.textContent = text;
|
connection.textContent = text;
|
||||||
@@ -84,16 +87,54 @@ function renderDetections(detections) {
|
|||||||
.join("");
|
.join("");
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Ask the server to switch to `deviceNum`, reload the video elements, and
 * then keep going with whatever device was queued while the request ran.
 *
 * @param {string} deviceNum Device to switch to.
 * @returns {Promise<void>} Settles once no queued switch is left.
 */
async function performSwitch(deviceNum) {
  let nextDevice = deviceNum;

  // A loop drains the queue. The earlier `return performSwitch(...)` inside
  // `finally` overrode the completion of try/catch and would silently drop
  // anything thrown from the catch handler.
  while (nextDevice) {
    const current = nextDevice;
    nextDevice = "";

    switching = true;
    pendingDevice = current;
    devicesSignature = "";
    setConnection(false, "切换中");
    console.log("[device-switch] start", { deviceNum: current });

    try {
      const response = await fetch(`/devices/${encodeURIComponent(current)}`, { method: "POST" });
      if (!response.ok) {
        throw new Error("切换摄像头失败");
      }
      const result = await response.json();

      // A fresh query string forces the browser to reopen each stream.
      const video = document.querySelector("#video");
      if (video) {
        video.src = `/video?t=${Date.now()}`;
      }
      document.querySelectorAll(".grid-video").forEach((item) => {
        item.src = `${item.dataset.src}?t=${Date.now()}`;
      });
      console.log("[device-switch] requested", { deviceNum: current, version: result.version });
    } catch (error) {
      pendingDevice = "";
      devicesSignature = "";
      setConnection(false, "切换失败");
      console.error("[device-switch] failed", { deviceNum: current, error });
    } finally {
      switching = false;
      // Pick up a device queued during the request, unless it is the one
      // that was just handled; the queue is emptied either way.
      if (queuedDevice && queuedDevice !== current) {
        nextDevice = queuedDevice;
      }
      queuedDevice = "";
    }
  }
}
|
||||||
|
|
||||||
|
/**
 * Entry point for a device change: start the switch right away, or remember
 * the device as queued when another switch is still in flight.
 *
 * @param {string} deviceNum Device the user picked.
 * @returns {Promise<void>}
 */
function switchDevice(deviceNum) {
  if (!switching) {
    return performSwitch(deviceNum);
  }

  // Only the most recent pick is kept while a switch is running.
  queuedDevice = deviceNum;
  pendingDevice = deviceNum;
  devicesSignature = "";
  setConnection(false, "等待切换");
  console.log("[device-switch] queued", { deviceNum });
  return Promise.resolve();
}
|
||||||
|
|
||||||
function connectWebSocket() {
|
function connectWebSocket() {
|
||||||
@@ -109,6 +150,18 @@ function connectWebSocket() {
|
|||||||
errorEl.textContent = data.error || (data.connected ? "正常" : "未连接");
|
errorEl.textContent = data.error || (data.connected ? "正常" : "未连接");
|
||||||
sourceEl.textContent = data.source || "-";
|
sourceEl.textContent = data.source || "-";
|
||||||
setConnection(Boolean(data.connected), data.connected ? "已连接" : "重连中");
|
setConnection(Boolean(data.connected), data.connected ? "已连接" : "重连中");
|
||||||
|
const wsSignature = `${data.current_device_num}|${data.connected}|${data.frame_id}|${data.error || ""}`;
|
||||||
|
if (wsSignature !== lastWsSignature) {
|
||||||
|
lastWsSignature = wsSignature;
|
||||||
|
console.log("[device-switch] ws", {
|
||||||
|
currentDeviceNum: data.current_device_num,
|
||||||
|
pendingDevice,
|
||||||
|
queuedDevice,
|
||||||
|
connected: data.connected,
|
||||||
|
frameId: data.frame_id,
|
||||||
|
error: data.error,
|
||||||
|
});
|
||||||
|
}
|
||||||
if (pendingDevice && data.current_device_num === pendingDevice) {
|
if (pendingDevice && data.current_device_num === pendingDevice) {
|
||||||
pendingDevice = "";
|
pendingDevice = "";
|
||||||
deviceSelect.disabled = false;
|
deviceSelect.disabled = false;
|
||||||
|
|||||||
@@ -73,6 +73,30 @@ p {
|
|||||||
color: var(--red);
|
color: var(--red);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.topbar-actions {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 12px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.button-link {
|
||||||
|
display: inline-flex;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: center;
|
||||||
|
min-height: 36px;
|
||||||
|
padding: 8px 14px;
|
||||||
|
border: 1px solid var(--line);
|
||||||
|
border-radius: 999px;
|
||||||
|
color: var(--text);
|
||||||
|
text-decoration: none;
|
||||||
|
background: var(--panel);
|
||||||
|
}
|
||||||
|
|
||||||
|
.button-link:hover {
|
||||||
|
border-color: var(--green);
|
||||||
|
color: var(--green);
|
||||||
|
}
|
||||||
|
|
||||||
.layout {
|
.layout {
|
||||||
display: grid;
|
display: grid;
|
||||||
grid-template-columns: minmax(0, 1fr) 360px;
|
grid-template-columns: minmax(0, 1fr) 360px;
|
||||||
@@ -92,33 +116,6 @@ p {
|
|||||||
overflow: hidden;
|
overflow: hidden;
|
||||||
}
|
}
|
||||||
|
|
||||||
.pipeline {
|
|
||||||
display: flex;
|
|
||||||
align-items: center;
|
|
||||||
gap: 10px;
|
|
||||||
padding: 14px;
|
|
||||||
border-bottom: 1px solid var(--line);
|
|
||||||
overflow-x: auto;
|
|
||||||
}
|
|
||||||
|
|
||||||
.stage {
|
|
||||||
flex: 0 0 auto;
|
|
||||||
padding: 9px 12px;
|
|
||||||
border: 1px solid var(--line);
|
|
||||||
border-radius: 10px;
|
|
||||||
color: var(--muted);
|
|
||||||
background: var(--panel-2);
|
|
||||||
}
|
|
||||||
|
|
||||||
.stage.active {
|
|
||||||
border-color: rgba(46, 232, 135, 0.5);
|
|
||||||
color: var(--green);
|
|
||||||
}
|
|
||||||
|
|
||||||
.arrow {
|
|
||||||
color: var(--muted);
|
|
||||||
}
|
|
||||||
|
|
||||||
.video-wrap {
|
.video-wrap {
|
||||||
display: grid;
|
display: grid;
|
||||||
place-items: center;
|
place-items: center;
|
||||||
@@ -126,7 +123,8 @@ p {
|
|||||||
background: #05070b;
|
background: #05070b;
|
||||||
}
|
}
|
||||||
|
|
||||||
#video {
|
#video,
|
||||||
|
.grid-video {
|
||||||
display: block;
|
display: block;
|
||||||
width: 100%;
|
width: 100%;
|
||||||
height: auto;
|
height: auto;
|
||||||
@@ -134,6 +132,36 @@ p {
|
|||||||
object-fit: contain;
|
object-fit: contain;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.video-grid {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: repeat(2, minmax(0, 1fr));
|
||||||
|
gap: 14px;
|
||||||
|
padding: 14px;
|
||||||
|
background: #05070b;
|
||||||
|
}
|
||||||
|
|
||||||
|
.video-grid-item {
|
||||||
|
overflow: hidden;
|
||||||
|
border: 1px solid var(--line);
|
||||||
|
border-radius: 14px;
|
||||||
|
background: var(--panel-2);
|
||||||
|
}
|
||||||
|
|
||||||
|
.video-grid-title {
|
||||||
|
padding: 10px 12px;
|
||||||
|
border-bottom: 1px solid var(--line);
|
||||||
|
color: var(--muted);
|
||||||
|
font-size: 13px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.video-grid-wrap {
|
||||||
|
min-height: 240px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.grid-video {
|
||||||
|
max-height: calc((100vh - 260px) / 2);
|
||||||
|
}
|
||||||
|
|
||||||
.side-card {
|
.side-card {
|
||||||
display: flex;
|
display: flex;
|
||||||
flex-direction: column;
|
flex-direction: column;
|
||||||
@@ -166,13 +194,19 @@ p {
|
|||||||
background: var(--panel-2);
|
background: var(--panel-2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.detections-panel {
|
||||||
|
padding: 16px;
|
||||||
|
border-top: 1px solid var(--line);
|
||||||
|
}
|
||||||
|
|
||||||
.detections {
|
.detections {
|
||||||
display: flex;
|
display: grid;
|
||||||
flex-direction: column;
|
grid-template-columns: repeat(4, minmax(0, 1fr));
|
||||||
gap: 10px;
|
gap: 10px;
|
||||||
}
|
}
|
||||||
|
|
||||||
.detections.empty {
|
.detections.empty {
|
||||||
|
display: block;
|
||||||
color: var(--muted);
|
color: var(--muted);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -197,6 +231,133 @@ p {
|
|||||||
font-size: 12px;
|
font-size: 12px;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.tokenizer-page .tokenizer-layout {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: minmax(0, 1fr) minmax(0, 2fr);
|
||||||
|
grid-template-areas: "flow side";
|
||||||
|
align-items: start;
|
||||||
|
width: 100%;
|
||||||
|
gap: 18px;
|
||||||
|
padding: 18px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.tokenizer-page .tokenizer-side {
|
||||||
|
display: grid;
|
||||||
|
grid-area: side;
|
||||||
|
min-width: 0;
|
||||||
|
gap: 18px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.tokenizer-page .tokenizer-flow-card {
|
||||||
|
grid-area: flow;
|
||||||
|
min-width: 0;
|
||||||
|
min-height: calc(100vh - 122px);
|
||||||
|
}
|
||||||
|
|
||||||
|
.tokenizer-page .tokenizer-side .detections {
|
||||||
|
grid-template-columns: repeat(3, minmax(0, 1fr));
|
||||||
|
}
|
||||||
|
|
||||||
|
.tokenizer-card {
|
||||||
|
border: 1px solid var(--line);
|
||||||
|
border-radius: 18px;
|
||||||
|
padding: 18px;
|
||||||
|
background: rgba(21, 27, 38, 0.9);
|
||||||
|
box-shadow: 0 18px 40px rgba(0, 0, 0, 0.28);
|
||||||
|
}
|
||||||
|
|
||||||
|
.pipeline-steps {
|
||||||
|
display: grid;
|
||||||
|
gap: 10px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.pipeline-step {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: 34px minmax(0, 1fr);
|
||||||
|
gap: 10px;
|
||||||
|
align-items: start;
|
||||||
|
padding: 12px;
|
||||||
|
border: 1px solid var(--line);
|
||||||
|
border-radius: 12px;
|
||||||
|
background: var(--panel-2);
|
||||||
|
}
|
||||||
|
|
||||||
|
.step-index {
|
||||||
|
display: grid;
|
||||||
|
place-items: center;
|
||||||
|
width: 28px;
|
||||||
|
height: 28px;
|
||||||
|
border-radius: 999px;
|
||||||
|
color: #06100b;
|
||||||
|
font-weight: 700;
|
||||||
|
background: var(--green);
|
||||||
|
}
|
||||||
|
|
||||||
|
.step-title {
|
||||||
|
margin-bottom: 5px;
|
||||||
|
font-weight: 700;
|
||||||
|
}
|
||||||
|
|
||||||
|
.step-value,
|
||||||
|
.token-summary,
|
||||||
|
.selected-token {
|
||||||
|
color: var(--muted);
|
||||||
|
font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
|
||||||
|
font-size: 13px;
|
||||||
|
word-break: break-all;
|
||||||
|
}
|
||||||
|
|
||||||
|
.token-sequence {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: repeat(12, minmax(0, 1fr));
|
||||||
|
gap: 8px;
|
||||||
|
margin-top: 14px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.token-cell {
|
||||||
|
min-height: 50px;
|
||||||
|
border: 1px solid var(--line);
|
||||||
|
border-radius: 10px;
|
||||||
|
color: var(--text);
|
||||||
|
cursor: pointer;
|
||||||
|
background: var(--panel-2);
|
||||||
|
}
|
||||||
|
|
||||||
|
.token-cell span,
|
||||||
|
.token-cell small {
|
||||||
|
display: block;
|
||||||
|
}
|
||||||
|
|
||||||
|
.token-cell small {
|
||||||
|
margin-top: 3px;
|
||||||
|
color: var(--muted);
|
||||||
|
}
|
||||||
|
|
||||||
|
.token-cell.selected,
|
||||||
|
.token-cell:hover {
|
||||||
|
border-color: var(--green);
|
||||||
|
color: var(--green);
|
||||||
|
}
|
||||||
|
|
||||||
|
.token-detail-title {
|
||||||
|
margin-bottom: 10px;
|
||||||
|
color: var(--green);
|
||||||
|
font-weight: 700;
|
||||||
|
}
|
||||||
|
|
||||||
|
.token-vector {
|
||||||
|
padding: 12px;
|
||||||
|
border: 1px solid var(--line);
|
||||||
|
border-radius: 12px;
|
||||||
|
background: var(--panel-2);
|
||||||
|
}
|
||||||
|
|
||||||
|
@media (max-width: 1280px) {
|
||||||
|
.detections {
|
||||||
|
grid-template-columns: repeat(4, minmax(0, 1fr));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@media (max-width: 980px) {
|
@media (max-width: 980px) {
|
||||||
.layout {
|
.layout {
|
||||||
grid-template-columns: 1fr;
|
grid-template-columns: 1fr;
|
||||||
@@ -206,4 +367,18 @@ p {
|
|||||||
align-items: flex-start;
|
align-items: flex-start;
|
||||||
flex-direction: column;
|
flex-direction: column;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.detections {
|
||||||
|
grid-template-columns: repeat(3, minmax(0, 1fr));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@media (max-width: 640px) {
|
||||||
|
.video-grid {
|
||||||
|
grid-template-columns: 1fr;
|
||||||
|
}
|
||||||
|
|
||||||
|
.detections {
|
||||||
|
grid-template-columns: 1fr;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
131
app/static/tokenizer.js
Normal file
131
app/static/tokenizer.js
Normal file
@@ -0,0 +1,131 @@
|
|||||||
|
const statusEl = document.querySelector("#tokenizer-status");
|
||||||
|
const pipelineEl = document.querySelector("#pipeline-steps");
|
||||||
|
const tokenSummaryEl = document.querySelector("#token-summary");
|
||||||
|
const tokenSequenceEl = document.querySelector("#token-sequence");
|
||||||
|
const selectedTokenEl = document.querySelector("#selected-token");
|
||||||
|
const detectionsEl = document.querySelector("#tokenizer-detections");
|
||||||
|
|
||||||
|
let selectedTokenIndex = null;
|
||||||
|
|
||||||
|
/**
 * Format a shape array as "[a, b, c]"; a missing or empty shape gives "-".
 *
 * @param {number[]|null|undefined} shape
 * @returns {string}
 */
function formatShape(shape) {
  const hasDims = Boolean(shape) && Boolean(shape.length);
  return hasDims ? `[${shape.join(", ")}]` : "-";
}
|
||||||
|
|
||||||
|
/**
 * Show `text` in the status element and flip its online/offline classes.
 *
 * @param {boolean} ready True selects "online", false selects "offline".
 * @param {string} text Status label.
 */
function setStatus(ready, text) {
  statusEl.textContent = text;
  const { classList } = statusEl;
  classList.toggle("online", ready);
  classList.toggle("offline", !ready);
}
|
||||||
|
|
||||||
|
/**
 * Render the numbered list of pipeline stages with the shapes in `data`.
 *
 * @param {object} data Payload of `/tokenizer/state`.
 */
function renderPipeline(data) {
  const size = data.image_size;
  const imageSize = `${size?.width ?? "-"} × ${size?.height ?? "-"}`;

  const steps = [
    ["OpenCV RGB 帧", imageSize],
    ["PIL Image", imageSize],
    ["DetrImageProcessor", `pixel_values ${formatShape(data.pixel_values_shape)} / pixel_mask ${formatShape(data.pixel_mask_shape)}`],
    ["ResNet-50 backbone", `feature map ${formatShape(data.feature_map_shape)}`],
    ["1×1 convolution", `projected ${formatShape(data.projected_feature_map_shape)}`],
    ["视觉 token embedding", `由 projected feature map flatten 得到 ${formatShape(data.visual_tokens_shape)}`],
    ["位置 embedding", `二维位置 embedding ${formatShape(data.position_encoding_shape)}`],
    ["Transformer Encoder", formatShape(data.encoder_last_hidden_state_shape)],
    ["Object query embedding + Decoder", `object query embedding 解码后 ${formatShape(data.decoder_last_hidden_state_shape)}`],
    ["类别 logits + boxes", `logits ${formatShape(data.logits_shape)} / boxes ${formatShape(data.pred_boxes_shape)}`],
    ["post_process_object_detection", `检测结果 ${data.detections?.length ?? 0} 个`],
  ];

  let markup = "";
  steps.forEach((step, position) => {
    const [title, value] = step;
    markup += `
      <div class="pipeline-step">
        <div class="step-index">${position + 1}</div>
        <div>
          <div class="step-title">${title}</div>
          <div class="step-value">${value}</div>
        </div>
      </div>
    `;
  });
  pipelineEl.innerHTML = markup;
}
|
||||||
|
|
||||||
|
/**
 * Render the token summary line, the clickable token cells and the detail
 * panel of the selected token.
 *
 * @param {object} data Payload of `/tokenizer/state`.
 */
function renderTokens(data) {
  const grid = data.token_grid || {};
  const tokens = data.token_sequence || [];

  // Settle the selection BEFORE building the cells. Previously the default
  // (first) token was chosen only after the markup existed, so the first
  // render showed its details while no cell was highlighted.
  const selectionVisible = tokens.some((token) => token.index === selectedTokenIndex);
  if (!selectionVisible && tokens.length) {
    selectedTokenIndex = tokens[0].index;
  }

  tokenSummaryEl.textContent = `帧号 ${data.frame_id ?? "-"} · token 网格 ${grid.rows ?? "-"} × ${grid.cols ?? "-"},总数 ${grid.total ?? "-"},展示前 ${grid.shown ?? 0} 个 token,每个显示前 8 维采样。`;
  tokenSequenceEl.innerHTML = tokens
    .map((token) => `
      <button class="token-cell ${token.index === selectedTokenIndex ? "selected" : ""}" data-index="${token.index}">
        <span>#${token.index}</span>
        <small>(${token.row}, ${token.col})</small>
      </button>
    `)
    .join("");

  tokenSequenceEl.querySelectorAll(".token-cell").forEach((button) => {
    button.addEventListener("click", () => {
      selectedTokenIndex = Number(button.dataset.index);
      // One re-render refreshes both the highlight and the detail panel.
      renderTokens(data);
    });
  });

  renderSelectedToken(data);
}
|
||||||
|
|
||||||
|
/**
 * Show the details of the selected token, falling back to the first token
 * when the remembered index is not in the current sequence.
 *
 * @param {object} data Payload of `/tokenizer/state`.
 */
function renderSelectedToken(data) {
  const sequence = data.token_sequence || [];
  const match = sequence.find((item) => item.index === selectedTokenIndex);
  const token = match || sequence[0];

  if (!token) {
    selectedTokenEl.textContent = "暂无 token。";
    return;
  }

  // Remember the token actually shown, so a fallback becomes the selection.
  selectedTokenIndex = token.index;
  const vectorText = token.values.map((value) => Number(value).toFixed(4)).join(", ");
  selectedTokenEl.innerHTML = `
    <div class="token-detail-title">Token #${token.index} · 网格位置 (${token.row}, ${token.col}) · L2 ${token.magnitude}</div>
    <div class="token-vector">[${vectorText}, ...]</div>
  `;
}
|
||||||
|
|
||||||
|
/**
 * Render one card per detection, or a placeholder when the list is empty.
 *
 * @param {Array<{label: string, score: number, box: number[]}>} detections
 */
function renderDetections(detections) {
  const hasDetections = detections.length > 0;
  detectionsEl.className = hasDetections ? "detections" : "detections empty";

  if (!hasDetections) {
    detectionsEl.textContent = "暂无目标";
    return;
  }

  const cards = [];
  for (const det of detections) {
    const percent = (det.score * 100).toFixed(1);
    cards.push(`
      <div class="det-item">
        <div class="det-title">
          <span>${det.label}</span>
          <span>${percent}%</span>
        </div>
        <div class="det-box">box: [${det.box.join(", ")}]</div>
      </div>
    `);
  }
  detectionsEl.innerHTML = cards.join("");
}
|
||||||
|
|
||||||
|
/**
 * Fetch `/tokenizer/state`, render it, and schedule the next poll.
 *
 * The next poll is scheduled from `finally`, so polling continues after a
 * failed request as well.
 *
 * @returns {Promise<void>}
 */
async function refreshTokenizer() {
  try {
    const response = await fetch(`/tokenizer/state?t=${Date.now()}`);
    // An HTTP error may carry a non-JSON body; report the status rather
    // than letting response.json() fail with an unrelated parse error.
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }
    const data = await response.json();

    if (!data.ready) {
      setStatus(false, data.error || "等待帧");
      tokenSummaryEl.textContent = data.error || "等待视频帧";
      return;
    }

    setStatus(Boolean(data.connected), data.connected ? "动态更新中" : "未连接");
    renderPipeline(data);
    renderTokens(data);
    renderDetections(data.detections || []);
  } catch (error) {
    setStatus(false, "更新失败");
    tokenSummaryEl.textContent = `更新失败:${error}`;
  } finally {
    setTimeout(refreshTokenizer, 1200);
  }
}
|
||||||
|
|
||||||
|
refreshTokenizer();
|
||||||
@@ -26,6 +26,7 @@ class StreamWorker:
|
|||||||
|
|
||||||
self.lock = threading.Lock()
|
self.lock = threading.Lock()
|
||||||
self.latest_jpeg: bytes | None = None
|
self.latest_jpeg: bytes | None = None
|
||||||
|
self.latest_frame_rgb: Any | None = None
|
||||||
self.latest_detections: list[dict[str, Any]] = []
|
self.latest_detections: list[dict[str, Any]] = []
|
||||||
self.frame_id = 0
|
self.frame_id = 0
|
||||||
self.updated_at = 0.0
|
self.updated_at = 0.0
|
||||||
@@ -56,21 +57,30 @@ class StreamWorker:
|
|||||||
def reconnect(self) -> None:
|
def reconnect(self) -> None:
|
||||||
with self.lock:
|
with self.lock:
|
||||||
self.latest_jpeg = None
|
self.latest_jpeg = None
|
||||||
|
self.latest_frame_rgb = None
|
||||||
self.latest_detections = []
|
self.latest_detections = []
|
||||||
self.frame_id = 0
|
self.frame_id = 0
|
||||||
self.fps = 0.0
|
self.fps = 0.0
|
||||||
self.reconnect_requested = True
|
self.reconnect_requested = True
|
||||||
self.reconnect_version += 1
|
self.reconnect_version += 1
|
||||||
|
version = self.reconnect_version
|
||||||
self.connected = False
|
self.connected = False
|
||||||
self.error = "正在切换视频源"
|
self.error = "正在切换视频源"
|
||||||
self.resolve_ms = 0.0
|
self.resolve_ms = 0.0
|
||||||
self.open_ms = 0.0
|
self.open_ms = 0.0
|
||||||
self.first_frame_ms = 0.0
|
self.first_frame_ms = 0.0
|
||||||
|
print(f"[device-switch] worker reconnect requested version={version}", flush=True)
|
||||||
|
|
||||||
def get_jpeg(self) -> bytes | None:
|
def get_jpeg(self) -> bytes | None:
|
||||||
with self.lock:
|
with self.lock:
|
||||||
return self.latest_jpeg
|
return self.latest_jpeg
|
||||||
|
|
||||||
|
def get_frame_rgb(self) -> Any | None:
|
||||||
|
with self.lock:
|
||||||
|
if self.latest_frame_rgb is None:
|
||||||
|
return None
|
||||||
|
return self.latest_frame_rgb.copy()
|
||||||
|
|
||||||
def get_snapshot(self) -> dict[str, Any]:
|
def get_snapshot(self) -> dict[str, Any]:
|
||||||
with self.lock:
|
with self.lock:
|
||||||
return {
|
return {
|
||||||
@@ -100,16 +110,29 @@ class StreamWorker:
|
|||||||
run_version = self.reconnect_version
|
run_version = self.reconnect_version
|
||||||
self.reconnect_requested = False
|
self.reconnect_requested = False
|
||||||
if should_reconnect:
|
if should_reconnect:
|
||||||
|
print(f"[device-switch] worker reconnect handling version={run_version}", flush=True)
|
||||||
# 切换摄像头时必须释放旧连接,否则 OpenCV 会继续阻塞读旧流。
|
# 切换摄像头时必须释放旧连接,否则 OpenCV 会继续阻塞读旧流。
|
||||||
if cap is not None:
|
if cap is not None:
|
||||||
cap.release()
|
cap.release()
|
||||||
cap = None
|
cap = None
|
||||||
|
print(f"[device-switch] worker released old capture version={run_version}", flush=True)
|
||||||
|
|
||||||
if cap is None or not cap.isOpened():
|
if cap is None or not cap.isOpened():
|
||||||
started = time.monotonic()
|
started = time.monotonic()
|
||||||
|
try:
|
||||||
stream_url = self.stream_url() if callable(self.stream_url) else self.stream_url
|
stream_url = self.stream_url() if callable(self.stream_url) else self.stream_url
|
||||||
|
except Exception as exc:
|
||||||
|
resolve_ms = round((time.monotonic() - started) * 1000, 2)
|
||||||
|
with self.lock:
|
||||||
|
self.resolve_ms = resolve_ms
|
||||||
|
self.open_ms = 0.0
|
||||||
|
self.first_frame_ms = 0.0
|
||||||
|
self._set_connection_state(False, f"获取播放地址失败:{exc},2 秒后重试")
|
||||||
|
time.sleep(2)
|
||||||
|
continue
|
||||||
resolve_ms = round((time.monotonic() - started) * 1000, 2)
|
resolve_ms = round((time.monotonic() - started) * 1000, 2)
|
||||||
started = time.monotonic()
|
started = time.monotonic()
|
||||||
|
print(f"[device-switch] worker open start version={run_version}", flush=True)
|
||||||
cap = cv2.VideoCapture(stream_url)
|
cap = cv2.VideoCapture(stream_url)
|
||||||
open_ms = round((time.monotonic() - started) * 1000, 2)
|
open_ms = round((time.monotonic() - started) * 1000, 2)
|
||||||
with self.lock:
|
with self.lock:
|
||||||
@@ -117,11 +140,13 @@ class StreamWorker:
|
|||||||
self.resolve_ms = resolve_ms
|
self.resolve_ms = resolve_ms
|
||||||
self.first_frame_ms = 0.0
|
self.first_frame_ms = 0.0
|
||||||
if not cap.isOpened():
|
if not cap.isOpened():
|
||||||
|
print(f"[device-switch] worker open failed version={run_version} open_ms={open_ms}", flush=True)
|
||||||
self._set_connection_state(False, "无法打开视频流,2 秒后重试")
|
self._set_connection_state(False, "无法打开视频流,2 秒后重试")
|
||||||
cap.release()
|
cap.release()
|
||||||
cap = None
|
cap = None
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
continue
|
continue
|
||||||
|
print(f"[device-switch] worker open success version={run_version} open_ms={open_ms}", flush=True)
|
||||||
self._set_connection_state(True, "已连接")
|
self._set_connection_state(True, "已连接")
|
||||||
|
|
||||||
started = time.monotonic()
|
started = time.monotonic()
|
||||||
@@ -143,9 +168,9 @@ class StreamWorker:
|
|||||||
|
|
||||||
frame = self._resize(frame)
|
frame = self._resize(frame)
|
||||||
self.frame_id += 1
|
self.frame_id += 1
|
||||||
|
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
||||||
|
|
||||||
if self.frame_id % self.frame_skip == 0:
|
if self.frame_id % self.frame_skip == 0:
|
||||||
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
|
||||||
last_detections = self.detector.detect(frame_rgb)
|
last_detections = self.detector.detect(frame_rgb)
|
||||||
|
|
||||||
annotated = self._draw(frame, last_detections)
|
annotated = self._draw(frame, last_detections)
|
||||||
@@ -173,6 +198,7 @@ class StreamWorker:
|
|||||||
|
|
||||||
with self.lock:
|
with self.lock:
|
||||||
self.latest_jpeg = jpeg.tobytes()
|
self.latest_jpeg = jpeg.tobytes()
|
||||||
|
self.latest_frame_rgb = frame_rgb.copy()
|
||||||
self.latest_detections = list(last_detections)
|
self.latest_detections = list(last_detections)
|
||||||
self.updated_at = time.time()
|
self.updated_at = time.time()
|
||||||
self.connected = True
|
self.connected = True
|
||||||
|
|||||||
@@ -9,26 +9,30 @@
|
|||||||
<body>
|
<body>
|
||||||
<header class="topbar">
|
<header class="topbar">
|
||||||
<div>
|
<div>
|
||||||
<h1>DETR 动态打标</h1>
|
<!-- <h1>DETR 动态打标</h1> -->
|
||||||
<p>使用 Python、OpenCV、PyTorch、Transformers DETR 和 FastAPI,Mac mini m2 上运行。</p>
|
<p>DETR动态打标:Python、OpenCV、PyTorch、Transformers DETR 和 FastAPI。</p>
|
||||||
</div>
|
</div>
|
||||||
|
<div class="topbar-actions">
|
||||||
|
<a class="button-link" href="/tokenizer">tokenizer</a>
|
||||||
<div class="badge" id="connection">连接中</div>
|
<div class="badge" id="connection">连接中</div>
|
||||||
|
</div>
|
||||||
</header>
|
</header>
|
||||||
|
|
||||||
<main class="layout">
|
<main class="layout">
|
||||||
<section class="video-card">
|
<section class="video-card">
|
||||||
<div class="pipeline">
|
<div class="video-grid">
|
||||||
<div class="stage active">源节点</div>
|
{% for device_item in video_grid_devices %}
|
||||||
<div class="arrow">→</div>
|
<article class="video-grid-item">
|
||||||
<div class="stage active">DETR 推理</div>
|
<div class="video-grid-title">{{ device_item.name }} · {{ device_item.device_num }}</div>
|
||||||
<div class="arrow">→</div>
|
<div class="video-wrap video-grid-wrap">
|
||||||
<div class="stage active">OSD 打标</div>
|
<img class="grid-video" src="/video/{{ device_item.device_num | urlencode }}" data-src="/video/{{ device_item.device_num | urlencode }}" alt="{{ device_item.name }} 视频流" />
|
||||||
<div class="arrow">→</div>
|
|
||||||
<div class="stage active">FastAPI 输出</div>
|
|
||||||
</div>
|
</div>
|
||||||
<div class="video-wrap">
|
</article>
|
||||||
<img id="video" src="/video" alt="动态打标视频流" />
|
{% endfor %}
|
||||||
</div>
|
</div>
|
||||||
|
<section class="detections-panel">
|
||||||
|
<div id="detections" class="detections empty">暂无目标</div>
|
||||||
|
</section>
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
<aside class="side-card">
|
<aside class="side-card">
|
||||||
@@ -69,11 +73,6 @@
|
|||||||
<dd id="timing-frame">-</dd>
|
<dd id="timing-frame">-</dd>
|
||||||
</dl>
|
</dl>
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
<section>
|
|
||||||
<h2>检测结果</h2>
|
|
||||||
<div id="detections" class="detections empty">暂无目标</div>
|
|
||||||
</section>
|
|
||||||
</aside>
|
</aside>
|
||||||
</main>
|
</main>
|
||||||
|
|
||||||
|
|||||||
48
app/templates/tokenizer.html
Normal file
48
app/templates/tokenizer.html
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="zh-CN">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8" />
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||||
|
<title>DETR Tokenizer 动态可视化</title>
|
||||||
|
<link rel="stylesheet" href="/static/style.css?v=tokenizer-layout-1-3" />
|
||||||
|
</head>
|
||||||
|
<body class="tokenizer-page">
|
||||||
|
<header class="topbar">
|
||||||
|
<div>
|
||||||
|
<!--<h1>DETR Tokenizer 动态可视化</h1>-->
|
||||||
|
<p>实时展示当前视频帧从图像预处理、视觉特征、token 序列到检测输出的过程。</p>
|
||||||
|
</div>
|
||||||
|
<div class="topbar-actions">
|
||||||
|
<a class="button-link" href="/">返回视频</a>
|
||||||
|
<div class="badge" id="tokenizer-status">等待帧</div>
|
||||||
|
</div>
|
||||||
|
</header>
|
||||||
|
|
||||||
|
<main class="tokenizer-layout">
|
||||||
|
<section class="tokenizer-card tokenizer-flow-card">
|
||||||
|
<h2>实时流程</h2>
|
||||||
|
<div class="pipeline-steps" id="pipeline-steps"></div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<aside class="tokenizer-side">
|
||||||
|
<section class="tokenizer-card">
|
||||||
|
<h2>选中 Token</h2>
|
||||||
|
<div class="selected-token" id="selected-token">点击下方 token 查看向量采样。</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section class="tokenizer-card">
|
||||||
|
<h2>Token 序列</h2>
|
||||||
|
<div class="token-summary" id="token-summary">等待视频帧</div>
|
||||||
|
<div class="token-sequence" id="token-sequence"></div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section class="tokenizer-card">
|
||||||
|
<h2>检测输出</h2>
|
||||||
|
<div id="tokenizer-detections" class="detections empty">暂无目标</div>
|
||||||
|
</section>
|
||||||
|
</aside>
|
||||||
|
</main>
|
||||||
|
|
||||||
|
<script src="/static/tokenizer.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
33
tokenizer.md
33
tokenizer.md
@@ -1,3 +1,4 @@
|
|||||||
|
|
||||||
# DETR 的视觉 token 化过程说明
|
# DETR 的视觉 token 化过程说明
|
||||||
|
|
||||||
本文基于当前项目代码 `app/detector.py` 中的实现说明 DETR 的“token 化”过程。
|
本文基于当前项目代码 `app/detector.py` 中的实现说明 DETR 的“token 化”过程。
|
||||||
@@ -58,19 +59,39 @@ projected feature map: [batch, 256, H', W']
|
|||||||
↓
|
↓
|
||||||
flatten 空间维度 H' × W'
|
flatten 空间维度 H' × W'
|
||||||
↓
|
↓
|
||||||
visual tokens: [batch, H'×W', 256]
|
visual token embeddings: [batch, H'×W', 256]
|
||||||
↓
|
↓
|
||||||
加入二维位置编码
|
加入二维位置 embedding
|
||||||
↓
|
↓
|
||||||
Transformer Encoder
|
Transformer Encoder
|
||||||
↓
|
↓
|
||||||
Object Queries + Transformer Decoder
|
Object query embeddings + Transformer Decoder
|
||||||
↓
|
↓
|
||||||
类别 logits + 边界框 boxes
|
类别 logits + 边界框 boxes
|
||||||
↓
|
↓
|
||||||
post_process_object_detection 还原到原图坐标
|
post_process_object_detection 还原到原图坐标
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Embedding 在 1-11 个环节中的位置
|
||||||
|
|
||||||
|
在这个 DETR 流程里,embedding 不是单独只有一步,而是出现在 3 个关键环节:
|
||||||
|
|
||||||
|
| 页面步骤 | 名称 | embedding 含义 |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| 第 6 步 | visual token embedding | `projected feature map` 经过 flatten 后,每个空间网格点变成一个 256 维视觉 token embedding。 |
|
||||||
|
| 第 7 步 | position embedding | 给每个视觉 token 加入二维位置 embedding,让 Transformer 知道 token 原本在图像中的位置。 |
|
||||||
|
| 第 9 步 | object query embedding | DETR 使用一组可学习的 object query embeddings 进入 Decoder,每个 query 最终预测一个候选目标。 |
|
||||||
|
|
||||||
|
所以如果问“embedding 在 1-11 哪个环节”,最核心的是:
|
||||||
|
|
||||||
|
```text
|
||||||
|
第 6 步:产生视觉 token embedding
|
||||||
|
第 7 步:加入位置 embedding
|
||||||
|
第 9 步:object query embedding 进入 Decoder
|
||||||
|
```
|
||||||
|
|
||||||
|
第 6 步是图像内容 embedding,第 7 步是空间位置 embedding,第 9 步是检测目标查询 embedding。
|
||||||
|
|
||||||
## 第 1 步:图像预处理
|
## 第 1 步:图像预处理
|
||||||
|
|
||||||
代码:
|
代码:
|
||||||
@@ -212,7 +233,7 @@ x = x.permute(2, 0, 1) # [h*w, batch, 256]
|
|||||||
|
|
||||||
两种写法本质相同,只是 Transformer 接口期望的维度顺序不同。
|
两种写法本质相同,只是 Transformer 接口期望的维度顺序不同。
|
||||||
|
|
||||||
## 第 5 步:加入二维位置编码
|
## 第 5 步:加入二维位置 embedding
|
||||||
|
|
||||||
Transformer 本身不理解图像中的二维空间位置。
|
Transformer 本身不理解图像中的二维空间位置。
|
||||||
|
|
||||||
@@ -264,11 +285,11 @@ Encoder 会通过 self-attention 建模图像中不同区域之间的关系。
|
|||||||
- 道路区域可以影响车辆判断。
|
- 道路区域可以影响车辆判断。
|
||||||
- 远处小目标可以和周围上下文一起被理解。
|
- 远处小目标可以和周围上下文一起被理解。
|
||||||
|
|
||||||
## 第 7 步:Object Queries 和 Transformer Decoder
|
## 第 7 步:Object query embedding 和 Transformer Decoder
|
||||||
|
|
||||||
DETR 与传统检测器不同,它不是先生成大量 anchor box。
|
DETR 与传统检测器不同,它不是先生成大量 anchor box。
|
||||||
|
|
||||||
它使用一组可学习的 object queries。常见数量是:
|
它使用一组可学习的 object query embeddings。常见数量是:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
100 个 object queries
|
100 个 object queries
|
||||||
|
|||||||
Reference in New Issue
Block a user