update at 2026-06-04 14:09:16

This commit is contained in:
陈赣
2026-06-04 14:09:16 +08:00
parent 41bd03123c
commit 4603914e85
11 changed files with 692 additions and 68 deletions

Submodule .claude/worktrees/agent-a02a887cdc0c41851 added at 41bd03123c

View File

@@ -36,7 +36,79 @@ class DetrVehicleDetector:
inputs = {key: value.to(self.device) for key, value in inputs.items()}
outputs = self.model(**inputs)
# DETR 后处理需要原图尺寸PIL size 是 (宽, 高),这里转成 (高, 宽)。
return self._detections_from_outputs(image, outputs)
@torch.no_grad()
def inspect_tokens(self, frame_rgb: Any) -> dict[str, Any]:
    """Run one frame through DETR and describe every intermediate stage.

    The result feeds the tokenizer page: it reports the shape of each tensor
    produced on the way from the input image to the predictions, a small
    sample of the visual tokens, and the detections for this frame.

    Args:
        frame_rgb: Image array accepted by ``PIL.Image.fromarray`` (the
            ``/tokenizer/state`` route passes the worker's latest RGB frame).

    Returns:
        A dict of JSON-friendly values: ``image_size``, the ``*_shape``
        lists, ``token_grid``, ``token_sequence`` (at most 48 tokens, first
        8 channels of each) and ``detections`` limited to
        ``self.vehicle_labels``.
    """
    image = Image.fromarray(frame_rgb)
    inputs = self.processor(images=image, return_tensors="pt")
    inputs = {key: value.to(self.device) for key, value in inputs.items()}

    # Run the backbone and the input projection separately so their outputs
    # can be inspected; the full forward pass below repeats this work.
    detr = self.model.model
    features, position_embeddings = detr.backbone(inputs["pixel_values"], inputs["pixel_mask"])
    feature_map, _ = features[-1]
    projected_feature_map = detr.input_projection(feature_map)
    # Merge dims 2.. into one axis and move it in front of the channel axis,
    # giving one row per grid cell.
    tokens = projected_feature_map.flatten(2).permute(0, 2, 1)
    position_tokens = position_embeddings[-1].flatten(2).permute(0, 2, 1)

    # output_hidden_states is not requested: only the *last* encoder/decoder
    # states are read below, and per the Transformers DETR output docs those
    # are populated without the flag. The getattr fallbacks further down
    # still cover an output object that lacks them.
    outputs = self.model(**inputs)
    # PIL size is (width, height); post-processing is given (height, width).
    target_sizes = torch.tensor([image.size[::-1]], device=self.device)
    results = self.processor.post_process_object_detection(
        outputs,
        target_sizes=target_sizes,
        threshold=self.confidence,
    )[0]

    token_rows = int(projected_feature_map.shape[2])
    token_cols = int(projected_feature_map.shape[3])
    sample_count = min(48, int(tokens.shape[1]))
    sample_tokens = tokens[0, :sample_count, :8].detach().cpu()
    # NOTE: "magnitude" is the norm of the 8 sampled channels only, not of
    # the full token vector.
    token_sequence = [
        {
            "index": index,
            "row": index // token_cols,
            "col": index % token_cols,
            "values": [round(float(value), 4) for value in vector.tolist()],
            "magnitude": round(float(vector.norm()), 4),
        }
        for index, vector in enumerate(sample_tokens)
    ]

    # Copy each result tensor to the host once instead of once per detection.
    scores = results["scores"].detach().cpu().tolist()
    labels = results["labels"].detach().cpu().tolist()
    boxes = results["boxes"].detach().cpu().numpy().astype(int).tolist()
    detections = []
    for score, label, box in zip(scores, labels, boxes):
        label_name = self.model.config.id2label[label]
        if label_name not in self.vehicle_labels:
            continue
        detections.append(
            {
                "label": label_name,
                "score": round(score, 4),
                "box": box,
            }
        )

    encoder_last_hidden_state = getattr(outputs, "encoder_last_hidden_state", None)
    last_hidden_state = getattr(outputs, "last_hidden_state", None)
    return {
        "image_size": {"width": image.size[0], "height": image.size[1]},
        "pixel_values_shape": list(inputs["pixel_values"].shape),
        "pixel_mask_shape": list(inputs["pixel_mask"].shape),
        "feature_map_shape": list(feature_map.shape),
        "projected_feature_map_shape": list(projected_feature_map.shape),
        "visual_tokens_shape": list(tokens.shape),
        "position_encoding_shape": list(position_tokens.shape),
        "encoder_last_hidden_state_shape": list(encoder_last_hidden_state.shape) if encoder_last_hidden_state is not None else [],
        "decoder_last_hidden_state_shape": list(last_hidden_state.shape) if last_hidden_state is not None else [],
        "logits_shape": list(outputs.logits.shape),
        "pred_boxes_shape": list(outputs.pred_boxes.shape),
        "token_grid": {"rows": token_rows, "cols": token_cols, "total": int(tokens.shape[1]), "shown": sample_count},
        "token_sequence": token_sequence,
        "detections": detections,
    }
def _detections_from_outputs(self, image: Image.Image, outputs: Any) -> list[dict[str, Any]]:
target_sizes = torch.tensor([image.size[::-1]], device=self.device)
results = self.processor.post_process_object_detection(
outputs,
@@ -60,3 +132,4 @@ class DetrVehicleDetector:
)
return detections

View File

@@ -33,11 +33,16 @@ class DeviceManager:
if device_num not in {device.device_num for device in self.devices}:
raise ValueError("设备不在 devicelist.env 中")
with self.lock:
old_device_num = self.current_device_num
self.current_device_num = device_num
self.current_url = ""
self.timings = {}
self.updated_at = time.time()
self.version += 1
print(
f"[device-switch] manager set old={old_device_num} new={device_num} version={self.version}",
flush=True,
)
return self.version
def resolve_stream_url(self) -> str:
@@ -49,16 +54,38 @@ class DeviceManager:
return self.fallback_url
raise RuntimeError("devicelist.env 中没有可用设备号")
print(f"[device-switch] resolve start device={device_num} version={version}", flush=True)
try:
result = self.api_client.get_stream_url_details(device_num)
except Exception as exc:
print(
f"[device-switch] resolve failed device={device_num} version={version} error={exc}",
flush=True,
)
raise
with self.lock:
# 避免旧摄像头的慢接口响应覆盖用户刚切换的新选择。
if version != self.version or device_num != self.current_device_num:
print(
f"[device-switch] resolve stale device={device_num} version={version} current={self.current_device_num} current_version={self.version}",
flush=True,
)
return self.current_url
self.current_url = result.url
self.timings = dict(result.timings)
self.updated_at = time.time()
print(f"[device-switch] resolve success device={device_num} version={version}", flush=True)
return result.url
def resolve_stream_url_for(self, device_num: str) -> str:
    """Resolve the stream URL of one configured device.

    Unlike ``resolve_stream_url`` this neither reads nor updates the
    manager's current-device state.

    Raises:
        ValueError: ``device_num`` is not one of ``self.devices``.
    """
    is_known = any(device.device_num == device_num for device in self.devices)
    if not is_known:
        raise ValueError("设备不在 devicelist.env 中")
    return self.api_client.get_stream_url_details(device_num).url
def get_video_grid_devices(self, limit: int = 4) -> list[Device]:
    """Return the first ``limit`` configured devices for the video grid.

    Args:
        limit: Maximum number of devices to return. Zero or a negative value
            yields an empty list (a bare ``[:limit]`` slice would instead
            drop devices from the end for negative values).

    Returns:
        A new list holding at most ``limit`` entries of ``self.devices``.
    """
    return self.devices[: max(limit, 0)]
def get_snapshot(self) -> dict[str, Any]:
with self.lock:
return {

View File

@@ -45,6 +45,18 @@ worker = StreamWorker(
resize_width=settings.resize_width,
)
# Devices shown in the multi-camera grid (get_video_grid_devices defaults to a limit of 4).
video_grid_devices = device_manager.get_video_grid_devices()
# One StreamWorker per grid device, keyed by device number. Every grid worker
# is given the same detector instance and the same frame/JPEG settings.
video_grid_workers = {
device.device_num: StreamWorker(
# device_num is bound as a lambda default so each resolver keeps its own
# device number instead of whatever `device` refers to when it is called.
stream_url=lambda device_num=device.device_num: device_manager.resolve_stream_url_for(device_num),
detector=detector,
frame_skip=settings.frame_skip,
jpeg_quality=settings.jpeg_quality,
resize_width=settings.resize_width,
)
for device in video_grid_devices
}
app = FastAPI(title="DETR 动态打标")
app.mount("/static", StaticFiles(directory="app/static"), name="static")
templates = Jinja2Templates(directory="app/templates")
@@ -57,11 +69,15 @@ def display_model_name(model_name: str) -> str:
@app.on_event("startup")
def startup() -> None:
    """Start the primary stream worker, then every grid worker."""
    for stream_worker in (worker, *video_grid_workers.values()):
        stream_worker.start()
@app.on_event("shutdown")
def shutdown() -> None:
    """Stop the primary stream worker, then every grid worker."""
    for stream_worker in (worker, *video_grid_workers.values()):
        stream_worker.stop()
@app.get("/", response_class=HTMLResponse)
@@ -73,15 +89,67 @@ def index(request: Request) -> HTMLResponse:
"model": display_model_name(settings.detr_model),
"device": detector.device_name,
"stream_url": f"设备号:{device_manager.get_snapshot()['current_device_num']}",
"video_grid_devices": video_grid_devices,
},
)
@app.get("/tokenizer", response_class=HTMLResponse)
def tokenizer(request: Request) -> HTMLResponse:
    """Render the tokenizer visualisation page."""
    context = {
        "request": request,
        "model": display_model_name(settings.detr_model),
        "device": detector.device_name,
    }
    return templates.TemplateResponse("tokenizer.html", context)
@app.get("/tokenizer/state")
def tokenizer_state() -> JSONResponse:
    """Return the token inspection of the primary worker's latest frame.

    While no frame is available the payload only carries ``ready: False``
    plus the worker's connection state; otherwise it is the detector's
    ``inspect_tokens`` result extended with the worker snapshot fields.
    """
    snapshot = worker.get_snapshot()
    frame = worker.get_frame_rgb()

    if frame is not None:
        payload = {
            **detector.inspect_tokens(frame),
            "ready": True,
            "frame_id": snapshot["frame_id"],
            "updated_at": snapshot["updated_at"],
            "connected": snapshot["connected"],
            "error": snapshot["error"],
        }
        return JSONResponse(payload)

    waiting = {
        "ready": False,
        "frame_id": snapshot["frame_id"],
        "connected": snapshot["connected"],
        "error": snapshot["error"] or "等待视频帧",
    }
    return JSONResponse(waiting)
@app.get("/video")
def video() -> StreamingResponse:
    """Stream the primary worker's frames."""
    primary_stream = stream_video(worker)
    return primary_stream
@app.get("/video/{device_num}")
def video_device(device_num: str) -> StreamingResponse:
    """Stream the frames of one grid device; 404 when it has no grid worker."""
    selected = video_grid_workers.get(device_num)
    if selected is not None:
        return stream_video(selected)
    raise HTTPException(status_code=404, detail="设备不在视频网格中")
def stream_video(stream_worker: StreamWorker) -> StreamingResponse:
async def generate():
while True:
frame = worker.get_jpeg()
frame = stream_worker.get_jpeg()
if frame is None:
await asyncio.sleep(0.1)
continue
@@ -137,11 +205,13 @@ def status() -> JSONResponse:
@app.post("/devices/{device_num}")
def switch_device(device_num: str) -> JSONResponse:
    """Make ``device_num`` the current device and reconnect the primary worker.

    The device manager is updated exactly once; the version it returns is
    logged and echoed back in the response.

    Raises:
        HTTPException: 404 when the device manager rejects ``device_num``
            with a ``ValueError``.
    """
    try:
        version = device_manager.set_current_device(device_num)
    except ValueError as exc:
        print(f"[device-switch] invalid device={device_num}", flush=True)
        raise HTTPException(status_code=404, detail=str(exc)) from exc
    worker.reconnect()
    print(f"[device-switch] accepted device={device_num} version={version}", flush=True)
    return JSONResponse({"current_device_num": device_num, "version": version})
@app.websocket("/ws/detections")

View File

@@ -13,7 +13,10 @@ const timingFrameEl = document.querySelector("#timing-frame");
let selectedDevice = "";
let pendingDevice = "";
let queuedDevice = "";
let switching = false;
let devicesSignature = "";
let lastWsSignature = "";
function setConnection(online, text) {
connection.textContent = text;
@@ -84,16 +87,54 @@ function renderDetections(detections) {
.join("");
}
/**
 * Ask the server to switch to `deviceNum`, reload the video images, then run
 * the most recently queued switch (if any) once this one has settled.
 *
 * Failures are reported through the connection badge and the console; the
 * returned promise does not reject for a failed request.
 *
 * @param {string} deviceNum - Device number to switch to.
 * @returns {Promise<void>}
 */
async function performSwitch(deviceNum) {
  switching = true;
  pendingDevice = deviceNum;
  devicesSignature = "";
  setConnection(false, "切换中");
  console.log("[device-switch] start", { deviceNum });
  try {
    const response = await fetch(`/devices/${encodeURIComponent(deviceNum)}`, { method: "POST" });
    if (!response.ok) {
      throw new Error("切换摄像头失败");
    }
    const result = await response.json();
    // One cache-busting value for every image reloaded by this switch.
    const cacheBuster = Date.now();
    // The single-stream image may be absent from the page, so guard the lookup.
    const video = document.querySelector("#video");
    if (video) {
      video.src = `/video?t=${cacheBuster}`;
    }
    document.querySelectorAll(".grid-video").forEach((item) => {
      item.src = `${item.dataset.src}?t=${cacheBuster}`;
    });
    console.log("[device-switch] requested", { deviceNum, version: result.version });
  } catch (error) {
    pendingDevice = "";
    devicesSignature = "";
    setConnection(false, "切换失败");
    console.error("[device-switch] failed", { deviceNum, error });
  } finally {
    switching = false;
  }
  // Drain the queue after the try statement instead of returning from inside
  // `finally`, which would override any in-flight completion.
  const nextDevice = queuedDevice;
  queuedDevice = "";
  if (nextDevice && nextDevice !== deviceNum) {
    return performSwitch(nextDevice);
  }
}
/**
 * Switch to `deviceNum` now, or remember it as the next switch while another
 * one is still in flight (only the latest queued device is kept).
 *
 * @param {string} deviceNum - Device number chosen by the user.
 * @returns {Promise<void>}
 */
function switchDevice(deviceNum) {
  if (!switching) {
    return performSwitch(deviceNum);
  }
  queuedDevice = deviceNum;
  pendingDevice = deviceNum;
  devicesSignature = "";
  setConnection(false, "等待切换");
  console.log("[device-switch] queued", { deviceNum });
  return Promise.resolve();
}
function connectWebSocket() {
@@ -109,6 +150,18 @@ function connectWebSocket() {
errorEl.textContent = data.error || (data.connected ? "正常" : "未连接");
sourceEl.textContent = data.source || "-";
setConnection(Boolean(data.connected), data.connected ? "已连接" : "重连中");
const wsSignature = `${data.current_device_num}|${data.connected}|${data.frame_id}|${data.error || ""}`;
if (wsSignature !== lastWsSignature) {
lastWsSignature = wsSignature;
console.log("[device-switch] ws", {
currentDeviceNum: data.current_device_num,
pendingDevice,
queuedDevice,
connected: data.connected,
frameId: data.frame_id,
error: data.error,
});
}
if (pendingDevice && data.current_device_num === pendingDevice) {
pendingDevice = "";
deviceSelect.disabled = false;

View File

@@ -73,6 +73,30 @@ p {
color: var(--red);
}
.topbar-actions {
display: flex;
align-items: center;
gap: 12px;
}
.button-link {
display: inline-flex;
align-items: center;
justify-content: center;
min-height: 36px;
padding: 8px 14px;
border: 1px solid var(--line);
border-radius: 999px;
color: var(--text);
text-decoration: none;
background: var(--panel);
}
.button-link:hover {
border-color: var(--green);
color: var(--green);
}
.layout {
display: grid;
grid-template-columns: minmax(0, 1fr) 360px;
@@ -92,33 +116,6 @@ p {
overflow: hidden;
}
.pipeline {
display: flex;
align-items: center;
gap: 10px;
padding: 14px;
border-bottom: 1px solid var(--line);
overflow-x: auto;
}
.stage {
flex: 0 0 auto;
padding: 9px 12px;
border: 1px solid var(--line);
border-radius: 10px;
color: var(--muted);
background: var(--panel-2);
}
.stage.active {
border-color: rgba(46, 232, 135, 0.5);
color: var(--green);
}
.arrow {
color: var(--muted);
}
.video-wrap {
display: grid;
place-items: center;
@@ -126,7 +123,8 @@ p {
background: #05070b;
}
#video {
#video,
.grid-video {
display: block;
width: 100%;
height: auto;
@@ -134,6 +132,36 @@ p {
object-fit: contain;
}
.video-grid {
display: grid;
grid-template-columns: repeat(2, minmax(0, 1fr));
gap: 14px;
padding: 14px;
background: #05070b;
}
.video-grid-item {
overflow: hidden;
border: 1px solid var(--line);
border-radius: 14px;
background: var(--panel-2);
}
.video-grid-title {
padding: 10px 12px;
border-bottom: 1px solid var(--line);
color: var(--muted);
font-size: 13px;
}
.video-grid-wrap {
min-height: 240px;
}
.grid-video {
max-height: calc((100vh - 260px) / 2);
}
.side-card {
display: flex;
flex-direction: column;
@@ -166,13 +194,19 @@ p {
background: var(--panel-2);
}
.detections-panel {
padding: 16px;
border-top: 1px solid var(--line);
}
/* Detection cards form a four-column grid. The earlier flex-column
   declarations were dead: the later `display: grid` overrode them. */
.detections {
  display: grid;
  grid-template-columns: repeat(4, minmax(0, 1fr));
  gap: 10px;
}
.detections.empty {
display: block;
color: var(--muted);
}
@@ -197,6 +231,133 @@ p {
font-size: 12px;
}
.tokenizer-page .tokenizer-layout {
display: grid;
grid-template-columns: minmax(0, 1fr) minmax(0, 2fr);
grid-template-areas: "flow side";
align-items: start;
width: 100%;
gap: 18px;
padding: 18px;
}
.tokenizer-page .tokenizer-side {
display: grid;
grid-area: side;
min-width: 0;
gap: 18px;
}
.tokenizer-page .tokenizer-flow-card {
grid-area: flow;
min-width: 0;
min-height: calc(100vh - 122px);
}
.tokenizer-page .tokenizer-side .detections {
grid-template-columns: repeat(3, minmax(0, 1fr));
}
.tokenizer-card {
border: 1px solid var(--line);
border-radius: 18px;
padding: 18px;
background: rgba(21, 27, 38, 0.9);
box-shadow: 0 18px 40px rgba(0, 0, 0, 0.28);
}
.pipeline-steps {
display: grid;
gap: 10px;
}
.pipeline-step {
display: grid;
grid-template-columns: 34px minmax(0, 1fr);
gap: 10px;
align-items: start;
padding: 12px;
border: 1px solid var(--line);
border-radius: 12px;
background: var(--panel-2);
}
.step-index {
display: grid;
place-items: center;
width: 28px;
height: 28px;
border-radius: 999px;
color: #06100b;
font-weight: 700;
background: var(--green);
}
.step-title {
margin-bottom: 5px;
font-weight: 700;
}
.step-value,
.token-summary,
.selected-token {
color: var(--muted);
font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
font-size: 13px;
word-break: break-all;
}
.token-sequence {
display: grid;
grid-template-columns: repeat(12, minmax(0, 1fr));
gap: 8px;
margin-top: 14px;
}
.token-cell {
min-height: 50px;
border: 1px solid var(--line);
border-radius: 10px;
color: var(--text);
cursor: pointer;
background: var(--panel-2);
}
.token-cell span,
.token-cell small {
display: block;
}
.token-cell small {
margin-top: 3px;
color: var(--muted);
}
.token-cell.selected,
.token-cell:hover {
border-color: var(--green);
color: var(--green);
}
.token-detail-title {
margin-bottom: 10px;
color: var(--green);
font-weight: 700;
}
.token-vector {
padding: 12px;
border: 1px solid var(--line);
border-radius: 12px;
background: var(--panel-2);
}
@media (max-width: 1280px) {
.detections {
grid-template-columns: repeat(4, minmax(0, 1fr));
}
}
@media (max-width: 980px) {
.layout {
grid-template-columns: 1fr;
@@ -206,4 +367,18 @@ p {
align-items: flex-start;
flex-direction: column;
}
.detections {
grid-template-columns: repeat(3, minmax(0, 1fr));
}
}
@media (max-width: 640px) {
.video-grid {
grid-template-columns: 1fr;
}
.detections {
grid-template-columns: 1fr;
}
}

131
app/static/tokenizer.js Normal file
View File

@@ -0,0 +1,131 @@
const statusEl = document.querySelector("#tokenizer-status");
const pipelineEl = document.querySelector("#pipeline-steps");
const tokenSummaryEl = document.querySelector("#token-summary");
const tokenSequenceEl = document.querySelector("#token-sequence");
const selectedTokenEl = document.querySelector("#selected-token");
const detectionsEl = document.querySelector("#tokenizer-detections");
let selectedTokenIndex = null;
/**
 * Format a tensor shape for display.
 *
 * @param {number[]|null|undefined} shape - Dimension sizes.
 * @returns {string} "[d0, d1, ...]", or "-" when the shape is missing or empty.
 */
function formatShape(shape) {
  return shape && shape.length ? `[${shape.join(", ")}]` : "-";
}
/**
 * Update the status badge text and its online/offline styling.
 *
 * @param {boolean} ready - True selects the "online" class, false "offline".
 * @param {string} text - Badge label.
 */
function setStatus(ready, text) {
  const { classList } = statusEl;
  statusEl.textContent = text;
  classList.toggle("online", ready);
  classList.toggle("offline", !ready);
}
/**
 * Render the pipeline stages as numbered cards inside #pipeline-steps.
 *
 * Each entry of `steps` is a [title, value] pair; the value is built from the
 * shape lists and image size carried by the /tokenizer/state payload, with
 * "-" shown for anything missing.
 *
 * @param {object} data - Payload returned by /tokenizer/state.
 */
function renderPipeline(data) {
const steps = [
["OpenCV RGB 帧", `${data.image_size?.width ?? "-"} × ${data.image_size?.height ?? "-"}`],
["PIL Image", `${data.image_size?.width ?? "-"} × ${data.image_size?.height ?? "-"}`],
["DetrImageProcessor", `pixel_values ${formatShape(data.pixel_values_shape)} / pixel_mask ${formatShape(data.pixel_mask_shape)}`],
["ResNet-50 backbone", `feature map ${formatShape(data.feature_map_shape)}`],
["1×1 convolution", `projected ${formatShape(data.projected_feature_map_shape)}`],
["视觉 token embedding", `由 projected feature map flatten 得到 ${formatShape(data.visual_tokens_shape)}`],
["位置 embedding", `二维位置 embedding ${formatShape(data.position_encoding_shape)}`],
["Transformer Encoder", formatShape(data.encoder_last_hidden_state_shape)],
["Object query embedding + Decoder", `object query embedding 解码后 ${formatShape(data.decoder_last_hidden_state_shape)}`],
["类别 logits + boxes", `logits ${formatShape(data.logits_shape)} / boxes ${formatShape(data.pred_boxes_shape)}`],
["post_process_object_detection", `检测结果 ${data.detections?.length ?? 0}`],
];
// The card number is the 1-based position of the step in the list above.
pipelineEl.innerHTML = steps
.map(([title, value], index) => `
<div class="pipeline-step">
<div class="step-index">${index + 1}</div>
<div>
<div class="step-title">${title}</div>
<div class="step-value">${value}</div>
</div>
</div>
`)
.join("");
}
/**
 * Render the token summary line, the grid of sampled tokens and the detail
 * panel of the selected token.
 *
 * @param {object} data - Payload returned by /tokenizer/state; reads
 *   `frame_id`, `token_grid` and `token_sequence`.
 */
function renderTokens(data) {
  const grid = data.token_grid || {};
  tokenSummaryEl.textContent = `帧号 ${data.frame_id ?? "-"} · token 网格 ${grid.rows ?? "-"} × ${grid.cols ?? "-"},总数 ${grid.total ?? "-"},展示前 ${grid.shown ?? 0} 个 token,每个显示前 8 维采样。`;

  // Resolve the selection first: renderSelectedToken falls back to the first
  // token and stores its index, so the grid built below can highlight it even
  // on the very first render.
  renderSelectedToken(data);

  tokenSequenceEl.innerHTML = (data.token_sequence || [])
    .map((token) => `
      <button class="token-cell ${token.index === selectedTokenIndex ? "selected" : ""}" data-index="${token.index}">
        <span>#${token.index}</span>
        <small>(${token.row}, ${token.col})</small>
      </button>
    `)
    .join("");

  tokenSequenceEl.querySelectorAll(".token-cell").forEach((button) => {
    button.addEventListener("click", () => {
      selectedTokenIndex = Number(button.dataset.index);
      // renderTokens refreshes the detail panel itself; one call is enough.
      renderTokens(data);
    });
  });
}
/**
 * Show the sampled vector of the selected token in #selected-token.
 *
 * When no token matches `selectedTokenIndex` the first token is used, and
 * `selectedTokenIndex` is updated to the token that ends up being shown.
 *
 * @param {object} data - Payload returned by /tokenizer/state; reads `token_sequence`.
 */
function renderSelectedToken(data) {
const tokens = data.token_sequence || [];
// Fall back to the first token when the previous selection is not in this payload.
const token = tokens.find((item) => item.index === selectedTokenIndex) || tokens[0];
if (!token) {
selectedTokenEl.textContent = "暂无 token。";
return;
}
selectedTokenIndex = token.index;
selectedTokenEl.innerHTML = `
<div class="token-detail-title">Token #${token.index} · 网格位置 (${token.row}, ${token.col}) · L2 ${token.magnitude}</div>
<div class="token-vector">[${token.values.map((value) => Number(value).toFixed(4)).join(", ")}, ...]</div>
`;
}
/**
 * Render the detection cards inside #tokenizer-detections, or an
 * empty-state message when the list is empty.
 *
 * @param {Array<{label: string, score: number, box: number[]}>} detections
 *   Detections to show; the score is displayed as a percentage.
 */
function renderDetections(detections) {
if (!detections.length) {
// The "empty" class switches the container to the empty-state styling.
detectionsEl.className = "detections empty";
detectionsEl.textContent = "暂无目标";
return;
}
detectionsEl.className = "detections";
detectionsEl.innerHTML = detections
.map((det) => `
<div class="det-item">
<div class="det-title">
<span>${det.label}</span>
<span>${(det.score * 100).toFixed(1)}%</span>
</div>
<div class="det-box">box: [${det.box.join(", ")}]</div>
</div>
`)
.join("");
}
/**
 * Fetch /tokenizer/state, re-render the page, and schedule the next poll.
 *
 * The next request is scheduled only after the current one has settled, so
 * polls never overlap. HTTP errors are reported with their status code
 * rather than as a JSON parse failure.
 *
 * @returns {Promise<void>}
 */
async function refreshTokenizer() {
  try {
    const response = await fetch(`/tokenizer/state?t=${Date.now()}`);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }
    const data = await response.json();
    if (!data.ready) {
      setStatus(false, data.error || "等待帧");
      tokenSummaryEl.textContent = data.error || "等待视频帧";
      return;
    }
    setStatus(Boolean(data.connected), data.connected ? "动态更新中" : "未连接");
    renderPipeline(data);
    renderTokens(data);
    renderDetections(data.detections || []);
  } catch (error) {
    setStatus(false, "更新失败");
    tokenSummaryEl.textContent = `更新失败:${error}`;
  } finally {
    // Runs after the early return above as well, keeping the poll loop alive.
    setTimeout(refreshTokenizer, 1200);
  }
}
refreshTokenizer();

View File

@@ -26,6 +26,7 @@ class StreamWorker:
self.lock = threading.Lock()
self.latest_jpeg: bytes | None = None
self.latest_frame_rgb: Any | None = None
self.latest_detections: list[dict[str, Any]] = []
self.frame_id = 0
self.updated_at = 0.0
@@ -56,21 +57,30 @@ class StreamWorker:
# Clear everything published for the current source and flag a reconnect for
# the worker loop, which reads reconnect_requested / reconnect_version.
# NOTE(review): indentation is not visible in this view. The final print uses
# the local `version` copy, so it presumably runs after the lock is released
# - confirm against the real file.
def reconnect(self) -> None:
with self.lock:
# Drop the last published frame and detections of the old source.
self.latest_jpeg = None
self.latest_frame_rgb = None
self.latest_detections = []
self.frame_id = 0
self.fps = 0.0
# Raise the flag and bump the version that the reconnect logs report.
self.reconnect_requested = True
self.reconnect_version += 1
version = self.reconnect_version
self.connected = False
self.error = "正在切换视频源"
# Timings belong to the previous connection attempt; start again from zero.
self.resolve_ms = 0.0
self.open_ms = 0.0
self.first_frame_ms = 0.0
print(f"[device-switch] worker reconnect requested version={version}", flush=True)
def get_jpeg(self) -> bytes | None:
    """Return the latest encoded JPEG frame, or None when there is none."""
    with self.lock:
        jpeg = self.latest_jpeg
    return jpeg
def get_frame_rgb(self) -> Any | None:
    """Return a copy of the latest RGB frame, or None when there is none.

    The copy is taken while the lock is held, so callers get a frame that
    later updates by the worker do not touch.
    """
    with self.lock:
        frame = self.latest_frame_rgb
        return None if frame is None else frame.copy()
def get_snapshot(self) -> dict[str, Any]:
with self.lock:
return {
@@ -100,16 +110,29 @@ class StreamWorker:
run_version = self.reconnect_version
self.reconnect_requested = False
if should_reconnect:
print(f"[device-switch] worker reconnect handling version={run_version}", flush=True)
# 切换摄像头时必须释放旧连接,否则 OpenCV 会继续阻塞读旧流。
if cap is not None:
cap.release()
cap = None
print(f"[device-switch] worker released old capture version={run_version}", flush=True)
if cap is None or not cap.isOpened():
started = time.monotonic()
try:
stream_url = self.stream_url() if callable(self.stream_url) else self.stream_url
except Exception as exc:
resolve_ms = round((time.monotonic() - started) * 1000, 2)
with self.lock:
self.resolve_ms = resolve_ms
self.open_ms = 0.0
self.first_frame_ms = 0.0
self._set_connection_state(False, f"获取播放地址失败:{exc}2 秒后重试")
time.sleep(2)
continue
resolve_ms = round((time.monotonic() - started) * 1000, 2)
started = time.monotonic()
print(f"[device-switch] worker open start version={run_version}", flush=True)
cap = cv2.VideoCapture(stream_url)
open_ms = round((time.monotonic() - started) * 1000, 2)
with self.lock:
@@ -117,11 +140,13 @@ class StreamWorker:
self.resolve_ms = resolve_ms
self.first_frame_ms = 0.0
if not cap.isOpened():
print(f"[device-switch] worker open failed version={run_version} open_ms={open_ms}", flush=True)
self._set_connection_state(False, "无法打开视频流2 秒后重试")
cap.release()
cap = None
time.sleep(2)
continue
print(f"[device-switch] worker open success version={run_version} open_ms={open_ms}", flush=True)
self._set_connection_state(True, "已连接")
started = time.monotonic()
@@ -143,9 +168,9 @@ class StreamWorker:
frame = self._resize(frame)
self.frame_id += 1
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
if self.frame_id % self.frame_skip == 0:
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
last_detections = self.detector.detect(frame_rgb)
annotated = self._draw(frame, last_detections)
@@ -173,6 +198,7 @@ class StreamWorker:
with self.lock:
self.latest_jpeg = jpeg.tobytes()
self.latest_frame_rgb = frame_rgb.copy()
self.latest_detections = list(last_detections)
self.updated_at = time.time()
self.connected = True

View File

@@ -9,26 +9,30 @@
<body>
<header class="topbar">
<div>
<h1>DETR 动态打标</h1>
<p>使用 Python、OpenCV、PyTorch、Transformers DETR 和 FastAPIMac mini m2 上运行</p>
<!-- <h1>DETR 动态打标</h1> -->
<p>DETR动态打标Python、OpenCV、PyTorch、Transformers DETR 和 FastAPI。</p>
</div>
<div class="topbar-actions">
<a class="button-link" href="/tokenizer">tokenizer</a>
<div class="badge" id="connection">连接中</div>
</div>
</header>
<main class="layout">
<section class="video-card">
<div class="pipeline">
<div class="stage active">源节点</div>
<div class="arrow"></div>
<div class="stage active">DETR 推理</div>
<div class="arrow"></div>
<div class="stage active">OSD 打标</div>
<div class="arrow"></div>
<div class="stage active">FastAPI 输出</div>
<div class="video-grid">
{% for device_item in video_grid_devices %}
<article class="video-grid-item">
<div class="video-grid-title">{{ device_item.name }} · {{ device_item.device_num }}</div>
<div class="video-wrap video-grid-wrap">
<img class="grid-video" src="/video/{{ device_item.device_num | urlencode }}" data-src="/video/{{ device_item.device_num | urlencode }}" alt="{{ device_item.name }} 视频流" />
</div>
<div class="video-wrap">
<img id="video" src="/video" alt="动态打标视频流" />
</article>
{% endfor %}
</div>
<section class="detections-panel">
<div id="detections" class="detections empty">暂无目标</div>
</section>
</section>
<aside class="side-card">
@@ -69,11 +73,6 @@
<dd id="timing-frame">-</dd>
</dl>
</section>
<section>
<h2>检测结果</h2>
<div id="detections" class="detections empty">暂无目标</div>
</section>
</aside>
</main>

View File

@@ -0,0 +1,48 @@
<!doctype html>
<html lang="zh-CN">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>DETR Tokenizer 动态可视化</title>
<link rel="stylesheet" href="/static/style.css?v=tokenizer-layout-1-3" />
</head>
<body class="tokenizer-page">
<header class="topbar">
<div>
<!--<h1>DETR Tokenizer 动态可视化</h1>-->
<p>实时展示当前视频帧从图像预处理、视觉特征、token 序列到检测输出的过程。</p>
</div>
<div class="topbar-actions">
<a class="button-link" href="/">返回视频</a>
<div class="badge" id="tokenizer-status">等待帧</div>
</div>
</header>
<main class="tokenizer-layout">
<section class="tokenizer-card tokenizer-flow-card">
<h2>实时流程</h2>
<div class="pipeline-steps" id="pipeline-steps"></div>
</section>
<aside class="tokenizer-side">
<section class="tokenizer-card">
<h2>选中 Token</h2>
<div class="selected-token" id="selected-token">点击下方 token 查看向量采样。</div>
</section>
<section class="tokenizer-card">
<h2>Token 序列</h2>
<div class="token-summary" id="token-summary">等待视频帧</div>
<div class="token-sequence" id="token-sequence"></div>
</section>
<section class="tokenizer-card">
<h2>检测输出</h2>
<div id="tokenizer-detections" class="detections empty">暂无目标</div>
</section>
</aside>
</main>
<script src="/static/tokenizer.js"></script>
</body>
</html>

View File

@@ -1,3 +1,4 @@
# DETR 的视觉 token 化过程说明
本文基于当前项目代码 `app/detector.py` 中的实现说明 DETR 的“token 化”过程。
@@ -58,19 +59,39 @@ projected feature map: [batch, 256, H', W']
flatten 空间维度 H' × W'
visual tokens: [batch, H'×W', 256]
visual token embeddings: [batch, H'×W', 256]
加入二维位置编码
加入二维位置 embedding
Transformer Encoder
Object Queries + Transformer Decoder
Object query embeddings + Transformer Decoder
类别 logits + 边界框 boxes
post_process_object_detection 还原到原图坐标
```
## Embedding 在 1-11 个环节中的位置
在这个 DETR 流程里,embedding 不是单独只有一步,而是出现在 3 个关键环节:
| 页面步骤 | 名称 | embedding 含义 |
| --- | --- | --- |
| 第 6 步 | visual token embedding | `projected feature map` 经过 flatten 后,每个空间网格点变成一个 256 维视觉 token embedding。 |
| 第 7 步 | position embedding | 给每个视觉 token 加入二维位置 embedding,让 Transformer 知道 token 原本在图像中的位置。 |
| 第 9 步 | object query embedding | DETR 使用一组可学习的 object query embeddings 进入 Decoder,每个 query 最终预测一个候选目标。 |
所以如果问“embedding 在 1-11 哪个环节”,最核心的是:
```text
第 6 步:产生视觉 token embedding
第 7 步:加入位置 embedding
第 9 步:object query embedding 进入 Decoder
```
第 6 步是图像内容 embedding,第 7 步是空间位置 embedding,第 9 步是检测目标查询 embedding。
## 第 1 步:图像预处理
代码:
@@ -212,7 +233,7 @@ x = x.permute(2, 0, 1) # [h*w, batch, 256]
两种写法本质相同,只是 Transformer 接口期望的维度顺序不同。
## 第 5 步:加入二维位置编码
## 第 5 步:加入二维位置 embedding
Transformer 本身不理解图像中的二维空间位置。
@@ -264,11 +285,11 @@ Encoder 会通过 self-attention 建模图像中不同区域之间的关系。
- 道路区域可以影响车辆判断。
- 远处小目标可以和周围上下文一起被理解。
## 第 7 步Object Queries 和 Transformer Decoder
## 第 7 步:Object query embedding 和 Transformer Decoder
DETR 与传统检测器不同,它不是先生成大量 anchor box。
它使用一组可学习的 object queries。常见数量是
它使用一组可学习的 object query embeddings。常见数量是
```text
100 个 object queries