update at 2026-06-04 14:09:16

This commit is contained in:
陈赣
2026-06-04 14:09:16 +08:00
parent 41bd03123c
commit 4603914e85
11 changed files with 692 additions and 68 deletions

View File

@@ -36,7 +36,79 @@ class DetrVehicleDetector:
inputs = {key: value.to(self.device) for key, value in inputs.items()}
outputs = self.model(**inputs)
# DETR post-processing needs the original image size; PIL's size is (width, height), so it is reversed to (height, width) before post-processing.
return self._detections_from_outputs(image, outputs)
@torch.no_grad()
def inspect_tokens(self, frame_rgb: Any) -> dict[str, Any]:
    """Run the detector on one frame and report its intermediate tensors.

    The frame is converted to a PIL image, preprocessed, and passed through
    the backbone and input projection by hand so their outputs can be
    inspected; the full model is then run on the same inputs to obtain
    logits, boxes, and detections.

    Args:
        frame_rgb: Array-like accepted by ``Image.fromarray``.
            NOTE(review): presumably an RGB uint8 frame -- confirm with callers.

    Returns:
        A dict holding the image size, the shapes of the tensors seen along
        the way, a preview of up to the first 48 projected tokens (first 8
        channels of each), and the post-processed detections whose label is
        in ``self.vehicle_labels``.
    """

    def shape_of(tensor: Any) -> list[int]:
        # Absent model outputs are reported as an empty shape.
        return [] if tensor is None else list(tensor.shape)

    picture = Image.fromarray(frame_rgb)
    encoded = self.processor(images=picture, return_tensors="pt")
    model_inputs = {name: tensor.to(self.device) for name, tensor in encoded.items()}
    pixel_values = model_inputs["pixel_values"]
    pixel_mask = model_inputs["pixel_mask"]

    # Manual pass through the backbone and projection; only the last
    # feature level returned by the backbone is inspected.
    detr = self.model.model
    backbone_features, position_maps = detr.backbone(pixel_values, pixel_mask)
    feature_map, _ = backbone_features[-1]
    projected = detr.input_projection(feature_map)
    # Collapse everything from dim 2 onward, then swap the last two dims so
    # dim 1 indexes tokens.
    visual_tokens = projected.flatten(2).permute(0, 2, 1)
    position_tokens = position_maps[-1].flatten(2).permute(0, 2, 1)

    # Regular end-to-end forward pass for logits, boxes, and hidden states.
    outputs = self.model(**model_inputs, output_hidden_states=True)
    # PIL reports (width, height); it is reversed here before being passed
    # as target_sizes.
    height_width = torch.tensor([picture.size[::-1]], device=self.device)
    post_processed = self.processor.post_process_object_detection(
        outputs,
        target_sizes=height_width,
        threshold=self.confidence,
    )
    frame_result = post_processed[0]

    grid_rows = int(projected.shape[2])
    grid_cols = int(projected.shape[3])
    total_tokens = int(visual_tokens.shape[1])
    shown = min(48, total_tokens)

    # Preview slice: first `shown` tokens, first 8 channels each. The
    # reported magnitude is the norm of those 8 values only, not of the
    # full token vector.
    preview = visual_tokens[0, :shown, :8].detach().cpu()
    token_sequence = [
        {
            "index": position,
            "row": position // grid_cols,
            "col": position % grid_cols,
            "values": [round(float(component), 4) for component in token.tolist()],
            "magnitude": round(float(token.norm()), 4),
        }
        for position, token in enumerate(preview)
    ]

    detections = []
    candidates = zip(frame_result["scores"], frame_result["labels"], frame_result["boxes"])
    for confidence, class_id, corners in candidates:
        class_name = self.model.config.id2label[class_id.item()]
        if class_name in self.vehicle_labels:
            # Box coordinates are cast to integers.
            left, top, right, bottom = corners.detach().cpu().numpy().astype(int).tolist()
            detections.append(
                {
                    "label": class_name,
                    "score": round(float(confidence.detach().cpu()), 4),
                    "box": [left, top, right, bottom],
                }
            )

    width, height = picture.size
    return {
        "image_size": {"width": width, "height": height},
        "pixel_values_shape": list(pixel_values.shape),
        "pixel_mask_shape": list(pixel_mask.shape),
        "feature_map_shape": list(feature_map.shape),
        "projected_feature_map_shape": list(projected.shape),
        "visual_tokens_shape": list(visual_tokens.shape),
        "position_encoding_shape": list(position_tokens.shape),
        "encoder_last_hidden_state_shape": shape_of(getattr(outputs, "encoder_last_hidden_state", None)),
        "decoder_last_hidden_state_shape": shape_of(getattr(outputs, "last_hidden_state", None)),
        "logits_shape": list(outputs.logits.shape),
        "pred_boxes_shape": list(outputs.pred_boxes.shape),
        "token_grid": {
            "rows": grid_rows,
            "cols": grid_cols,
            "total": total_tokens,
            "shown": shown,
        },
        "token_sequence": token_sequence,
        "detections": detections,
    }
def _detections_from_outputs(self, image: Image.Image, outputs: Any) -> list[dict[str, Any]]:
target_sizes = torch.tensor([image.size[::-1]], device=self.device)
results = self.processor.post_process_object_detection(
outputs,
@@ -60,3 +132,4 @@ class DetrVehicleDetector:
)
return detections