update at 2026-06-04 14:09:16
This commit is contained in:
@@ -36,7 +36,79 @@ class DetrVehicleDetector:
|
||||
inputs = {key: value.to(self.device) for key, value in inputs.items()}
|
||||
|
||||
outputs = self.model(**inputs)
|
||||
# DETR post-processing needs the original image size; PIL's size is (width, height), so it is converted to (height, width) here.
|
||||
return self._detections_from_outputs(image, outputs)
|
||||
|
||||
@torch.no_grad()
def inspect_tokens(self, frame_rgb: Any) -> dict[str, Any]:
    """Run one frame through the model and summarise its intermediate tensors.

    The frame is preprocessed with ``self.processor``, then pushed through
    the backbone and input projection by hand so their outputs can be
    inspected, and finally through the full model to obtain detections.

    Args:
        frame_rgb: Array accepted by ``Image.fromarray``; presumably an RGB
            video frame, judging by the name (confirm against callers).

    Returns:
        A dict with, in this order: the image size, the shapes of the
        preprocessed inputs, of the last backbone feature map, of its
        projection, of the flattened visual tokens, of the flattened
        second backbone output, of the encoder/decoder hidden states
        (``[]`` when the model output lacks them), of the logits and
        predicted boxes, the token grid dimensions, a preview of the
        first tokens, and the detections whose label is in
        ``self.vehicle_labels``.

    Note:
        The token preview keeps at most 48 tokens and the first 8 channels
        of each; ``"magnitude"`` is the norm of that truncated slice, not
        of the full token vector.
    """

    def shape_or_empty(tensor: Any) -> list[int]:
        # Optional model outputs are reported as an empty shape.
        return [] if tensor is None else list(tensor.shape)

    image = Image.fromarray(frame_rgb)
    encoded = self.processor(images=image, return_tensors="pt")
    inputs = {name: tensor.to(self.device) for name, tensor in encoded.items()}
    pixel_values = inputs["pixel_values"]
    pixel_mask = inputs["pixel_mask"]

    # Call the backbone and projection directly to expose what the full
    # forward pass below does not return.
    detr = self.model.model
    features, object_queries_list = detr.backbone(pixel_values, pixel_mask)
    feature_map, _ = features[-1]
    projected_feature_map = detr.input_projection(feature_map)

    # Collapse dims 2+ into one axis and move it in front of the channels.
    tokens = projected_feature_map.flatten(2).permute(0, 2, 1)
    object_queries = object_queries_list[-1].flatten(2).permute(0, 2, 1)

    outputs = self.model(**inputs, output_hidden_states=True)
    # PIL reports (width, height); reversed here to (height, width).
    target_sizes = torch.tensor([image.size[::-1]], device=self.device)
    results = self.processor.post_process_object_detection(
        outputs,
        target_sizes=target_sizes,
        threshold=self.confidence,
    )[0]

    grid_rows = int(projected_feature_map.shape[2])
    grid_cols = int(projected_feature_map.shape[3])
    total_tokens = int(tokens.shape[1])
    shown = min(48, total_tokens)

    preview = tokens[0, :shown, :8].detach().cpu()
    token_sequence = [
        {
            "index": position,
            # Grid coordinates recovered from the flattened index.
            "row": position // grid_cols,
            "col": position % grid_cols,
            "values": [round(float(component), 4) for component in token.tolist()],
            "magnitude": round(float(token.norm()), 4),
        }
        for position, token in enumerate(preview)
    ]

    detections = []
    candidates = zip(results["scores"], results["labels"], results["boxes"])
    for score, label, box in candidates:
        label_name = self.model.config.id2label[label.item()]
        if label_name in self.vehicle_labels:
            # Box coordinates are truncated to integers.
            left, top, right, bottom = box.detach().cpu().numpy().astype(int).tolist()
            detections.append(
                {
                    "label": label_name,
                    "score": round(float(score.detach().cpu()), 4),
                    "box": [left, top, right, bottom],
                }
            )

    width, height = image.size[0], image.size[1]
    encoder_state = getattr(outputs, "encoder_last_hidden_state", None)
    decoder_state = getattr(outputs, "last_hidden_state", None)

    return {
        "image_size": {"width": width, "height": height},
        "pixel_values_shape": list(pixel_values.shape),
        "pixel_mask_shape": list(pixel_mask.shape),
        "feature_map_shape": list(feature_map.shape),
        "projected_feature_map_shape": list(projected_feature_map.shape),
        "visual_tokens_shape": list(tokens.shape),
        "position_encoding_shape": list(object_queries.shape),
        "encoder_last_hidden_state_shape": shape_or_empty(encoder_state),
        "decoder_last_hidden_state_shape": shape_or_empty(decoder_state),
        "logits_shape": list(outputs.logits.shape),
        "pred_boxes_shape": list(outputs.pred_boxes.shape),
        "token_grid": {
            "rows": grid_rows,
            "cols": grid_cols,
            "total": total_tokens,
            "shown": shown,
        },
        "token_sequence": token_sequence,
        "detections": detections,
    }
|
||||
|
||||
def _detections_from_outputs(self, image: Image.Image, outputs: Any) -> list[dict[str, Any]]:
|
||||
target_sizes = torch.tensor([image.size[::-1]], device=self.device)
|
||||
results = self.processor.post_process_object_detection(
|
||||
outputs,
|
||||
@@ -60,3 +132,4 @@ class DetrVehicleDetector:
|
||||
)
|
||||
|
||||
return detections
|
||||
|
||||
|
||||
Reference in New Issue
Block a user