毫米波雷达与摄像头融合目标检测综述:2024-2025 前沿进展

毫米波雷达与摄像头融合目标检测综述:2024-2025 前沿进展

背景

毫米波雷达与摄像头融合已成为自动驾驶感知系统的核心技术,特别是在恶劣天气和光照条件下的鲁棒性优势明显。


融合架构分类

按融合层次划分

| 融合层次 | 方法 | 优势 | 劣势 |
| --- | --- | --- | --- |
| 数据级融合 | 点云融合 | 信息保留最完整 | 计算量大 |
| 特征级融合 | BEV 融合 | 平衡性能与效率 | 需精确标定 |
| 决策级融合 | 后处理融合 | 实现简单 | 信息损失大 |

典型架构

以下示例代码(PyTorch)给出一个基于 BEV 特征级融合的雷达-摄像头检测器的简化实现:

import torch
import torch.nn as nn
import torch.nn.functional as F

class RadarCameraFusionDetector(nn.Module):
    """Millimeter-wave radar / camera fusion detector (BEV feature fusion).

    Each modality is encoded separately, mapped onto a shared bird's-eye-view
    (BEV) grid, fused with cross-modal attention and passed to a dense
    detection head.
    """

    # Channel count of the feature map emitted by each supported image trunk.
    # Needed to size the 1x1 projection that brings image features down to
    # ``fusion_dim`` so they match the attention embedding width.
    _IMAGE_BACKBONE_CHANNELS = {'resnet50': 2048}

    def __init__(
        self,
        image_backbone: str = 'resnet50',
        radar_encoder: str = 'pointnet',
        fusion_dim: int = 256,
        num_classes: int = 10
    ):
        """Build all sub-modules.

        Args:
            image_backbone: Name of the image trunk; only 'resnet50' is known.
            radar_encoder: Name of the radar encoder; only 'pointnet' is known.
            fusion_dim: Channel width shared by both BEV feature maps, the
                attention block and the detection head.
            num_classes: Number of classes predicted by the detection head.

        Raises:
            ValueError: If a backbone or encoder name is not recognised.
        """
        super().__init__()

        # Image trunk (raises ValueError first if the name is unknown).
        self.image_backbone = self._build_image_backbone(image_backbone)

        # The trunk's channel count differs from ``fusion_dim``; without this
        # projection the attention block would reject the image BEV features.
        self.image_proj = nn.Conv2d(
            self._IMAGE_BACKBONE_CHANNELS[image_backbone],
            fusion_dim,
            kernel_size=1
        )

        # Radar encoder sized to the fusion width instead of a fixed 256.
        self.radar_encoder = self._build_radar_encoder(
            radar_encoder, output_dim=fusion_dim
        )

        # BEV projection
        self.bev_projector = BEVProjector(fusion_dim)

        # Cross-modal attention
        self.cross_attention = CrossModalAttention(fusion_dim)

        # Detection head
        self.detection_head = DetectionHead(fusion_dim, num_classes)

    def forward(
        self,
        image: torch.Tensor,
        radar_points: torch.Tensor,
        calibration: dict
    ) -> dict:
        """Run the full fusion pipeline.

        Args:
            image: Camera image, shape=(B, 3, H, W).
            radar_points: Radar point cloud, shape=(B, N, 5) with per-point
                fields [x, y, z, v, rcs].
            calibration: Calibration parameters; forwarded untouched to the
                BEV projector.

        Returns:
            The dict produced by the detection head.
        """
        # Image features, reduced to ``fusion_dim`` channels.
        img_features = self.image_proj(self.image_backbone(image))

        # Radar features.
        radar_features = self.radar_encoder(radar_points)
        if radar_features.dim() == 2:
            # The encoder pools over points and yields (B, D); the projector
            # works on (B, N, D), so present the global vector as one point.
            radar_features = radar_features.unsqueeze(1)

        # Project both modalities onto the BEV grid.
        img_bev = self.bev_projector.image_to_bev(img_features, calibration)
        radar_bev = self.bev_projector.radar_to_bev(radar_features, calibration)

        # NOTE(review): attention runs over every BEV cell (H*W tokens per
        # modality), which is memory-hungry on large grids - confirm the grid
        # size is affordable on the target hardware.
        fused_bev = self.cross_attention(img_bev, radar_bev)

        return self.detection_head(fused_bev)

    def _build_image_backbone(self, name: str) -> nn.Module:
        """Return the convolutional trunk for ``name``.

        For 'resnet50' the last two children of the torchvision model are
        dropped so a spatial feature map is returned.

        Raises:
            ValueError: If ``name`` is not a supported backbone.
        """
        import torchvision.models as models
        if name == 'resnet50':
            if hasattr(models, 'ResNet50_Weights'):
                # Newer torchvision: same ImageNet weights that the deprecated
                # ``pretrained=True`` flag selects.
                model = models.resnet50(
                    weights=models.ResNet50_Weights.IMAGENET1K_V1
                )
            else:
                # Older torchvision without the weights enum.
                model = models.resnet50(pretrained=True)
            return nn.Sequential(*list(model.children())[:-2])
        raise ValueError(f"Unknown backbone: {name}")

    def _build_radar_encoder(self, name: str, output_dim: int = 256) -> nn.Module:
        """Return the radar point encoder for ``name``.

        Args:
            name: Encoder identifier; only 'pointnet' is supported.
            output_dim: Width of the encoded radar feature.

        Raises:
            ValueError: If ``name`` is not a supported encoder.
        """
        if name == 'pointnet':
            return PointNetEncoder(input_dim=5, output_dim=output_dim)
        raise ValueError(f"Unknown encoder: {name}")


class BEVProjector(nn.Module):
    """Map per-modality features onto a fixed-size BEV grid.

    Both projections are simple placeholders: neither performs a geometric
    transform, and the ``calibration`` argument is accepted but not read.
    """

    def __init__(self, feature_dim: int, bev_size: tuple = (200, 200)):
        """Store the projector configuration.

        Args:
            feature_dim: Channel count of the radar BEV output.
            bev_size: (height, width) of the BEV grid in cells.
        """
        super().__init__()
        self.feature_dim = feature_dim
        self.bev_size = tuple(bev_size)

    def image_to_bev(
        self,
        img_features: torch.Tensor,
        calibration: dict
    ) -> torch.Tensor:
        """Resample an image feature map to the BEV grid size.

        Args:
            img_features: Feature map of shape (B, C, H, W).
            calibration: Currently unused.

        Returns:
            Tensor of shape (B, C, *bev_size); the channel count is unchanged.
        """
        # NOTE(review): no depth estimation or extrinsic projection happens
        # here - the map is only adaptively average-pooled to the grid size.
        return F.adaptive_avg_pool2d(img_features, self.bev_size)

    def radar_to_bev(
        self,
        radar_features: torch.Tensor,
        calibration: dict
    ) -> torch.Tensor:
        """Broadcast radar features over the whole BEV grid.

        Args:
            radar_features: Per-point features of shape (B, N, D), which are
                averaged over the N points, or an already pooled (B, D) tensor.
            calibration: Currently unused.

        Returns:
            Tensor of shape (B, feature_dim, *bev_size) in which every cell
            holds the same pooled feature vector.

        Raises:
            ValueError: If ``radar_features`` is neither 2-D nor 3-D.
        """
        if radar_features.dim() == 3:
            pooled = radar_features.mean(dim=1)
        elif radar_features.dim() == 2:
            # Encoders that pool internally already return (B, D).
            pooled = radar_features
        else:
            raise ValueError(
                "radar_features must have shape (B, D) or (B, N, D), got "
                f"{tuple(radar_features.shape)}"
            )

        # No voxelization is done: the pooled vector is tiled to every cell.
        bev = pooled.unsqueeze(-1).unsqueeze(-1)
        bev = F.interpolate(bev, size=self.bev_size, mode='nearest')
        # expand() only succeeds when D equals feature_dim (or D == 1).
        return bev.expand(-1, self.feature_dim, -1, -1)


class CrossModalAttention(nn.Module):
    """Cross-modal attention: image BEV cells query radar BEV cells."""

    def __init__(self, feature_dim: int, num_heads: int = 8):
        """Create the attention and normalisation layers.

        Args:
            feature_dim: Channel count of both BEV maps (attention embed dim).
            num_heads: Number of attention heads.
        """
        super().__init__()
        self.attention = nn.MultiheadAttention(
            embed_dim=feature_dim,
            num_heads=num_heads,
            batch_first=True
        )
        self.norm = nn.LayerNorm(feature_dim)

    def forward(
        self,
        img_bev: torch.Tensor,
        radar_bev: torch.Tensor
    ) -> torch.Tensor:
        """Fuse radar information into the image BEV map.

        Args:
            img_bev: Image BEV features, shape (B, C, H, W); used as queries.
            radar_bev: Radar BEV features with C channels; used as keys and
                values.

        Returns:
            Fused features with the same shape as ``img_bev``.
        """
        B, C, H, W = img_bev.shape

        # Flatten the spatial grid into a token sequence.
        img_flat = img_bev.flatten(2).transpose(1, 2)  # (B, H*W, C)
        radar_flat = radar_bev.flatten(2).transpose(1, 2)

        # The attention map itself is never used, so skip building it.
        fused, _ = self.attention(
            img_flat, radar_flat, radar_flat, need_weights=False
        )
        # Residual connection followed by layer normalisation.
        fused = self.norm(img_flat + fused)

        # Restore the (B, C, H, W) layout.
        return fused.transpose(1, 2).reshape(B, C, H, W)


class DetectionHead(nn.Module):
    """Dense detection head: per-cell class logits and box regression."""

    def __init__(self, feature_dim: int, num_classes: int, box_dim: int = 4):
        """Create the two 1x1 convolution branches.

        Args:
            feature_dim: Channel count of the incoming feature map.
            num_classes: Number of class logits predicted per cell.
            box_dim: Number of box regression values predicted per cell.
        """
        super().__init__()
        self.classifier = nn.Conv2d(feature_dim, num_classes, 1)
        self.regressor = nn.Conv2d(feature_dim, box_dim, 1)  # bbox

    def forward(self, features: torch.Tensor) -> dict:
        """Apply both branches to ``features``.

        Args:
            features: Feature map of shape (B, feature_dim, H, W).

        Returns:
            Dict with 'class_logits' of shape (B, num_classes, H, W) and
            'bbox_regression' of shape (B, box_dim, H, W).
        """
        return {
            'class_logits': self.classifier(features),
            'bbox_regression': self.regressor(features)
        }


class PointNetEncoder(nn.Module):
    """Simplified PointNet: shared per-point MLP followed by mean pooling."""

    def __init__(self, input_dim: int, output_dim: int):
        """Create the shared MLP.

        Args:
            input_dim: Number of values per input point.
            output_dim: Width of the pooled output feature.
        """
        super().__init__()
        self.output_dim = output_dim
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim)
        )

    def forward(self, points: torch.Tensor) -> torch.Tensor:
        """Encode a point cloud into one feature vector per sample.

        Args:
            points: Tensor of shape (B, N, input_dim).

        Returns:
            Tensor of shape (B, output_dim): the mean of the per-point MLP
            outputs, or all zeros when the cloud has no points.
        """
        if points.shape[1] == 0:
            # Averaging over zero points would yield NaN and poison every
            # downstream layer; an empty cloud maps to a zero feature instead.
            return points.new_zeros(points.shape[0], self.output_dim)
        return self.mlp(points).mean(dim=1)

主要数据集

| 数据集 | 传感器配置 | 场景 | 规模 |
| --- | --- | --- | --- |
| nuScenes | 6 摄像头 + 5 雷达 + 1 激光雷达 | 城市 | 1000 场景 |
| Waymo Open | 5 摄像头 + 5 激光雷达 | 城市 | 1150 场景 |
| Astyx | 1 摄像头 + 1 雷达 | 高速 | 546 帧 |
| RADIal | 1 摄像头 + 1 雷达 | 多场景 | 10K 帧 |

性能对比(nuScenes 验证集)

| 方法 | NDS | mAP | 推理速度 |
| --- | --- | --- | --- |
| BEVFusion | 0.516 | 0.416 | 12 FPS |
| TransFusion | 0.528 | 0.436 | 8 FPS |
| RC-BEVFusion | 0.534 | 0.448 | 10 FPS |
| TRCFusion (2025) | 0.552 | 0.461 | 11 FPS |

IMS 应用启示

车内监测融合方案

| 场景 | 摄像头优势 | 雷达优势 | 融合收益 |
| --- | --- | --- | --- |
| 儿童检测 | 分类准确 | 遮挡鲁棒 | 误报降低 |
| 乘员计数 | 精确 | 覆盖广 | 全车覆盖 |
| 姿态估计 | 细节丰富 | 穿透力强 | OOP 准确 |

部署建议

  1. 传感器配置: 60GHz 雷达 + RGB-IR 摄像头
  2. 融合策略: 特征级 BEV 融合
  3. 计算平台: Qualcomm QCS8255 / TI TDA4

参考文献:

  1. “Radar and Camera Fusion for Object Detection and Tracking: A Comprehensive Survey”, arXiv 2024
  2. “TRCFusion: Temporal-Enhanced Radar and Camera Fusion”, ACM TOMM 2024
  3. “Advances in object detection for autonomous driving using mmwave radar and camera”, Springer 2025

毫米波雷达与摄像头融合目标检测综述:2024-2025 前沿进展
https://dapalm.com/2026/06/04/2026-06-04-毫米波雷达与摄像头融合目标检测综述2024-2025前沿进展/
作者
Mars
发布于
2026年6月4日
许可协议