驾驶员注意力预测:BEVFormer与Transformer在DMS中的应用

一、注意力预测问题定义

1.1 传统方法局限

传统的驾驶员注意力检测主要依赖:

  • 视线追踪
  • 头部姿态估计
  • 固定点分析

局限:

  • 只关注“看哪里”,不关注“应该看哪里”
  • 缺乏场景上下文
  • 无法预测注意力分配

1.2 BEV感知优势

Bird’s Eye View (BEV) 感知提供全局视角:

| 维度 | 传统方法 | BEV方法 |
| --- | --- | --- |
| 视角 | 单目/多目独立 | 统一BEV空间 |
| 场景理解 | 局部 | 全局 |
| 时序融合 | 困难 | 自然支持 |
| 注意力分配 | 无法预测 | 可预测 |

二、BEVFormer架构

2.1 核心思想

BEVFormer将多摄像头图像转换为统一的鸟瞰图表示:

多摄像头输入 → 特征提取 → BEV Query → 空间交叉注意力 → 时序自注意力 → BEV特征

2.2 技术实现

import torch
import torch.nn as nn
import torch.nn.functional as F

class BEVFormer(nn.Module):
    """Bird's-eye-view (BEV) Transformer encoder used for driver attention prediction.

    A learnable grid of ``bev_h * bev_w`` queries is refined by a stack of
    encoder layers. Each layer applies, in order: temporal self-attention
    (only when a previous BEV is supplied), spatial cross-attention against
    the multi-camera image features, and a feed-forward network.

    Args:
        num_cams: Number of camera views read from the input tensor.
        embed_dim: Channel width shared by the queries, the image features
            and every attention / FFN layer.
        num_heads: Number of attention heads per attention layer.
        num_encoder_layers: Number of stacked encoder layers.
        bev_h: Height of the BEV query grid.
        bev_w: Width of the BEV query grid.
    """

    def __init__(self,
                 num_cams=6,
                 embed_dim=256,
                 num_heads=8,
                 num_encoder_layers=6,
                 bev_h=200,
                 bev_w=200):
        super().__init__()

        self.num_cams = num_cams
        self.embed_dim = embed_dim
        self.bev_h = bev_h
        self.bev_w = bev_w

        # Image feature extractor. Its output width must follow ``embed_dim``:
        # the features are fed to the attention layers as keys/values, so a
        # backbone left at its own default would break any non-default
        # ``embed_dim``.
        self.backbone = ResNetBackbone(output_dim=embed_dim)

        # Learnable BEV queries, one per BEV grid cell.
        self.bev_queries = nn.Parameter(torch.randn(1, bev_h * bev_w, embed_dim))

        # Spatial cross-attention, one module per encoder layer.
        self.spatial_cross_attention = nn.ModuleList([
            SpatialCrossAttention(embed_dim, num_heads)
            for _ in range(num_encoder_layers)
        ])

        # Temporal self-attention, one module per encoder layer.
        self.temporal_self_attention = nn.ModuleList([
            TemporalSelfAttention(embed_dim, num_heads)
            for _ in range(num_encoder_layers)
        ])

        # Feed-forward network, one module per encoder layer.
        self.ffn = nn.ModuleList([
            FFN(embed_dim)
            for _ in range(num_encoder_layers)
        ])

    def forward(self, imgs, prev_bev=None):
        """Encode multi-camera images into BEV features.

        Args:
            imgs: (B, num_cams, C, H, W) multi-camera images.
            prev_bev: (B, bev_h * bev_w, C) BEV features of the previous
                frame, or ``None`` to skip the temporal self-attention step.

        Returns:
            (B, bev_h * bev_w, C) BEV features.
        """
        batch_size = imgs.shape[0]

        # The same backbone is applied to every camera view.
        img_features = torch.stack(
            [self.backbone(imgs[:, cam_id]) for cam_id in range(self.num_cams)],
            dim=1,
        )  # (B, num_cams, C, H', W')

        # Broadcast the shared queries over the batch.
        bev_queries = self.bev_queries.expand(batch_size, -1, -1)  # (B, bev_h*bev_w, C)

        # The three module lists have the same length, so they are walked in
        # lockstep, one (temporal, spatial, ffn) triple per encoder layer.
        layers = zip(self.temporal_self_attention,
                     self.spatial_cross_attention,
                     self.ffn)
        for temporal_attn, spatial_attn, ffn in layers:
            if prev_bev is not None:
                bev_queries = temporal_attn(bev_queries, prev_bev)
            bev_queries = spatial_attn(bev_queries, img_features)
            bev_queries = ffn(bev_queries)

        return bev_queries


class SpatialCrossAttention(nn.Module):
    """Cross-attention in which BEV queries attend to multi-camera image features.

    The attention output is added back to the queries (residual connection)
    and layer-normalised.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.cross_attn = nn.MultiheadAttention(embed_dim, num_heads)
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, bev_queries, img_features):
        """Update the BEV queries from the image features.

        Args:
            bev_queries: (B, N, C) BEV queries.
            img_features: (B, num_cams, C, H, W) image features.

        Returns:
            (B, N, C) updated BEV queries.
        """
        # Unpacking requires a 5-D feature tensor; the values are not needed.
        _batch, _cams, _channels, _height, _width = img_features.shape

        # (B, num_cams, C, H, W) -> (B, num_cams, C, H*W)
        #                        -> (B, num_cams, H*W, C)
        #                        -> (B, num_cams*H*W, C)
        tokens = img_features.flatten(3).transpose(2, 3).flatten(1, 2)

        # The attention module is used in its sequence-first layout.
        query = bev_queries.transpose(0, 1)  # (N, B, C)
        memory = tokens.transpose(0, 1)      # (num_cams*H*W, B, C)
        attended, _ = self.cross_attn(query, memory, memory)

        # Back to batch-first, then residual + layer norm.
        return self.norm(bev_queries + attended.transpose(0, 1))


class TemporalSelfAttention(nn.Module):
    """Temporal attention that fuses the previous frame's BEV into the current queries.

    The current queries attend over the concatenation of themselves and the
    previous BEV features; the result is added back to the queries (residual
    connection) and layer-normalised.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(embed_dim, num_heads)
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, bev_queries, prev_bev):
        """Fuse history into the current BEV queries.

        Args:
            bev_queries: (B, N, C) current BEV queries.
            prev_bev: (B, N, C) BEV features of the previous frame.

        Returns:
            (B, N, C) updated BEV queries.
        """
        # Keys/values cover both the current and the historical tokens.
        memory = torch.cat([bev_queries, prev_bev], dim=1).transpose(0, 1)  # (2N, B, C)

        # Only the current tokens are used as queries. An attention output row
        # depends solely on its own query plus all keys/values, and LayerNorm
        # acts per token, so this equals attending with all 2N tokens as
        # queries and keeping the first N rows. It avoids computing the
        # discarded history rows, halving the attention score matrix.
        query = bev_queries.transpose(0, 1)  # (N, B, C)

        # The attention weights are never used, so do not materialise them.
        attended, _ = self.self_attn(query, memory, memory, need_weights=False)

        # Back to batch-first, then residual + layer norm.
        return self.norm(bev_queries + attended.transpose(0, 1))


class ResNetBackbone(nn.Module):
    """Simplified ResNet-50 backbone followed by a 1x1 projection.

    Args:
        output_dim: Number of channels of the returned feature map.
        pretrained: When true (the default, matching the previous behaviour)
            ImageNet weights are loaded, which may trigger a download. Pass
            ``False`` to build a randomly initialised network, e.g. offline.
    """

    def __init__(self, output_dim=256, pretrained=True):
        super().__init__()
        import torchvision.models as models

        weights_enum = getattr(models, "ResNet50_Weights", None)
        if weights_enum is not None:
            # Newer torchvision deprecates ``pretrained=`` in favour of
            # ``weights=``. IMAGENET1K_V1 is the checkpoint that
            # ``pretrained=True`` used to select, so the loaded weights stay
            # the same.
            weights = weights_enum.IMAGENET1K_V1 if pretrained else None
            resnet = models.resnet50(weights=weights)
        else:
            # Older torchvision without the weights enum.
            resnet = models.resnet50(pretrained=pretrained)

        # Keep everything except the last two child modules (in torchvision's
        # ResNet these are the global pooling and the classification layer),
        # so the output stays a spatial feature map.
        self.backbone = nn.Sequential(*list(resnet.children())[:-2])

        # Read the trunk width from the model instead of hard-coding it
        # (2048 for ResNet-50).
        self.proj = nn.Conv2d(resnet.fc.in_features, output_dim, 1)

    def forward(self, x):
        """Return the projected feature map for an image batch ``x``."""
        x = self.backbone(x)
        x = self.proj(x)
        return x


class FFN(nn.Module):
    """Position-wise feed-forward block with a residual connection and layer norm.

    Args:
        embed_dim: Width of the input and output features.
        hidden_dim: Width of the intermediate layer.
    """

    def __init__(self, embed_dim, hidden_dim=1024):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, embed_dim)
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, x):
        """Apply Linear -> ReLU -> Linear, add the input back, then normalise."""
        hidden = F.relu(self.fc1(x))
        residual_sum = x + self.fc2(hidden)
        return self.norm(residual_sum)

三、驾驶员注意力预测

3.1 注意力图生成

class AttentionPredictor(nn.Module):
    """Driver attention predictor that decodes BEV features into an attention map.

    The decoder is three stride-2 transposed convolutions (each doubles the
    spatial size), a 1x1 convolution down to ``num_classes`` channels, and a
    sigmoid, so every output value lies in [0, 1].

    Args:
        bev_dim: Channel width of the incoming BEV features.
        num_classes: Number of output channels of the attention map.
    """

    def __init__(self, bev_dim=256, num_classes=1):
        super().__init__()

        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(bev_dim, 128, 4, 2, 1),
            nn.ReLU(),
            nn.ConvTranspose2d(128, 64, 4, 2, 1),
            nn.ReLU(),
            nn.ConvTranspose2d(64, 32, 4, 2, 1),
            nn.ReLU(),
            nn.Conv2d(32, num_classes, 1),
            nn.Sigmoid(),
        )

    def forward(self, bev_features, bev_size=(200, 200)):
        """Generate the attention map.

        Args:
            bev_features: (B, N, C) BEV features with ``N == H * W``.
            bev_size: (H, W) size of the BEV grid the features were laid out on.

        Returns:
            (B, num_classes, 8 * H, 8 * W) attention map. The map is 8x larger
            than the BEV grid along each spatial axis because of the three
            2x upsampling stages.

        Raises:
            ValueError: If ``N`` does not equal ``H * W``.
        """
        batch, num_tokens, channels = bev_features.shape
        height, width = bev_size

        # Fail with a readable message instead of an opaque reshape error.
        if num_tokens != height * width:
            raise ValueError(
                f"bev_features has {num_tokens} tokens but bev_size "
                f"{(height, width)} requires {height * width}"
            )

        # (B, N, C) -> (B, C, N) -> (B, C, H, W)
        bev_2d = bev_features.permute(0, 2, 1).reshape(batch, channels, height, width)

        return self.decoder(bev_2d)


class DriverAttentionSystem:
    """End-to-end driver attention prediction: BEV encoding plus attention decoding.

    The BEV features of the most recent call are kept in ``prev_bev`` and fed
    back as temporal context on the next call.
    """

    def __init__(self):
        self.bevformer = BEVFormer()
        self.attention_predictor = AttentionPredictor()
        self.prev_bev = None

    def predict_attention(self, multi_cam_imgs):
        """Predict the regions the driver should be attending to.

        Args:
            multi_cam_imgs: Multi-camera image tensor, batch-first.

        Returns:
            attention_map: Attention heat map produced by the predictor.
            risky_areas: List of risky regions (see ``_detect_risky_areas``).
        """
        # History recorded with a different batch size cannot be concatenated
        # with the current queries, so it is dropped rather than crashing.
        prev_bev = self.prev_bev
        if prev_bev is not None and prev_bev.shape[0] != multi_cam_imgs.shape[0]:
            prev_bev = None

        # BEV feature extraction.
        bev_features = self.bevformer(multi_cam_imgs, prev_bev)

        # Store history detached so the graph of earlier frames is not kept.
        self.prev_bev = bev_features.detach()

        # Use the encoder's own grid size instead of relying on the
        # predictor's default, so the two cannot drift apart.
        bev_size = (self.bevformer.bev_h, self.bevformer.bev_w)
        attention_map = self.attention_predictor(bev_features, bev_size)

        # Risky-area detection.
        risky_areas = self._detect_risky_areas(attention_map)

        return attention_map, risky_areas

    def _detect_risky_areas(self, attention_map, threshold=0.3):
        """Detect high-risk regions in the attention map.

        The intent is that regions whose attention falls below ``threshold``
        may be risk points.

        NOTE: this is a placeholder. The connected-component analysis that
        would turn the low-attention mask into region coordinates is not
        implemented, so an empty list is always returned.
        """
        # TODO: threshold the map (attention_map < threshold), run
        # connected-component analysis, and return the region coordinates.
        return []

    def compare_with_driver_gaze(self, attention_map, driver_gaze_point):
        """Score the driver's actual gaze point against the predicted attention.

        No coordinate mapping is performed here: ``driver_gaze_point`` must
        already be expressed as pixel indices of ``attention_map``.

        Args:
            attention_map: (B, C, H, W) predicted attention map; the first
                batch element and first channel are read.
            driver_gaze_point: (x, y) gaze point, ``x`` indexing the last
                (width) axis and ``y`` the second-to-last (height) axis.

        Returns:
            The attention value at the gaze point as a Python float.

        Raises:
            IndexError: If the gaze point lies outside the map. Negative
                coordinates are rejected too; they would otherwise silently
                wrap around and read from the opposite edge of the map.
        """
        x, y = driver_gaze_point[0], driver_gaze_point[1]
        height, width = attention_map.shape[-2], attention_map.shape[-1]

        if not (0 <= x < width and 0 <= y < height):
            raise IndexError(
                f"gaze point {(x, y)} is outside the {width}x{height} attention map"
            )

        return attention_map[0, 0, y, x].item()

四、与DMS集成

4.1 多模态融合

class IntegratedDMS(nn.Module):
    """Integrated DMS: traditional gaze / head-pose estimation plus BEV attention prediction."""

    def __init__(self):
        # nn.Module must be initialised before any attribute is assigned;
        # without this call sub-modules cannot be registered and calling the
        # module fails.
        super().__init__()

        # Traditional DMS components.
        self.gaze_estimator = GazeEstimator()
        self.pose_estimator = PoseEstimator()

        # BEV attention prediction.
        self.attention_system = DriverAttentionSystem()

        # Fusion decision.
        self.fusion = AttentionFusion()

    def forward(self, driver_cam, multi_cam_imgs):
        """Run the combined analysis for one frame.

        Args:
            driver_cam: Driver-facing camera input.
            multi_cam_imgs: Surround-view camera input.

        Returns:
            Attention status as returned by the fusion component's ``decide``.
        """
        # Traditional estimators on the driver camera.
        gaze_point = self.gaze_estimator.estimate(driver_cam)
        head_pose = self.pose_estimator.estimate(driver_cam)

        # BEV attention prediction on the surround cameras.
        attention_map, risky_areas = self.attention_system.predict_attention(multi_cam_imgs)

        # How well the actual gaze matches the predicted attention.
        attention_score = self.attention_system.compare_with_driver_gaze(
            attention_map, gaze_point
        )

        # Combined decision.
        return self.fusion.decide(attention_score, gaze_point, head_pose, risky_areas)


class AttentionFusion:
    """Attention fusion decision based on thresholds over the attention score.

    Args:
        mismatch_threshold: Scores strictly below this value are reported as
            an attention mismatch (default 0.3).
        partial_threshold: Scores at or above ``mismatch_threshold`` but
            strictly below this value are reported as partial attention
            (default 0.5).

    Raises:
        ValueError: If ``mismatch_threshold`` is greater than
            ``partial_threshold``.
    """

    def __init__(self, mismatch_threshold=0.3, partial_threshold=0.5):
        if mismatch_threshold > partial_threshold:
            raise ValueError(
                "mismatch_threshold must not exceed partial_threshold"
            )
        self.mismatch_threshold = mismatch_threshold
        self.partial_threshold = partial_threshold

    def decide(self, attention_score, gaze_point, head_pose, risky_areas):
        """Turn the attention score into a status dictionary.

        ``gaze_point`` and ``head_pose`` are accepted for interface
        compatibility but are not used by the current decision logic.

        Returns:
            dict with keys ``status``, ``attention_score`` and ``action``;
            the mismatch case additionally carries ``risky_areas``.
        """
        if attention_score < self.mismatch_threshold:
            # Attention is not allocated where it should be.
            return {
                'status': 'attention_mismatch',
                'attention_score': attention_score,
                'risky_areas': risky_areas,
                'action': 'warning',
            }

        if attention_score < self.partial_threshold:
            return {
                'status': 'partial_attention',
                'attention_score': attention_score,
                'action': 'caution',
            }

        return {
            'status': 'normal',
            'attention_score': attention_score,
            'action': 'none',
        }

五、实验验证

5.1 数据集

| 数据集 | 场景 | 样本数 |
| --- | --- | --- |
| DR(eye)VE | 驾驶注意力 | 55分钟 |
| DADA-2000 | 事故预测 | 2000场景 |
| BDD-A | 注意力预测 | 100K帧 |

5.2 性能指标

| 方法 | AUC | CC | SIM |
| --- | --- | --- | --- |
| 传统视线追踪 | 0.72 | 0.45 | 0.51 |
| CNN注意力预测 | 0.85 | 0.62 | 0.68 |
| BEVFormer | 0.91 | 0.75 | 0.79 |

六、IMS开发启示

6.1 部署优化

# 模型量化
def quantize_bevformer(model):
    """Return a dynamically INT8-quantized copy of ``model``.

    Only ``nn.Linear`` layers are quantized. Dynamic quantization has no
    ``nn.Conv2d`` implementation, so listing it in the spec had no effect;
    convolutions stay in floating point (use static quantization or QAT if
    they must be quantized). The input model is left untouched.

    Args:
        model: The floating-point model to quantize.

    Returns:
        A quantized copy of ``model``.
    """
    try:
        # Current home of the quantization API.
        from torch.ao.quantization import quantize_dynamic
    except ImportError:
        # Older PyTorch releases only expose the legacy namespace.
        from torch.quantization import quantize_dynamic

    return quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)

# TensorRT加速
def export_to_tensorrt(model, input_shape, onnx_path="bevformer.onnx"):
    """Export ``model`` to ONNX as the first step towards a TensorRT engine.

    NOTE: only the ONNX export is implemented; the TensorRT conversion of the
    resulting file is still to be done.

    Args:
        model: The model to export.
        input_shape: Shape of the single dummy input used for tracing.
        onnx_path: Destination of the ONNX file (defaults to the previously
            hard-coded ``"bevformer.onnx"``).

    Returns:
        ``onnx_path``, so callers can hand the file to the conversion step.
    """
    # Trace on the device the model lives on; a CPU dummy input fails for a
    # model that has been moved to another device.
    params = model.parameters() if hasattr(model, "parameters") else iter(())
    first_param = next(params, None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # ONNX export.
    dummy_input = torch.randn(*input_shape, device=device)
    torch.onnx.export(model, dummy_input, onnx_path)

    # TODO: TensorRT conversion of ``onnx_path``.
    return onnx_path

6.2 计算需求

| 配置 | 帧率 | 延迟 |
| --- | --- | --- |
| NVIDIA Orin | 15 fps | 67ms |
| NVIDIA Xavier | 8 fps | 125ms |
| 高通8295 | 10 fps | 100ms |

七、总结

BEVFormer为驾驶员注意力预测提供了新范式:

核心优势:

  • 全局场景理解
  • 时序信息融合
  • 注意力分配预测

应用场景:

  • 驾驶员注意力评估
  • 风险区域预警
  • L2+辅助驾驶

参考来源:

  • “BEVFormer: Learning Bird’s-Eye-View Representation from Multi-Camera Images”
  • “DriveTransformer: Unified Transformer for Scalable End-to-End Autonomous Driving”
  • DR(eye)VE Dataset

相关文章: