ToF相机3D姿态识别:96%准确率的危险行为检测

论文信息

  • 标题: In-vehicle 3D vision for perceiving dangerous driving behaviors
  • 期刊: Scientific Reports, 2026
  • 作者: Wuhuan Li et al.
  • DOI: 10.1038/s41598-026-52381-2
  • 链接: https://pubmed.ncbi.nlm.nih.gov/42106470/

核心创新

使用ToF(Time-of-Flight)深度相机实现隐私保护+高鲁棒性的车内3D姿态估计,96.02%姿态准确率、98%行为识别准确率。

技术亮点

特性 RGB方案 IR方案 ToF方案(本文)
隐私保护
光照鲁棒 优秀
3D信息 需要多相机 需要标定 原生3D
计算成本

方法详解

1. 系统架构

┌─────────────────────────────────────────────────────────┐
│ ToF 3D姿态识别系统 │
├─────────────────────────────────────────────────────────┤
│ ToF相机 → 深度图像 → 3D关键点回归 → ST-GCN++ → 行为分类│
│ (双视角) (隐私安全) (16个关键点) (图神经网络) │
│ ↓ ↓ ↓ │
│ 27-28 FPS 96%准确率 98%准确率 │
└─────────────────────────────────────────────────────────┘

2. 3D关键点检测

"""
基于Anchor的3D关键点回归

检测16个驾驶员关键点
"""

import torch
import torch.nn as nn
import numpy as np
from typing import Tuple, List


# Names of the 16 driver keypoints. The position of a name in this list is
# the joint index used by the pose estimator output and the skeleton graph.
KEYPOINTS = [
    # head / neck
    'head_top',
    'neck',
    # right arm
    'right_shoulder',
    'right_elbow',
    'right_wrist',
    # left arm
    'left_shoulder',
    'left_elbow',
    'left_wrist',
    # right leg
    'right_hip',
    'right_knee',
    'right_ankle',
    # left leg
    'left_hip',
    'left_knee',
    'left_ankle',
    # face / torso root
    'nose',
    'pelvis',
]


class AnchorBased3DPoseEstimator(nn.Module):
    """Anchor-based 3D keypoint regressor for depth images.

    A small convolutional backbone (total stride 16) produces a feature map.
    Two 1x1 heads then predict, for every anchor at every feature-map cell,
    a coarse 3D position per keypoint plus one anchor score (``anchor_head``)
    and a refinement offset per keypoint (``offset_head``). The final pose is
    the score-weighted average of all anchor candidates.

    The predicted coordinates are in the network's own output units; no
    camera intrinsics are applied here.
    """

    def __init__(
        self,
        num_keypoints: int = 16,
        num_anchors: int = 9,
        depth_channels: int = 1
    ):
        """
        Args:
            num_keypoints: Number of keypoints predicted per image.
            num_anchors: Number of anchors per feature-map cell.
            depth_channels: Number of channels of the input depth image.
        """
        super().__init__()

        self.num_keypoints = num_keypoints
        self.num_anchors = num_anchors

        # Lightweight feature extractor: stem conv + three down-sampling stages.
        self.backbone = nn.Sequential(
            nn.Conv2d(depth_channels, 32, 3, stride=2, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            self._make_residual_block(32, 64, stride=2),
            self._make_residual_block(64, 128, stride=2),
            self._make_residual_block(128, 256, stride=2),
        )

        # Per anchor: (num_keypoints * 3) coarse coordinates + 1 anchor score.
        self.anchor_head = nn.Conv2d(256, num_anchors * (num_keypoints * 3 + 1), 1)

        # Per anchor: (num_keypoints * 3) refinement offsets.
        self.offset_head = nn.Conv2d(256, num_anchors * num_keypoints * 3, 1)

    def _make_residual_block(
        self,
        in_channels: int,
        out_channels: int,
        stride: int = 1
    ) -> nn.Module:
        """Build one backbone stage: two 3x3 conv + BatchNorm + ReLU layers.

        NOTE: despite the name, this stage has no skip connection; it is a
        plain stack of two convolutions. The first convolution applies
        ``stride``.
        """
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, 3, stride=stride, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, 3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True)
        )

    def forward(self, depth_image: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Estimate 3D keypoints from a batch of depth images.

        Args:
            depth_image: Depth images of shape [B, depth_channels, H, W].

        Returns:
            keypoints_3d: Keypoints of shape [B, num_keypoints, 3].
            confidence: Confidence in [0, 1] of shape [B, num_keypoints].
        """
        features = self.backbone(depth_image)
        anchor_pred = self.anchor_head(features)
        offset_pred = self.offset_head(features)
        return self._decode_keypoints(anchor_pred, offset_pred)

    def _decode_keypoints(
        self,
        anchor_pred: torch.Tensor,
        offset_pred: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Decode the head outputs into one pose per image.

        Every (anchor, cell) pair proposes a candidate pose
        ``coarse + offset``. Candidates are combined with a softmax over the
        anchor scores, which keeps the decoding differentiable so both heads
        and the backbone receive gradients.

        Args:
            anchor_pred: [B, A * (K * 3 + 1), H, W] output of ``anchor_head``.
            offset_pred: [B, A * K * 3, H, W] output of ``offset_head``.

        Returns:
            keypoints_3d: [B, K, 3] score-weighted average of all candidates.
            confidence: [B, K]; the sigmoid of the highest anchor score,
                repeated for every keypoint because the head predicts one
                score per anchor rather than per keypoint.
        """
        batch_size, _, height, width = anchor_pred.shape
        num_anchors = self.num_anchors
        num_keypoints = self.num_keypoints

        # Channels are laid out anchor-major: [A, K * 3 + 1].
        per_anchor = anchor_pred.reshape(
            batch_size, num_anchors, num_keypoints * 3 + 1, height, width
        )
        coarse = per_anchor[:, :, :-1].reshape(
            batch_size, num_anchors, num_keypoints, 3, height, width
        )
        scores = per_anchor[:, :, -1].reshape(batch_size, -1)  # [B, A*H*W]
        offsets = offset_pred.reshape(
            batch_size, num_anchors, num_keypoints, 3, height, width
        )

        # Flatten (A, H, W) into one candidate axis, in the same order as scores.
        candidates = (coarse + offsets).permute(0, 1, 4, 5, 2, 3).reshape(
            batch_size, -1, num_keypoints, 3
        )  # [B, A*H*W, K, 3]

        weights = torch.softmax(scores, dim=1)
        keypoints_3d = (weights[:, :, None, None] * candidates).sum(dim=1)

        best_score = torch.sigmoid(scores).max(dim=1).values  # [B]
        confidence = best_score.unsqueeze(1).repeat(1, num_keypoints)

        return keypoints_3d, confidence


class STGCNPlusPlus(nn.Module):
    """Skeleton-based action classifier built from spatio-temporal graph blocks.

    A sequence of 3D poses is passed through four ``STGCNBlock`` layers that
    share one normalized skeleton adjacency matrix, averaged over time, and
    classified with a two-layer MLP.
    """

    def __init__(
        self,
        num_joints: int = 16,
        num_classes: int = 10,
        num_frames: int = 30
    ):
        """
        Args:
            num_joints: Number of skeleton joints per frame.
            num_classes: Number of behavior classes to predict.
            num_frames: Nominal sequence length. Not used by the network
                (time is mean-pooled, so any length works); kept for
                interface compatibility.
        """
        super().__init__()

        self.num_joints = num_joints

        # Registered as a non-persistent buffer so it follows .to()/.cuda()
        # with the module without adding a key to the state_dict.
        self.register_buffer(
            "adj", self._build_skeleton_graph(num_joints), persistent=False
        )

        self.st_gcn_layers = nn.ModuleList([
            STGCNBlock(3, 64),
            STGCNBlock(64, 64),
            STGCNBlock(64, 128),
            STGCNBlock(128, 256)
        ])

        self.classifier = nn.Sequential(
            nn.Linear(256 * num_joints, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )

    def _build_skeleton_graph(self, num_joints: int) -> torch.Tensor:
        """Build the row-normalized skeleton adjacency matrix.

        Joint indices follow the KEYPOINTS ordering of this file
        (0 head_top, 1 neck, 2-4 right arm, 5-7 left arm, 8-10 right leg,
        11-13 left leg, 14 nose, 15 pelvis). Edges whose endpoints fall
        outside ``num_joints`` are skipped.

        Returns:
            Tensor [num_joints, num_joints]; symmetric connectivity plus
            self-loops, with each row divided by its sum.
        """
        edges = [
            (0, 14),                          # head_top - nose
            (0, 1),                           # head_top - neck
            (1, 2), (2, 3), (3, 4),           # neck - right arm
            (1, 5), (5, 6), (6, 7),           # neck - left arm
            (1, 15),                          # neck - pelvis (spine)
            (15, 8), (8, 9), (9, 10),         # pelvis - right leg
            (15, 11), (11, 12), (12, 13),     # pelvis - left leg
        ]

        adj = torch.zeros(num_joints, num_joints)
        for i, j in edges:
            if i < num_joints and j < num_joints:
                adj[i, j] = 1
                adj[j, i] = 1

        # Self-loops guarantee every row sum is at least 1 before dividing.
        adj += torch.eye(num_joints)

        degree = adj.sum(dim=1, keepdim=True)
        return adj / degree

    def forward(self, pose_sequence: torch.Tensor) -> torch.Tensor:
        """Classify a batch of pose sequences.

        Args:
            pose_sequence: Poses of shape [B, T, J, 3] with J == num_joints.

        Returns:
            Logits of shape [B, num_classes].

        Raises:
            ValueError: If the input is not [B, T, num_joints, 3].
        """
        if (
            pose_sequence.dim() != 4
            or pose_sequence.size(2) != self.num_joints
            or pose_sequence.size(3) != 3
        ):
            raise ValueError(
                f"expected pose_sequence of shape [B, T, {self.num_joints}, 3], "
                f"got {tuple(pose_sequence.shape)}"
            )

        # [B, T, J, 3] -> [B, C=3, T, J]
        x = pose_sequence.permute(0, 3, 1, 2)

        for st_gcn in self.st_gcn_layers:
            x = st_gcn(x, self.adj)

        # Average over time, then flatten [B, C, J] -> [B, C * J].
        x = x.mean(dim=2).flatten(1)

        return self.classifier(x)


class STGCNBlock(nn.Module):
    """One spatio-temporal graph convolution block.

    Spatial step: features are aggregated over neighbouring joints with the
    adjacency matrix, then mixed across channels with a 1x1 convolution.
    Temporal step: a 9-frame convolution along time followed by BatchNorm.
    A residual branch (1x1 conv when the channel count changes, identity
    otherwise) is added before the final ReLU.
    """

    def __init__(self, in_channels: int, out_channels: int):
        """
        Args:
            in_channels: Channels of the input feature map.
            out_channels: Channels of the output feature map.
        """
        super().__init__()

        # Channel mixing applied after neighbourhood aggregation.
        self.gcn = nn.Conv2d(in_channels, out_channels, 1)

        # Kernel (9, 1) with padding (4, 0) keeps the time length unchanged.
        self.tcn = nn.Sequential(
            nn.Conv2d(out_channels, out_channels, (9, 1), padding=(4, 0)),
            nn.BatchNorm2d(out_channels)
        )

        self.residual = (
            nn.Conv2d(in_channels, out_channels, 1)
            if in_channels != out_channels
            else nn.Identity()
        )

        self.relu = nn.ReLU(inplace=True)

    def forward(self, x: torch.Tensor, adj: torch.Tensor) -> torch.Tensor:
        """Apply the block.

        Args:
            x: Input features of shape [B, C, T, J].
            adj: Adjacency matrix of shape [J, J]; row k holds the weights
                with which joint k aggregates the other joints.

        Returns:
            Output features of shape [B, out_channels, T, J].
        """
        adj = adj.to(device=x.device, dtype=x.dtype)

        # out[..., k] = sum_j adj[k, j] * x[..., j]
        x_agg = torch.matmul(x, adj.t())
        x_gcn = self.gcn(x_agg)

        x_tcn = self.tcn(x_gcn)

        return self.relu(x_tcn + self.residual(x))


# End-to-end dangerous-behavior detection pipeline.
class DangerousBehaviorDetector(nn.Module):
    """Dangerous driving behavior detector.

    Runs the 3D pose estimator on every frame of a depth sequence and feeds
    the resulting pose sequence to the skeleton-based classifier.

    The classifier distinguishes 10 behaviors (see ``behavior_names``):
    reaching for an object, looking back at the rear seats, bending
    sideways, hands off the steering wheel, phone use, eating, drinking,
    smoking, yawning, and violent shaking.
    """

    def __init__(self):
        super().__init__()

        # Per-frame 3D pose estimation.
        self.pose_estimator = AnchorBased3DPoseEstimator()

        # Sequence-level behavior classification.
        self.behavior_classifier = STGCNPlusPlus(num_classes=10)

        # Class index -> label; order must match the classifier's outputs.
        self.behavior_names = [
            'reaching', 'looking_back', 'bending', 'hands_off',
            'phone_use', 'eating', 'drinking', 'smoking',
            'yawning', 'shaking'
        ]

    def forward(self, depth_sequence: torch.Tensor) -> Tuple[torch.Tensor, List[str]]:
        """Compute behavior logits for a batch of depth sequences.

        Args:
            depth_sequence: Depth frames of shape [B, T, 1, H, W] with T >= 1.

        Returns:
            behavior_pred: Logits of shape [B, 10].
            behavior_names: The class labels, indexed like the logits.

        Raises:
            ValueError: If the input is not 5-dimensional or has no frames.
        """
        if depth_sequence.dim() != 5:
            raise ValueError(
                "expected depth_sequence of shape [B, T, 1, H, W], "
                f"got {tuple(depth_sequence.shape)}"
            )
        num_frames = depth_sequence.size(1)
        if num_frames == 0:
            raise ValueError("depth_sequence must contain at least one frame")

        # Frames are processed one at a time; the confidence output is unused.
        poses = [
            self.pose_estimator(depth_sequence[:, t])[0]
            for t in range(num_frames)
        ]
        pose_sequence = torch.stack(poses, dim=1)  # [B, T, 16, 3]

        behavior_pred = self.behavior_classifier(pose_sequence)

        return behavior_pred, self.behavior_names

    def detect(self, depth_sequence: torch.Tensor, threshold: float = 0.5) -> dict:
        """Run inference and summarize the top prediction per sample.

        Inference is executed in eval mode (Dropout disabled, BatchNorm using
        running statistics) without gradient tracking; the module's previous
        train/eval mode is restored afterwards.

        Args:
            depth_sequence: Depth frames of shape [B, T, 1, H, W].
            threshold: Probability above which a prediction counts as
                dangerous.

        Returns:
            A dict with:
                'behaviors': top-1 label for each sample,
                'confidences': top-1 softmax probability for each sample,
                'is_dangerous': True if any sample's top-1 probability
                    exceeds ``threshold``.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                logits, names = self(depth_sequence)
                probs = torch.softmax(logits, dim=-1)
        finally:
            # Never leave the caller's model stuck in eval mode.
            self.train(was_training)

        top_probs, top_idx = probs.max(dim=-1)
        behaviors = [names[i] for i in top_idx.tolist()]
        confidences = top_probs.tolist()

        is_dangerous = any(c > threshold for c in confidences)

        return {
            'behaviors': behaviors,
            'confidences': confidences,
            'is_dangerous': is_dangerous
        }


# Demo / smoke run of the full pipeline on random input.
def main():
    """Build the detector, run it on a random depth sequence, print a report.

    The cost, latency and FPS lines at the end are fixed figures quoted by
    the article; they are not measured by this function.
    """
    model = DangerousBehaviorDetector()

    # Synthetic input: 2 sequences of 30 single-channel 240x320 frames.
    batch_size, num_frames = 2, 30
    depth_sequence = torch.randn(batch_size, num_frames, 1, 240, 320)

    banner = "=" * 60
    print(banner)
    print("ToF 3D姿态识别系统")
    print(banner)
    print(f"输入: {depth_sequence.shape}")
    print(f"帧数: {num_frames}")
    print("关键点数: 16")

    result = model.detect(depth_sequence)
    formatted_confidences = [f'{c:.2f}' for c in result['confidences']]
    print(f"\n检测到的行为: {result['behaviors']}")
    print(f"置信度: {formatted_confidences}")
    print(f"是否危险: {result['is_dangerous']}")

    print("\n计算成本: ~1.49 G FLOPs")
    print("推理延迟: ~37.5 ms/sample")
    print("实时性能: 27-28 FPS")


if __name__ == "__main__":
    main()

3. 10种危险行为定义

# Dangerous-behavior scenario definitions.


def _behavior_entry(description, key_indicators, risk_level, euro_ncap_scenario=None):
    """Return one behavior record with the four fields used by the table below."""
    return {
        'description': description,
        'key_indicators': key_indicators,
        'risk_level': risk_level,
        'euro_ncap_scenario': euro_ncap_scenario,
    }


# Maps each behavior label to its description, observable indicators,
# risk level ('low' / 'medium' / 'high') and an optional scenario
# identifier string (None when no scenario is listed).
DANGEROUS_BEHAVIORS = {
    'reaching': _behavior_entry(
        '伸手取物', ['手臂伸展超过阈值', '身体前倾'], 'medium'),
    'looking_back': _behavior_entry(
        '回头看后排', ['头部转角>60°', '躯干扭转'], 'high', 'D-05'),
    'bending': _behavior_entry(
        '侧身弯腰', ['躯干倾斜角>30°', '手接近地板'], 'high'),
    'hands_off': _behavior_entry(
        '双手离方向盘', ['双手距离方向盘>30cm'], 'high', 'D-06'),
    'phone_use': _behavior_entry(
        '使用手机', ['手持物体靠近头部', '低头看手中物体'], 'high', 'D-02, D-03'),
    'eating': _behavior_entry(
        '吃东西', ['手靠近嘴部', '咀嚼动作'], 'medium'),
    'drinking': _behavior_entry(
        '喝水', ['手举起杯子', '仰头动作'], 'medium'),
    'smoking': _behavior_entry(
        '吸烟', ['手持烟状物体', '手靠近嘴部'], 'medium'),
    'yawning': _behavior_entry(
        '打哈欠(疲劳指标)', ['嘴部张大', '持续时间>2秒'], 'low', 'F-01'),
    'shaking': _behavior_entry(
        '剧烈晃动', ['身体不规则运动', '频率>2Hz'], 'high'),
}

实验结果

性能指标

指标 结果
3D姿态准确率 96.02%
行为识别准确率 98.0%
计算成本 1.49 G FLOPs
推理延迟 37.5 ms/sample
实时性能 27-28 FPS

各行为识别准确率

# Per-behavior recognition accuracy in percent (the article labels these
# figures as data from the paper).
behavior_accuracy = dict(
    reaching=98.5,
    looking_back=99.2,
    bending=97.8,
    hands_off=98.9,
    phone_use=99.5,
    eating=96.3,
    drinking=97.1,
    smoking=95.8,
    yawning=98.7,
    shaking=98.0,
)

# Unweighted mean over all behaviors.
mean_accuracy = sum(behavior_accuracy.values()) / len(behavior_accuracy)

print("各行为识别准确率:")
for name, value in behavior_accuracy.items():
    print(f" {name}: {value:.1f}%")
print(f"\n平均准确率: {mean_accuracy:.1f}%")

IMS开发启示

1. ToF相机选型

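型号 厂商 分辨率 帧率 适用场景
SR305 Intel 640x480 30fps 开发测试
D455 Intel 1280x720 30fps 车规前验证
ARS548 Bosch - 30fps 车规级

注:上表型号需核实后再用于选型。SR305 采用编码结构光,D455 采用主动红外双目,二者均为深度相机但并非严格意义上的 ToF;ARS548 是 Continental 的毫米波雷达,而非 Bosch 的 ToF 相机。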

2. 系统集成方案

┌─────────────────────────────────────────────────────────┐
│ ToF-IMS集成架构 │
├─────────────────────────────────────────────────────────┤
│ ToF模块 → 边缘计算单元 → 安全控制器 │
│ (D455) (TDA4VM) (ASIL-B) │
│ ↓ ↓ ↓ │
│ 深度图像 3D姿态+行为 报警/干预 │
│ 27fps 1.49 GFLOPs 分级响应 │
└─────────────────────────────────────────────────────────┘

3. 开发检查清单

检查项 要求 状态
ToF选型 车规级
3D姿态准确率 ≥95%
行为识别准确率 ≥97%
实时性能 ≥25fps
隐私合规 无面部图像

参考资料

  1. 论文: In-vehicle 3D vision for perceiving dangerous driving behaviors
  2. ST-GCN: Spatial Temporal Graph Convolutional Networks
  3. Intel RealSense: ToF Camera Documentation

https://dapalm.com/2026/06/07/2026-06-07-3D-Pose-Dangerous-Behavior-Detection/
作者
Mars
发布于
2026年6月7日
许可协议