驾驶员视线追踪技术详解:注意力机制实现高精度分心检测

驾驶员视线追踪技术详解:注意力机制实现高精度分心检测

来源: Springer Nature + ScienceDirect
发布时间: 2026年4月
核心价值: 视线追踪是Euro NCAP 2026分心检测的核心技术


核心洞察

视线追踪技术指标:

指标 传统方案 注意力机制方案
角度精度 5-10° 1-3°
头部自由度 限制大 高自由度
实时性 50-100ms <30ms
遮挡鲁棒 中等 强

Euro NCAP 2026分心检测要求:

  • 视线偏离道路≥3秒触发警告
  • 手机使用检测(腿部/耳边)
  • 需要眼动追踪直接监测

一、技术原理

1.1 视线估计流程

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
视线估计流程

├── 输入:面部图像

├── 1. 人脸检测
│ └── 关键点定位(68点)

├── 2. 眼部区域提取
│ ├── 左眼ROI
│ └── 右眼ROI

├── 3. 头部姿态估计
│ └── (yaw, pitch, roll)

├── 4. 视线方向估计
│ ├── 瞳孔检测
│ ├── 虹膜边缘
│ └── 视线向量

└── 输出:(yaw, pitch) 或 视线区域

1.2 视线区域定义

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
"""
驾驶员视线区域定义
Euro NCAP标准
"""

from enum import Enum
from dataclasses import dataclass
from typing import Tuple, Optional
import numpy as np

class GazeZone(Enum):
"""视线区域"""
FORWARD = "forward" # 前方道路
LEFT_MIRROR = "left_mirror" # 左后视镜
RIGHT_MIRROR = "right_mirror" # 右后视镜
REAR_MIRROR = "rear_mirror" # 车内后视镜
INSTRUMENT = "instrument" # 仪表盘
CENTER_CONSOLE = "console" # 中控
PASSENGER = "passenger" # 副驾驶侧
DOWN = "down" # 下方
UNKNOWN = "unknown" # 未识别

@dataclass
class GazeVector:
"""视线向量"""
yaw: float # 水平角度 (-90° to 90°)
pitch: float # 垂直角度 (-90° to 90°)
confidence: float # 置信度 (0-1)

class GazeZoneClassifier:
"""
视线区域分类器

基于视线角度和头部姿态判断视线区域
"""

def __init__(self):
# 区域定义(角度范围)
self.zone_ranges = {
GazeZone.FORWARD: {
'yaw': (-15, 15),
'pitch': (-10, 10)
},
GazeZone.LEFT_MIRROR: {
'yaw': (-45, -15),
'pitch': (-5, 15)
},
GazeZone.RIGHT_MIRROR: {
'yaw': (15, 45),
'pitch': (-5, 15)
},
GazeZone.REAR_MIRROR: {
'yaw': (-15, 15),
'pitch': (10, 30)
},
GazeZone.INSTRUMENT: {
'yaw': (-15, 15),
'pitch': (-30, -10)
},
GazeZone.CENTER_CONSOLE: {
'yaw': (-30, 30),
'pitch': (-45, -30)
},
GazeZone.PASSENGER: {
'yaw': (30, 60),
'pitch': (-20, 20)
},
GazeZone.DOWN: {
'yaw': (-30, 30),
'pitch': (-60, -45)
},
}

def classify(self, gaze: GazeVector,
head_yaw: float = 0,
head_pitch: float = 0) -> Tuple[GazeZone, float]:
"""
分类视线区域

Args:
gaze: 视线向量
head_yaw: 头部水平角度
head_pitch: 头部垂直角度

Returns:
(zone, confidence)
"""
# 补偿头部姿态
compensated_yaw = gaze.yaw + head_yaw * 0.5
compensated_pitch = gaze.pitch + head_pitch * 0.5

# 匹配区域
best_zone = GazeZone.UNKNOWN
best_score = 0

for zone, ranges in self.zone_ranges.items():
yaw_in = ranges['yaw'][0] <= compensated_yaw <= ranges['yaw'][1]
pitch_in = ranges['pitch'][0] <= compensated_pitch <= ranges['pitch'][1]

if yaw_in and pitch_in:
# 计算距离中心的得分
yaw_center = (ranges['yaw'][0] + ranges['yaw'][1]) / 2
pitch_center = (ranges['pitch'][0] + ranges['pitch'][1]) / 2

yaw_dist = abs(compensated_yaw - yaw_center)
pitch_dist = abs(compensated_pitch - pitch_center)

score = 1 / (1 + yaw_dist + pitch_dist)

if score > best_score:
best_score = score
best_zone = zone

return best_zone, best_score * gaze.confidence

def is_eyes_on_road(self, zone: GazeZone) -> bool:
"""判断是否注视道路"""
road_zones = {
GazeZone.FORWARD,
GazeZone.LEFT_MIRROR,
GazeZone.RIGHT_MIRROR,
GazeZone.REAR_MIRROR
}
return zone in road_zones


# 实际测试
if __name__ == "__main__":
classifier = GazeZoneClassifier()

# 测试场景
test_cases = [
GazeVector(yaw=0, pitch=0, confidence=0.9), # 正前方
GazeVector(yaw=-30, pitch=5, confidence=0.85), # 左后视镜
GazeVector(yaw=30, pitch=5, confidence=0.85), # 右后视镜
GazeVector(yaw=0, pitch=-40, confidence=0.8), # 中控
GazeVector(yaw=40, pitch=0, confidence=0.75), # 副驾驶侧
]

print("=== 视线区域分类测试 ===")
for gaze in test_cases:
zone, conf = classifier.classify(gaze)
is_road = classifier.is_eyes_on_road(zone)
print(f"视线({gaze.yaw:+.0f}°, {gaze.pitch:+.0f}°) → {zone.value:15s} "
f"置信度:{conf:.2f} 道路:{is_road}")

1.3 注意力机制视线估计

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""
基于注意力机制的视线估计网络
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Tuple

class AttentionGazeNet(nn.Module):
"""
注意力机制视线估计网络

架构:
1. 面部特征提取(ResNet)
2. 眼部特征提取(专用分支)
3. 跨模态注意力融合
4. 视线回归头
"""

def __init__(self,
backbone: str = 'resnet18',
pretrained: bool = True):
super().__init__()

# 面部特征提取
if backbone == 'resnet18':
self.face_encoder = torch.hub.load(
'pytorch/vision:v0.10.0',
'resnet18',
pretrained=pretrained
)
feature_dim = 512
else:
raise ValueError(f"Unsupported backbone: {backbone}")

# 移除分类头
self.face_encoder.fc = nn.Identity()

# 眼部特征提取
self.eye_encoder = nn.Sequential(
nn.Conv2d(1, 32, kernel_size=3, stride=2, padding=1),
nn.BatchNorm2d(32),
nn.ReLU(),
nn.MaxPool2d(2),

nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.MaxPool2d(2),

nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
nn.BatchNorm2d(128),
nn.ReLU(),
nn.AdaptiveAvgPool2d(1),
)

# 跨模态注意力
self.cross_attention = nn.MultiheadAttention(
embed_dim=128,
num_heads=4,
batch_first=True
)

# 特征投影
self.face_proj = nn.Linear(feature_dim, 128)
self.eye_proj = nn.Linear(128, 128)

# 视线回归头
self.gaze_regressor = nn.Sequential(
nn.Linear(128 * 3, 256),
nn.ReLU(),
nn.Dropout(0.3),
nn.Linear(256, 64),
nn.ReLU(),
nn.Linear(64, 2) # (yaw, pitch)
)

# 置信度头
self.confidence_head = nn.Sequential(
nn.Linear(128 * 3, 64),
nn.ReLU(),
nn.Linear(64, 1),
nn.Sigmoid()
)

def forward(self,
face_image: torch.Tensor,
left_eye: torch.Tensor,
right_eye: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
"""
前向传播

Args:
face_image: 面部图像 (B, 3, 224, 224)
left_eye: 左眼图像 (B, 1, 64, 64)
right_eye: 右眼图像 (B, 1, 64, 64)

Returns:
gaze: (B, 2) 视线角度 (yaw, pitch)
confidence: (B, 1) 置信度
"""
batch_size = face_image.size(0)

# 1. 提取特征
face_feat = self.face_encoder(face_image) # (B, 512)
left_feat = self.eye_encoder(left_eye).squeeze(-1).squeeze(-1) # (B, 128)
right_feat = self.eye_encoder(right_eye).squeeze(-1).squeeze(-1) # (B, 128)

# 2. 投影到统一维度
face_feat = self.face_proj(face_feat) # (B, 128)
left_feat = self.eye_proj(left_feat)
right_feat = self.eye_proj(right_feat)

# 3. 堆叠为序列
features = torch.stack([face_feat, left_feat, right_feat], dim=1) # (B, 3, 128)

# 4. 跨模态注意力
attended, _ = self.cross_attention(features, features, features)

# 5. 展平
fused = attended.view(batch_size, -1) # (B, 384)

# 6. 回归视线角度
gaze = self.gaze_regressor(fused) # (B, 2)

# 7. 置信度
confidence = self.confidence_head(fused) # (B, 1)

return gaze, confidence

def get_gaze_vector(self,
face_image: torch.Tensor,
left_eye: torch.Tensor,
right_eye: torch.Tensor) -> GazeVector:
"""获取视线向量"""
with torch.no_grad():
gaze, conf = self.forward(face_image, left_eye, right_eye)

return GazeVector(
yaw=float(gaze[0, 0]),
pitch=float(gaze[0, 1]),
confidence=float(conf[0, 0])
)


# 实际测试
if __name__ == "__main__":
model = AttentionGazeNet(backbone='resnet18', pretrained=False)

# 模拟输入
face = torch.randn(4, 3, 224, 224)
left_eye = torch.randn(4, 1, 64, 64)
right_eye = torch.randn(4, 1, 64, 64)

# 前向传播
gaze, conf = model(face, left_eye, right_eye)

print(f"面部图像: {face.shape}")
print(f"眼部图像: {left_eye.shape}")
print(f"视线输出: {gaze.shape}")
print(f"置信度: {conf.shape}")

# 参数统计
total_params = sum(p.numel() for p in model.parameters())
print(f"总参数量: {total_params/1e6:.2f}M")

二、分心检测系统

2.1 Euro NCAP分心场景

场景 描述 检测时限 警告类型
D-01 视线短暂偏离(≤3s) — 不触发
D-02 视线长时间偏离(3-4s) ≤3s 一级警告
D-03 视线极端偏离(≥4s) ≤3s 二级警告
D-04 手机使用(腿部) ≤3s 一级警告
D-05 手机使用(耳边) ≤3s 二级警告

2.2 分心检测器实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
"""
Euro NCAP 2026分心检测器
"""

from dataclasses import dataclass
from typing import Tuple, Optional, List
from collections import deque
from enum import Enum
import time

class DistractionType(Enum):
"""分心类型"""
NONE = "none"
VISUAL = "visual" # 视觉分心
PHONE_LAP = "phone_lap" # 手机(腿部)
PHONE_EAR = "phone_ear" # 手机(耳边)
MANUAL = "manual" # 手动操作
COGNITIVE = "cognitive" # 认知分心

class WarningLevel(Enum):
"""警告等级"""
NONE = 0
LEVEL_1 = 1 # 一级警告
LEVEL_2 = 2 # 二级警告
EMERGENCY = 3 # 紧急

@dataclass
class DistractionEvent:
"""分心事件"""
type: DistractionType
start_time: float
duration: float
gaze_zone: GazeZone
phone_detected: bool
hand_position: Optional[str]

class DistractionDetector:
"""
Euro NCAP 2026分心检测器

检测规则:
1. 视线偏离道路≥3秒触发
2. 手机使用检测
3. 前置4秒道路注视条件
"""

def __init__(self):
# Euro NCAP阈值
self.WARNING_THRESHOLD = 3.0 # 秒
self.CRITICAL_THRESHOLD = 4.0 # 秒
self.FORWARD_REQUIREMENT = 4.0 # 秒

# 状态跟踪
self.forward_gaze_time = 0.0
self.distraction_start: Optional[float] = None
self.current_distraction: Optional[DistractionEvent] = None

# 历史记录
self.event_history: List[DistractionEvent] = []
self.gaze_history = deque(maxlen=300) # 10秒@30fps

# 手机检测状态
self.phone_detected = False
self.hand_position = None

def update(self,
gaze_zone: GazeZone,
phone_detected: bool = False,
hand_position: Optional[str] = None,
dt: float = 0.033) -> Tuple[WarningLevel, Optional[str]]:
"""
更新检测状态

Args:
gaze_zone: 当前视线区域
phone_detected: 是否检测到手机
hand_position: 手部位置
dt: 时间步长

Returns:
(warning_level, warning_type)
"""
current_time = time.time()

# 更新前方注视时间
if gaze_zone == GazeZone.FORWARD:
self.forward_gaze_time += dt

# 如果有正在进行的分心事件,结束它
if self.current_distraction is not None:
self.current_distraction.duration = current_time - self.current_distraction.start_time
self.event_history.append(self.current_distraction)
self.current_distraction = None

self.distraction_start = None
else:
# 检查是否满足前置条件
if self.forward_gaze_time >= self.FORWARD_REQUIREMENT:
# 开始计时分心
if self.distraction_start is None:
self.distraction_start = current_time
self.current_distraction = DistractionEvent(
type=DistractionType.VISUAL,
start_time=current_time,
duration=0.0,
gaze_zone=gaze_zone,
phone_detected=phone_detected,
hand_position=hand_position
)

self.forward_gaze_time = 0.0

# 记录历史
self.gaze_history.append(gaze_zone)

# 手机使用检测
if phone_detected:
return self._handle_phone_use(hand_position, current_time)

# 视觉分心检测
if self.distraction_start is not None:
duration = current_time - self.distraction_start

if duration >= self.CRITICAL_THRESHOLD:
return WarningLevel.LEVEL_2, "CRITICAL_DISTRACTION"
elif duration >= self.WARNING_THRESHOLD:
return WarningLevel.LEVEL_1, "PROLONGED_DISTRACTION"

return WarningLevel.NONE, None

def _handle_phone_use(self,
hand_position: Optional[str],
current_time: float) -> Tuple[WarningLevel, str]:
"""处理手机使用检测"""
if hand_position == 'ear':
# 打电话:立即二级警告
return WarningLevel.LEVEL_2, "PHONE_CALL"
elif hand_position in ['lap', 'texting']:
# 手机使用:一级警告
if self.distraction_start is not None:
duration = current_time - self.distraction_start
if duration >= self.WARNING_THRESHOLD:
return WarningLevel.LEVEL_1, "PHONE_USE"

return WarningLevel.NONE, None

def get_statistics(self) -> dict:
"""获取统计信息"""
if len(self.event_history) == 0:
return {
'total_events': 0,
'total_distraction_time': 0.0,
'avg_duration': 0.0,
}

total_time = sum(e.duration for e in self.event_history)
avg_duration = total_time / len(self.event_history)

return {
'total_events': len(self.event_history),
'total_distraction_time': total_time,
'avg_duration': avg_duration,
}


# 实际测试
if __name__ == "__main__":
detector = DistractionDetector()

print("=== 分心检测测试 ===")

# 场景1:正常驾驶
print("\n[场景1:正常驾驶 5秒]")
for i in range(150): # 5秒
level, warning = detector.update(GazeZone.FORWARD, dt=0.033)
if i % 50 == 0:
print(f" {i//30}s: 前方注视时间={detector.forward_gaze_time:.1f}s")

# 场景2:视线偏离
print("\n[场景2:视线偏离中控 5秒]")
for i in range(150):
level, warning = detector.update(GazeZone.CENTER_CONSOLE, dt=0.033)
if warning:
print(f" {i//30}s: ⚠️ 警告={warning}, 等级={level.name}")

# 场景3:手机使用
print("\n[场景3:手机打电话]")
detector = DistractionDetector()
detector.forward_gaze_time = 5.0 # 满足前置条件
level, warning = detector.update(GazeZone.UNKNOWN,
phone_detected=True,
hand_position='ear',
dt=0.033)
print(f" 结果: 警告={warning}, 等级={level.name}")

# 统计
print("\n=== 统计信息 ===")
stats = detector.get_statistics()
print(f"总事件数: {stats['total_events']}")
print(f"总分心时间: {stats['total_distraction_time']:.1f}s")

三、手机检测

3.1 检测方案

方案 准确率 实时性 鲁棒性
纯视觉 85% 30ms 中等
视觉+姿态 92% 40ms 较好
多模态融合 95% 50ms 最好

3.2 手机检测网络

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
手机使用检测网络
"""

import torch
import torch.nn as nn

class PhoneDetectionNet(nn.Module):
"""
手机使用检测网络

输入:
- 面部图像
- 手部关键点

输出:
- 手机存在概率
- 手部位置类别
"""

def __init__(self, num_hand_keypoints: int = 21):
super().__init__()

# 面部特征提取
self.face_encoder = nn.Sequential(
nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1),
nn.BatchNorm2d(32),
nn.ReLU(),
nn.MaxPool2d(2),

nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.MaxPool2d(2),

nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
nn.BatchNorm2d(128),
nn.ReLU(),
nn.AdaptiveAvgPool2d(1),
)

# 手部关键点编码
self.hand_encoder = nn.Sequential(
nn.Linear(num_hand_keypoints * 3, 128), # x, y, conf
nn.ReLU(),
nn.Dropout(0.3),
nn.Linear(128, 64),
nn.ReLU(),
)

# 融合层
self.fusion = nn.Sequential(
nn.Linear(128 + 64, 128),
nn.ReLU(),
nn.Dropout(0.3),
)

# 手机检测头
self.phone_head = nn.Sequential(
nn.Linear(128, 32),
nn.ReLU(),
nn.Linear(32, 1),
nn.Sigmoid()
)

# 手部位置分类头
self.position_head = nn.Sequential(
nn.Linear(128, 32),
nn.ReLU(),
nn.Linear(32, 4), # [none, lap, ear, texting]
)

def forward(self,
face_image: torch.Tensor,
hand_keypoints: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
"""
前向传播

Args:
face_image: (B, 3, H, W)
hand_keypoints: (B, 21, 3)

Returns:
phone_prob: (B, 1)
position_logits: (B, 4)
"""
# 提取特征
face_feat = self.face_encoder(face_image).squeeze(-1).squeeze(-1)
hand_feat = self.hand_encoder(hand_keypoints.view(hand_keypoints.size(0), -1))

# 融合
fused = torch.cat([face_feat, hand_feat], dim=1)
fused = self.fusion(fused)

# 输出
phone_prob = self.phone_head(fused)
position_logits = self.position_head(fused)

return phone_prob, position_logits


# 实际测试
if __name__ == "__main__":
model = PhoneDetectionNet(num_hand_keypoints=21)

# 模拟输入
face = torch.randn(4, 3, 128, 128)
hand = torch.randn(4, 21, 3)

phone_prob, pos_logits = model(face, hand)

print(f"手机概率: {phone_prob.shape}")
print(f"位置分类: {pos_logits.shape}")

四、IMS开发建议

4.1 技术选型

需求 推荐方案 理由
高精度视线 注意力机制网络 精度高、鲁棒性强
实时部署 MobileNet骨干+INT8量化 低延迟
手机检测 视觉+手部关键点融合 准确率高

4.2 Euro NCAP合规检查

检查项 要求 验证方法
[ ] 视线偏离检测 ≥3秒触发 台架测试
[ ] 手机使用检测 ≤3秒警告 实车测试
[ ] 误报率 <5% 长期测试
[ ] 遮挡鲁棒 墨镜/口罩 极端场景测试

五、总结

5.1 核心要点

  1. 视线追踪是Euro NCAP 2026核心技术
  2. 注意力机制提升精度和鲁棒性
  3. 手机检测需多模态融合
  4. 实时部署需要模型优化

5.2 技术趋势

  • 3D视线估计
  • 多任务学习(疲劳+分心+视线)
  • 跨域泛化

参考链接:

  • Springer: Driver Cognitive Distraction Detection
  • ScienceDirect: Multi-task driver gaze estimation
  • Euro NCAP 2026 Protocol

驾驶员视线追踪技术详解:注意力机制实现高精度分心检测
https://dapalm.com/2026/04/24/2026-04-24-gaze-tracking-distraction-detection/
作者
Mars
发布于
2026年4月24日
许可协议