深度学习眼动追踪:瞳孔检测与视线估计论文解读

深度学习眼动追踪:瞳孔检测与视线估计论文解读

发布时间: 2026-06-15
标签: 论文解读, 眼动追踪, 瞳孔检测, 视线估计, DMS
来源: arXiv 2403.19768, MDPI Applied Sciences


论文信息

  • 标题: Using Deep Learning to Increase Eye-Tracking Robustness, Accuracy, and Precision in Virtual Reality
  • 作者: Pupil Labs 研究团队
  • 发表: arXiv:2403.19768 (2024年3月)
  • 链接: https://arxiv.org/abs/2403.19768

核心贡献

本文系统性评估了深度学习方法对眼动追踪精度的影响,主要包括:

  1. 特征检测模型对最终视线估计的贡献
  2. 基于特征 vs 基于模型的视线估计对比
  3. VR 场景下的鲁棒性验证

眼动追踪技术栈

graph LR
    A[眼部图像] --> B[人脸/眼部检测]
    B --> C[瞳孔定位]
    C --> D[特征提取]
    D --> E[视线估计]
    E --> F[3D 视线向量]

瞳孔检测算法

1. 传统方法 vs 深度学习

方法 精度 速度 鲁棒性
边缘检测 + Hough
Daugman 算子
CNN 分割
YOLO 检测

2. 深度学习瞳孔检测实现

"""
深度学习瞳孔检测与视线估计
基于论文方法复现
"""

from typing import Dict, List, Optional, Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

class PupilDetector(nn.Module):
    """Lightweight CNN that localises the pupil in a grayscale eye crop.

    A shared four-stage strided encoder (overall stride 16) feeds three heads:
    a pupil-centre heatmap, a 5-value ellipse regressor and a transposed-conv
    segmentation decoder.

    Args:
        config: Configuration mapping. Accepted for interface parity with the
            other models in this file; no key is read from it here.
    """

    def __init__(self, config: Dict):
        super().__init__()

        # Shared encoder: every 3x3 stride-2 convolution halves H and W.
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 32, 3, stride=2, padding=1),     # 1/2
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(32, 64, 3, stride=2, padding=1),    # 1/4
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(64, 128, 3, stride=2, padding=1),   # 1/8
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.Conv2d(128, 256, 3, stride=2, padding=1),  # 1/16
            nn.BatchNorm2d(256),
            nn.ReLU(),
        )

        # Pupil-centre head: one-channel heatmap squashed into (0, 1).
        self.center_head = nn.Sequential(
            nn.Conv2d(256, 128, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 1, 1),
            nn.Sigmoid(),
        )

        # Ellipse head: global pooling followed by a linear regressor.
        self.ellipse_head = nn.Sequential(
            nn.Conv2d(256, 64, 3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Linear(64, 5),  # cx, cy, a, b, angle
        )

        # Segmentation head: four x2 transposed convolutions undo the encoder
        # stride, so the mask matches the input when H and W are multiples
        # of 16.
        self.segmentation_head = nn.Sequential(
            nn.ConvTranspose2d(256, 128, 4, stride=2, padding=1),  # 1/8
            nn.ReLU(),
            nn.ConvTranspose2d(128, 64, 4, stride=2, padding=1),   # 1/4
            nn.ReLU(),
            nn.ConvTranspose2d(64, 32, 4, stride=2, padding=1),    # 1/2
            nn.ReLU(),
            nn.ConvTranspose2d(32, 1, 4, stride=2, padding=1),     # full size
            nn.Sigmoid(),
        )

    def forward(self, eye_image: torch.Tensor) -> Dict:
        """Run all three heads on a batch of eye crops.

        Args:
            eye_image: Grayscale eye images of shape (B, 1, H, W).

        Returns:
            A dict with the keys:
              * ``'center'``: (B, 2) normalised ``[x, y]`` of the heatmap peak.
              * ``'ellipse'``: (B, 5) raw ellipse regression output.
              * ``'segmentation'``: (B, 1, H, W) pupil mask in (0, 1), for
                H and W that are multiples of 16.
              * ``'heatmap'``: (B, 1, H/16, W/16) centre heatmap in (0, 1).
        """
        features = self.encoder(eye_image)

        center_heatmap = self.center_head(features)
        ellipse_params = self.ellipse_head(features)
        segmentation = self.segmentation_head(features)

        batch_size = eye_image.shape[0]
        center_coords = self._extract_center(center_heatmap, batch_size)

        return {
            'center': center_coords,
            'ellipse': ellipse_params,
            'segmentation': segmentation,
            'heatmap': center_heatmap,
        }

    def _extract_center(self, heatmap: torch.Tensor, batch_size: int) -> torch.Tensor:
        """Convert the heatmap peak into normalised ``[x, y]`` coordinates.

        The peak is located with ``argmax``, so the result is not
        differentiable with respect to the heatmap.

        Args:
            heatmap: Centre heatmap of shape (B, 1, h, w).
            batch_size: Number of samples B in ``heatmap``.

        Returns:
            Tensor of shape (B, 2) holding ``[x, y]`` strictly inside (0, 1).
        """
        h, w = heatmap.shape[2], heatmap.shape[3]

        # reshape (rather than view) also accepts non-contiguous heatmaps.
        flat = heatmap.reshape(batch_size, -1)
        max_idx = flat.argmax(dim=1)

        # Flat index -> (row, col). The +0.5 reports the centre of the winning
        # cell instead of its top-left corner, removing a half-cell bias.
        y = ((max_idx // w).float() + 0.5) / h
        x = ((max_idx % w).float() + 0.5) / w

        return torch.stack([x, y], dim=1)


class GazeEstimator(nn.Module):
    """Appearance-based gaze regressor fusing an eye crop with head pose.

    The network outputs two values per sample, labelled ``[yaw, pitch]`` in
    this file; it does not output a 3-D unit vector.

    Args:
        config: Configuration mapping. Accepted for interface parity with the
            other models in this file; no key is read from it here.
    """

    def __init__(self, config: Dict):
        super().__init__()

        # Eye branch: three strided convolutions, then global average pooling
        # to a 128-dim descriptor that is independent of the crop size.
        self.eye_encoder = nn.Sequential(
            nn.Conv2d(1, 32, 5, stride=2, padding=2),
            nn.ReLU(),
            nn.Conv2d(32, 64, 3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 128, 3, stride=2, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
        )

        # Head-pose branch: small MLP lifting 3 angles to a 64-dim embedding.
        self.head_encoder = nn.Sequential(
            nn.Linear(3, 32),  # pitch, yaw, roll
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
        )

        # Regressor over the concatenated 128 + 64 features.
        self.gaze_regressor = nn.Sequential(
            nn.Linear(128 + 64, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 2),  # yaw, pitch
        )

    def forward(self,
                eye_image: torch.Tensor,
                head_pose: torch.Tensor) -> torch.Tensor:
        """Predict gaze angles for a batch of eye crops.

        Args:
            eye_image: Grayscale eye images of shape (B, 1, H, W).
            head_pose: Head pose of shape (B, 3), ordered
                ``[pitch, yaw, roll]``; the original notes give the unit as
                degrees.

        Returns:
            Tensor of shape (B, 2) ordered ``[yaw, pitch]``; the original
            notes give the unit as degrees.
        """
        eye_features = self.eye_encoder(eye_image)
        head_features = self.head_encoder(head_pose)

        # Late fusion: concatenate both descriptors along the feature axis.
        combined = torch.cat([eye_features, head_features], dim=1)

        return self.gaze_regressor(combined)


class EyeTrackerPipeline:
    """End-to-end eye-tracking pipeline for a single camera frame.

    Chains face detection, eye localisation, pupil detection and gaze
    estimation. Face detection and eye localisation are fixed-geometry
    placeholders here.

    Args:
        config: Configuration mapping forwarded to both networks.
    """

    def __init__(self, config: Dict):
        # Inference only: eval mode disables Dropout and makes BatchNorm use
        # its running statistics instead of per-batch statistics.
        self.pupil_detector = PupilDetector(config).eval()
        self.gaze_estimator = GazeEstimator(config).eval()

        # Load pretrained weights here for a real deployment:
        # self.pupil_detector.load_state_dict(...)
        # self.gaze_estimator.load_state_dict(...)

    def process_frame(self, frame: np.ndarray) -> Dict:
        """Run the full pipeline on one frame.

        Args:
            frame: Input image of shape (H, W, 3) in BGR order.

        Returns:
            A dict with ``'left_pupil'`` / ``'right_pupil'`` (dicts holding
            ``'center'`` and ``'ellipse'``), ``'left_gaze'`` /
            ``'right_gaze'`` and the fused ``'final_gaze'``.
        """
        # 1. Face detection (placeholder).
        face_bbox = self._detect_face(frame)

        # 2. Eye localisation (placeholder).
        left_eye, right_eye = self._locate_eyes(frame, face_bbox)

        # 3. Pupil detection.
        left_pupil = self._detect_pupil(left_eye)
        right_pupil = self._detect_pupil(right_eye)

        # 4. Gaze estimation; head pose is fixed to zero in this demo.
        left_gaze = self._estimate_gaze(left_eye, head_pose=(0, 0, 0))
        right_gaze = self._estimate_gaze(right_eye, head_pose=(0, 0, 0))

        # 5. Binocular fusion.
        final_gaze = self._fuse_gaze(left_gaze, right_gaze)

        return {
            'left_pupil': left_pupil,
            'right_pupil': right_pupil,
            'left_gaze': left_gaze,
            'right_gaze': right_gaze,
            'final_gaze': final_gaze
        }

    def _detect_face(self, frame: np.ndarray) -> Tuple[int, int, int, int]:
        """Return a face box covering the whole frame (placeholder).

        A real system would call a detector such as RetinaFace or BlazeFace.
        """
        return (0, 0, frame.shape[1], frame.shape[0])

    def _locate_eyes(self,
                     frame: np.ndarray,
                     face_bbox: Tuple) -> Tuple[np.ndarray, np.ndarray]:
        """Crop two fixed eye regions from the frame (placeholder).

        ``face_bbox`` is accepted but not used: the crops are fixed fractions
        of the full frame. A real system would use facial landmarks.

        Returns:
            ``(left_eye, right_eye)`` views into ``frame``.
        """
        h, w = frame.shape[:2]
        left_eye = frame[h // 4:h // 2, w // 4:w // 2]
        right_eye = frame[h // 4:h // 2, w // 2:3 * w // 4]
        return left_eye, right_eye

    @staticmethod
    def _to_gray_tensor(eye_image: np.ndarray) -> torch.Tensor:
        """Convert a BGR crop to a (1, 1, H, W) float tensor scaled by 1/255."""
        # Imported locally so the preprocessing works when this class is
        # imported from another module; otherwise OpenCV is only imported
        # under the __main__ guard and `cv2` would be an undefined name.
        import cv2

        gray = cv2.cvtColor(eye_image, cv2.COLOR_BGR2GRAY)
        return torch.from_numpy(gray).float().unsqueeze(0).unsqueeze(0) / 255.0

    def _detect_pupil(self, eye_image: np.ndarray) -> Dict:
        """Run the pupil detector on one BGR eye crop.

        Returns:
            Dict with ``'center'`` (2 values) and ``'ellipse'`` (5 values) as
            NumPy arrays for the single input sample.
        """
        tensor = self._to_gray_tensor(eye_image)

        with torch.no_grad():
            output = self.pupil_detector(tensor)

        return {
            'center': output['center'].cpu().numpy()[0],
            'ellipse': output['ellipse'].cpu().numpy()[0]
        }

    def _estimate_gaze(self,
                       eye_image: np.ndarray,
                       head_pose: Tuple) -> np.ndarray:
        """Run the gaze estimator on one BGR eye crop.

        Args:
            eye_image: BGR eye crop.
            head_pose: Three head-pose values passed to the estimator.

        Returns:
            NumPy array with the two gaze outputs for the single sample.
        """
        eye_tensor = self._to_gray_tensor(eye_image)
        head_tensor = torch.tensor(head_pose).float().unsqueeze(0)

        with torch.no_grad():
            gaze = self.gaze_estimator(eye_tensor, head_tensor)

        return gaze.cpu().numpy()[0]

    def _fuse_gaze(self, left: np.ndarray, right: np.ndarray) -> np.ndarray:
        """Fuse both eyes by taking the element-wise mean."""
        return (left + right) / 2


# Smoke test: run the pipeline once on a random frame.
if __name__ == "__main__":
    # Imported at module scope (not inside a function) so that any code in
    # this file referring to `cv2` as a global name can resolve it.
    import cv2

    config = {}
    pipeline = EyeTrackerPipeline(config)

    # Synthetic input; randint's upper bound is exclusive, so 256 is needed
    # to cover the full uint8 range.
    dummy_frame = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)
    result = pipeline.process_frame(dummy_frame)

    print("=== 眼动追踪结果 ===")
    print(f"左眼瞳孔中心: {result['left_pupil']['center']}")
    print(f"右眼瞳孔中心: {result['right_pupil']['center']}")
    print(f"最终视线方向: {result['final_gaze']}")

Euro NCAP DMS 应用

视线追踪在 DMS 中的应用

应用 精度要求 帧率要求
分心检测 ±5° ≥ 15 fps
视线落点 ±3° ≥ 30 fps
疲劳检测 ±10° ≥ 10 fps

视线落点检测

class GazeZoneDetector:
    """Map gaze angles to in-cabin zones and flag long off-road glances.

    Intended for Euro NCAP style distraction detection.

    Args:
        config: Configuration mapping. Accepted for interface parity with the
            other classes in this file; no key is read from it here.
    """

    def __init__(self, config: Dict):
        # Zones in the vehicle frame. Only the first two entries of 'center'
        # (compared against yaw and pitch) and 'radius' are used for matching.
        self.zones = {
            'road_ahead': {
                'center': (0, 0, 10),
                'radius': 15,  # degrees
                'priority': 'high'
            },
            'left_mirror': {
                'center': (-30, 0, 2),
                'radius': 10,
                'priority': 'low'
            },
            'right_mirror': {
                'center': (30, 0, 2),
                'radius': 10,
                'priority': 'low'
            },
            'instrument_cluster': {
                'center': (0, -20, 1),
                'radius': 10,
                'priority': 'low'
            },
            'infotainment': {
                'center': (20, -25, 1),
                'radius': 10,
                'priority': 'medium'
            }
        }

        # Continuous off-road time (seconds) that counts as distraction.
        self.distraction_threshold = 3.0

    def detect_zone(self, gaze: np.ndarray) -> str:
        """Return the name of the zone the gaze falls in.

        Zones are tested in definition order and the first one whose centre
        lies closer than its radius wins, so ``'road_ahead'`` takes
        precedence where zones overlap.

        Args:
            gaze: Two values ``[yaw, pitch]`` in degrees.

        Returns:
            The matching zone name, or ``'unknown'`` if none matches.
        """
        yaw, pitch = gaze

        for zone_name, zone in self.zones.items():
            center_yaw, center_pitch = zone['center'][0], zone['center'][1]

            # Planar angular distance between gaze and zone centre.
            dist = np.hypot(yaw - center_yaw, pitch - center_pitch)

            if dist < zone['radius']:
                return zone_name

        return 'unknown'

    def check_distraction(self,
                          gaze_history: List[Tuple[np.ndarray, float]],
                          timestamp: float) -> Tuple[bool, str]:
        """Report whether the history contains an overly long off-road glance.

        The timer starts at the first sample outside ``'road_ahead'`` and is
        reset by any sample inside it. Moving between two off-road zones does
        not reset or stall the timer: the eyes are still off the road.

        Args:
            gaze_history: Sequence of ``(gaze, ts)`` pairs, where ``gaze`` is
                ``[yaw, pitch]`` and ``ts`` is the sample time in seconds.
            timestamp: Current time. Not used by the computation; kept for
                interface compatibility.

        Returns:
            ``(True, 'looking_<zone>')`` for the zone being looked at when the
            off-road time first exceeds the threshold, else ``(False, '')``.
        """
        off_road_start = None

        for gaze, ts in gaze_history:
            zone = self.detect_zone(gaze)

            if zone == 'road_ahead':
                # Eyes back on the road: the glance is over.
                off_road_start = None
                continue

            if off_road_start is None:
                off_road_start = ts

            if ts - off_road_start > self.distraction_threshold:
                return True, f'looking_{zone}'

        return False, ''

鲁棒性优化

1. 墨镜/红外场景

class RobustPupilDetector(nn.Module):
    """Two-stream RGB + IR feature extractor for sunglasses / infrared scenes.

    Each modality has its own three-stage strided encoder (overall stride 8,
    256 channels). The two feature maps are concatenated and mixed back to
    256 channels by a 1x1 convolution.
    """

    def __init__(self):
        super().__init__()

        # One encoder per modality; they differ only in input channels.
        self.rgb_encoder = self._build_encoder(3)
        self.ir_encoder = self._build_encoder(1)

        # 1x1 convolution reducing the 2 x 256 concatenated channels to 256.
        self.fusion = nn.Conv2d(512, 256, 1)

    def _build_encoder(self, in_channels: int) -> nn.Module:
        """Build a 3-layer stride-2 conv encoder: in_channels -> 256."""
        return nn.Sequential(
            nn.Conv2d(in_channels, 64, 3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 128, 3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 256, 3, stride=2, padding=1),
            nn.ReLU(),
        )

    def forward(self, rgb: torch.Tensor, ir: torch.Tensor) -> Dict:
        """Encode both modalities and fuse them.

        Args:
            rgb: Colour image batch of shape (B, 3, H, W).
            ir: Infrared image batch of shape (B, 1, H, W). It must share
                B, H and W with ``rgb`` so the feature maps can be
                concatenated channel-wise.

        Returns:
            ``{'features': fused}`` where ``fused`` has 256 channels at 1/8
            of the input resolution. Detection heads are not part of this
            sketch.
        """
        rgb_feat = self.rgb_encoder(rgb)
        ir_feat = self.ir_encoder(ir)

        # Channel-wise concatenation followed by the 1x1 mixing convolution.
        fused = self.fusion(torch.cat([rgb_feat, ir_feat], dim=1))

        return {'features': fused}

2. 头部姿态补偿

def compensate_head_pose(gaze: np.ndarray,
                         head_pose: np.ndarray,
                         *,
                         yaw_gain: float = 0.7,
                         pitch_gain: float = 0.5) -> np.ndarray:
    """Subtract a fraction of the head rotation from the raw gaze angles.

    This is a linear correction, not a rotation-matrix transform. Gaze and
    head pose must use the same angular unit; the rest of this file uses
    degrees for both, so no unit conversion is applied here.

    Args:
        gaze: Raw gaze, at least ``[yaw, pitch]``. Not modified.
        head_pose: Head pose ordered ``[pitch, yaw, roll]``, same unit as
            ``gaze``. Roll is ignored.
        yaw_gain: Fraction of head yaw removed from gaze yaw.
        pitch_gain: Fraction of head pitch removed from gaze pitch.

    Returns:
        A new array holding the compensated gaze.
    """
    pitch, yaw, _roll = np.asarray(head_pose, dtype=float)

    # Work on a copy so the caller's array is left untouched; promote
    # integer input so the in-place subtraction below cannot fail.
    compensated = np.array(gaze, copy=True)
    if not np.issubdtype(compensated.dtype, np.floating):
        compensated = compensated.astype(float)

    compensated[0] -= yaw * yaw_gain
    compensated[1] -= pitch * pitch_gain

    return compensated

IMS 开发启示

1. 模型选型

场景 推荐模型 原因
边缘部署 MobileNetV3 + 轻量头 低延迟
高精度 ResNet50 + Attention 高精度
墨镜场景 RGB-IR 双流 鲁棒性

2. 数据集

数据集 大小 用途
MPIIGaze 213K 视线估计
GazeCapture 2.5M 视线估计
TEyeD 20M 瞳孔分割

3. 部署优化

# ONNX export
def export_to_onnx(model,
                   input_shape=(1, 1, 64, 64),
                   output_path="pupil_detector.onnx",
                   opset_version=11):
    """Export ``model`` to an ONNX file by tracing it with a random input.

    Args:
        model: The ``torch.nn.Module`` to export.
        input_shape: Shape of the random dummy input used for tracing.
        output_path: Destination file; defaults to the original hard-coded
            name.
        opset_version: ONNX opset to target; defaults to the original value.
    """
    dummy_input = torch.randn(*input_shape)
    # NOTE(review): three output names are supplied. If the exported model
    # returns more tensors than that, confirm how the exporter names the
    # remaining outputs.
    torch.onnx.export(
        model,
        dummy_input,
        output_path,
        opset_version=opset_version,
        input_names=['eye_image'],
        output_names=['center', 'ellipse', 'segmentation']
    )

总结

深度学习显著提升眼动追踪精度:

  1. 瞳孔检测:CNN 分割优于传统方法
  2. 视线估计:外观法 + 头部姿态融合
  3. 鲁棒性:RGB-IR 融合解决墨镜场景
  4. 实时性:轻量化模型满足车载需求

参考来源:


深度学习眼动追踪:瞳孔检测与视线估计论文解读
https://dapalm.com/2026/06/15/2026-06-15-Deep-Learning-Eye-Tracking-Pupil-Detection-Gaze-Estimation/
作者
Mars
发布于
2026年6月15日
许可协议