深度学习眼动追踪：瞳孔检测与视线估计论文解读

发布时间： 2026-06-15
标签： 论文解读, 眼动追踪, 瞳孔检测, 视线估计, DMS
来源： arXiv 2403.19768, MDPI Applied Sciences

论文信息

标题： Using Deep Learning to Increase Eye-Tracking Robustness, Accuracy, and Precision in Virtual Reality
作者： Pupil Labs 研究团队
发表： arXiv:2403.19768 (2024年3月)
链接： https://arxiv.org/abs/2403.19768

核心贡献

本文系统性评估了深度学习方法对眼动追踪精度的影响：

- 特征检测模型对最终视线估计的贡献
- 基于特征 vs 基于模型的视线估计对比
- VR 场景下的鲁棒性验证

眼动追踪技术栈

graph LR
    A[眼部图像] --> B[人脸/眼部检测]
    B --> C[瞳孔定位]
    C --> D[特征提取]
    D --> E[视线估计]
    E --> F[3D 视线向量]

瞳孔检测算法

1. 传统方法 vs 深度学习

方法	精度	速度	鲁棒性
边缘检测 + Hough	低	快	差
Daugman 算子	中	中	中
CNN 分割	高	慢	好
YOLO 检测	高	快	好

2. 深度学习瞳孔检测实现

"""
深度学习瞳孔检测与视线估计
基于论文方法复现
"""

from typing import Dict, List, Optional, Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

class PupilDetector(nn.Module):
    """
    Pupil detection network: one shared encoder feeding three heads.

    The encoder reduces the spatial size by a factor of 16. On top of it:
      * ``center_head`` produces a single-channel heatmap at 1/16 resolution,
      * ``ellipse_head`` regresses five values per image (cx, cy, a, b, angle),
      * ``segmentation_head`` upsamples by 16 to produce a pupil mask.

    ``config`` is accepted for interface compatibility; it is not read here.
    """

    def __init__(self, config: Dict):
        super().__init__()

        # Lightweight encoder; each stride-2 conv halves the resolution.
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 32, 3, stride=2, padding=1),   # 1/2
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(32, 64, 3, stride=2, padding=1),  # 1/4
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(64, 128, 3, stride=2, padding=1), # 1/8
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.Conv2d(128, 256, 3, stride=2, padding=1),# 1/16
            nn.BatchNorm2d(256),
            nn.ReLU()
        )

        # Pupil-centre head: per-cell score squashed into (0, 1).
        self.center_head = nn.Sequential(
            nn.Conv2d(256, 128, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 1, 1),
            nn.Sigmoid()
        )

        # Ellipse head: global pooling followed by a linear regressor.
        self.ellipse_head = nn.Sequential(
            nn.Conv2d(256, 64, 3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Linear(64, 5)  # cx, cy, a, b, angle
        )

        # Segmentation head: four 2x transposed convs undo the 16x reduction.
        self.segmentation_head = nn.Sequential(
            nn.ConvTranspose2d(256, 128, 4, stride=2, padding=1),  # 1/8
            nn.ReLU(),
            nn.ConvTranspose2d(128, 64, 4, stride=2, padding=1),   # 1/4
            nn.ReLU(),
            nn.ConvTranspose2d(64, 32, 4, stride=2, padding=1),    # 1/2
            nn.ReLU(),
            nn.ConvTranspose2d(32, 1, 4, stride=2, padding=1),     # full size
            nn.Sigmoid()
        )

    def forward(self, eye_image: torch.Tensor) -> Dict:
        """
        Run the encoder and all three heads.

        Args:
            eye_image: single-channel eye image batch, (B, 1, H, W).

        Returns:
            dict with
              'center':       (B, 2) normalised (x, y) of the heatmap maximum,
              'ellipse':      (B, 5) raw ellipse regression output,
              'segmentation': (B, 1, H, W) mask with values in [0, 1],
              'heatmap':      (B, 1, h, w) centre heatmap, where h and w are
                              the encoder output size (H/16, W/16 rounded up).
        """
        features = self.encoder(eye_image)

        center_heatmap = self.center_head(features)
        ellipse_params = self.ellipse_head(features)
        segmentation = self.segmentation_head(features)

        # The stride-2 convs round odd sizes up, so when H or W is not a
        # multiple of 16 the 16x upsampling overshoots the input (e.g. a
        # 120-row crop comes back with 128 rows). Resample so the mask always
        # lines up with the input, as the documented shape promises.
        input_size = tuple(eye_image.shape[-2:])
        if tuple(segmentation.shape[-2:]) != input_size:
            segmentation = F.interpolate(
                segmentation, size=input_size,
                mode='bilinear', align_corners=False
            )

        batch_size = eye_image.shape[0]
        center_coords = self._extract_center(center_heatmap, batch_size)

        return {
            'center': center_coords,
            'ellipse': ellipse_params,
            'segmentation': segmentation,
            'heatmap': center_heatmap
        }

    def _extract_center(self, heatmap: torch.Tensor, batch_size: int) -> torch.Tensor:
        """
        Convert a centre heatmap into normalised (x, y) coordinates.

        The flat index of the per-sample maximum is split into row and column
        and divided by the heatmap height and width, so both coordinates lie
        in [0, 1) and refer to the winning cell's index, not its midpoint.
        The indices come from ``argmax``, so no gradient flows through the
        returned coordinates.

        Args:
            heatmap: (B, 1, h, w) heatmap.
            batch_size: B, the leading dimension of ``heatmap``.

        Returns:
            (B, 2) tensor ordered as (x, y).
        """
        # reshape (rather than view) also accepts non-contiguous inputs.
        flat = heatmap.reshape(batch_size, -1)
        max_idx = flat.argmax(dim=1)

        h, w = heatmap.shape[2], heatmap.shape[3]
        y = (max_idx // w).float() / h
        x = (max_idx % w).float() / w

        return torch.stack([x, y], dim=1)


class GazeEstimator(nn.Module):
    """
    Appearance-based gaze regressor.

    A small CNN embeds a single-channel eye image into 128 features and an
    MLP embeds the 3-value head pose into 64 features. The two embeddings are
    concatenated and regressed to two values per sample, which the original
    author labels (yaw, pitch).

    ``config`` is accepted for interface compatibility; it is not read here.
    """

    def __init__(self, config: Dict):
        super().__init__()

        # Eye branch: three stride-2 convs, then global average pooling so the
        # embedding size does not depend on the crop resolution.
        eye_layers = [
            nn.Conv2d(1, 32, 5, stride=2, padding=2),
            nn.ReLU(),
            nn.Conv2d(32, 64, 3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 128, 3, stride=2, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
        ]
        self.eye_encoder = nn.Sequential(*eye_layers)

        # Head-pose branch: (pitch, yaw, roll) -> 64 features.
        pose_layers = [
            nn.Linear(3, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
        ]
        self.head_encoder = nn.Sequential(*pose_layers)

        # Regressor on the fused 128 + 64 feature vector.
        regressor_layers = [
            nn.Linear(128 + 64, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 2),  # yaw, pitch
        ]
        self.gaze_regressor = nn.Sequential(*regressor_layers)

    def forward(self,
                eye_image: torch.Tensor,
                head_pose: torch.Tensor) -> torch.Tensor:
        """
        Predict gaze angles from an eye crop and the head pose.

        Args:
            eye_image: eye image batch, (B, 1, H, W).
            head_pose: (B, 3) head pose ordered [pitch, yaw, roll]; the
                original author documents the unit as degrees.

        Returns:
            (B, 2) tensor ordered [yaw, pitch]; the original author documents
            the unit as degrees.
        """
        fused = torch.cat(
            (self.eye_encoder(eye_image), self.head_encoder(head_pose)),
            dim=1,
        )
        return self.gaze_regressor(fused)


class EyeTrackerPipeline:
    """
    End-to-end eye-tracking pipeline for a single camera frame.

    Chains face detection, eye localisation, pupil detection and gaze
    estimation. Face detection and eye localisation are simplified stand-ins
    (fixed crops); the pupil and gaze stages run the two networks.
    """

    def __init__(self, config: Dict):
        self.pupil_detector = PupilDetector(config)
        self.gaze_estimator = GazeEstimator(config)

        # Load pretrained weights here in a real deployment:
        # self.pupil_detector.load_state_dict(...)
        # self.gaze_estimator.load_state_dict(...)

        # This class only runs inference. torch.no_grad() disables gradient
        # tracking but leaves Dropout active and BatchNorm on batch
        # statistics; eval() is what switches both to inference behaviour.
        self.pupil_detector.eval()
        self.gaze_estimator.eval()

    def process_frame(self, frame: np.ndarray) -> Dict:
        """
        Run the full pipeline on one frame.

        Args:
            frame: input image, (H, W, 3) in BGR channel order.

        Returns:
            dict with 'left_pupil' and 'right_pupil' (each a dict holding
            'center' and 'ellipse'), 'left_gaze', 'right_gaze' and
            'final_gaze' (the average of the two per-eye gaze outputs).
        """
        # 1. Face detection (simplified placeholder).
        face_bbox = self._detect_face(frame)

        # 2. Eye localisation.
        left_eye, right_eye = self._locate_eyes(frame, face_bbox)

        # 3. Pupil detection per eye.
        left_pupil = self._detect_pupil(left_eye)
        right_pupil = self._detect_pupil(right_eye)

        # 4. Gaze estimation per eye. No head-pose estimator is wired in, so
        #    a neutral pose is passed.
        left_gaze = self._estimate_gaze(left_eye, head_pose=(0, 0, 0))
        right_gaze = self._estimate_gaze(right_eye, head_pose=(0, 0, 0))

        # 5. Binocular fusion.
        final_gaze = self._fuse_gaze(left_gaze, right_gaze)

        return {
            'left_pupil': left_pupil,
            'right_pupil': right_pupil,
            'left_gaze': left_gaze,
            'right_gaze': right_gaze,
            'final_gaze': final_gaze
        }

    def _detect_face(self, frame: np.ndarray) -> Tuple[int, int, int, int]:
        """Return a face box covering the whole frame as (0, 0, width, height).

        Placeholder for a real detector such as RetinaFace or BlazeFace.
        """
        return (0, 0, frame.shape[1], frame.shape[0])

    def _locate_eyes(self,
                     frame: np.ndarray,
                     face_bbox: Tuple) -> Tuple[np.ndarray, np.ndarray]:
        """Crop two fixed regions of the frame as the eye images.

        Placeholder for landmark-based localisation. ``face_bbox`` is accepted
        but not used: both crops are fixed fractions of the full frame (rows
        H/4..H/2; columns W/4..W/2 for the first crop and W/2..3W/4 for the
        second).
        """
        h, w = frame.shape[:2]
        left_eye = frame[h//4:h//2, w//4:w//2]
        right_eye = frame[h//4:h//2, w//2:3*w//4]
        return left_eye, right_eye

    @staticmethod
    def _to_gray_tensor(eye_image: np.ndarray) -> torch.Tensor:
        """Convert a BGR eye crop into a (1, 1, H, W) float tensor in [0, 1]."""
        # Local import: this file has no module-level ``import cv2``, so
        # relying on a global name fails whenever the module is imported
        # rather than executed as a script.
        import cv2

        gray = cv2.cvtColor(eye_image, cv2.COLOR_BGR2GRAY)
        return torch.from_numpy(gray).float().unsqueeze(0).unsqueeze(0) / 255.0

    def _detect_pupil(self, eye_image: np.ndarray) -> Dict:
        """Run the pupil detector on one BGR eye crop.

        Returns:
            dict with 'center' (2 values) and 'ellipse' (5 values) as NumPy
            arrays for the single image in the batch.
        """
        tensor = self._to_gray_tensor(eye_image)

        with torch.no_grad():
            output = self.pupil_detector(tensor)

        return {
            'center': output['center'].cpu().numpy()[0],
            'ellipse': output['ellipse'].cpu().numpy()[0]
        }

    def _estimate_gaze(self,
                       eye_image: np.ndarray,
                       head_pose: Tuple) -> np.ndarray:
        """Run the gaze estimator on one BGR eye crop.

        Args:
            eye_image: BGR eye crop.
            head_pose: three head-pose values, forwarded to the network as a
                (1, 3) float tensor.

        Returns:
            NumPy array with the two gaze values for this crop.
        """
        eye_tensor = self._to_gray_tensor(eye_image)
        head_tensor = torch.tensor(head_pose).float().unsqueeze(0)

        with torch.no_grad():
            gaze = self.gaze_estimator(eye_tensor, head_tensor)

        return gaze.cpu().numpy()[0]

    def _fuse_gaze(self, left: np.ndarray, right: np.ndarray) -> np.ndarray:
        """Fuse the two per-eye gaze estimates by plain averaging."""
        return (left + right) / 2


# 测试示例
if __name__ == "__main__":
    # Imported at module scope (not inside a function) so that the name
    # ``cv2`` exists as a module global when this file is run as a script.
    import cv2

    # Smoke test: push one random frame through the untrained pipeline.
    tracker = EyeTrackerPipeline({})

    # Random uint8 image standing in for a 640x480 BGR camera frame.
    frame_shape = (480, 640, 3)
    fake_frame = np.random.randint(0, 255, frame_shape, dtype=np.uint8)
    result = tracker.process_frame(fake_frame)

    print("=== 眼动追踪结果 ===")
    print(f"左眼瞳孔中心: {result['left_pupil']['center']}")
    print(f"右眼瞳孔中心: {result['right_pupil']['center']}")
    print(f"最终视线方向: {result['final_gaze']}")

Euro NCAP DMS 应用

视线追踪在 DMS 中的应用

应用	精度要求	帧率要求
分心检测	±5°	≥ 15 fps
视线落点	±3°	≥ 30 fps
疲劳检测	±10°	≥ 10 fps

视线落点检测

class GazeZoneDetector:
    """
    Map gaze angles to named in-cabin zones and flag long off-road glances.

    Intended for the Euro NCAP style distraction check described in the
    surrounding article. ``config`` is accepted for interface compatibility;
    it is not read here.
    """

    def __init__(self, config: Dict):
        # Zone table. Only the first two entries of 'center' are used by
        # detect_zone (compared against yaw and pitch); the third entry is
        # not read anywhere in this class.
        # NOTE(review): the third value looks like a distance/depth, and
        # 'priority' is stored but never consulted -- confirm intent.
        self.zones = {
            'road_ahead': {
                'center': (0, 0, 10),
                'radius': 15,           # angular radius
                'priority': 'high'
            },
            'left_mirror': {
                'center': (-30, 0, 2),
                'radius': 10,
                'priority': 'low'
            },
            'right_mirror': {
                'center': (30, 0, 2),
                'radius': 10,
                'priority': 'low'
            },
            'instrument_cluster': {
                'center': (0, -20, 1),
                'radius': 10,
                'priority': 'low'
            },
            'infotainment': {
                'center': (20, -25, 1),
                'radius': 10,
                'priority': 'medium'
            }
        }

        # Continuous off-road time above which the driver counts as
        # distracted, in the same unit as the history timestamps (the
        # original author documents seconds).
        self.distraction_threshold = 3.0

    def detect_zone(self, gaze: np.ndarray) -> str:
        """
        Return the name of the first zone that contains the gaze direction.

        Zones are tested in the insertion order of ``self.zones``, so where
        two zones overlap the earlier one wins.

        Args:
            gaze: two values [yaw, pitch], in the same angular unit as the
                zone centres and radii.

        Returns:
            The matching zone name, or 'unknown' when no zone matches.
        """
        yaw, pitch = gaze

        for zone_name, zone in self.zones.items():
            center_yaw, center_pitch = zone['center'][0], zone['center'][1]

            # Euclidean distance in the (yaw, pitch) plane.
            dist = np.hypot(yaw - center_yaw, pitch - center_pitch)

            if dist < zone['radius']:
                return zone_name

        return 'unknown'

    def check_distraction(self,
                          gaze_history: List[Tuple[np.ndarray, float]],
                          timestamp: float) -> Tuple[bool, str]:
        """
        Decide whether the most recent off-road glance is too long.

        The history is scanned in order. An off-road episode starts at the
        first sample outside 'road_ahead' and keeps running while the gaze
        stays off the road, even if it moves between off-road zones. Any
        'road_ahead' sample ends the episode and resets the timer.

        Args:
            gaze_history: chronologically ordered (gaze, timestamp) pairs,
                where gaze is [yaw, pitch].
            timestamp: current time. Not used by the computation; kept so
                existing callers keep working.

        Returns:
            (is_distracted, distraction_type). When distracted, the type is
            'looking_<zone>' for the zone of the latest sample; otherwise
            (False, '').
        """
        off_road_start = None   # timestamp at which the current episode began
        off_road_time = 0
        current_zone = None

        for gaze, ts in gaze_history:
            zone = self.detect_zone(gaze)

            if zone == 'road_ahead':
                # Eyes back on the road: the episode is over.
                off_road_start = None
                current_zone = None
                off_road_time = 0
                continue

            if off_road_start is None:
                off_road_start = ts

            # Track the latest zone and the episode length. Measuring from
            # the start of the episode (not from the start of one particular
            # zone) keeps the timer correct when the gaze hops between
            # off-road zones.
            current_zone = zone
            off_road_time = ts - off_road_start

        if off_road_time > self.distraction_threshold:
            return True, f'looking_{current_zone}'

        return False, ''

鲁棒性优化

1. 墨镜/红外场景

class RobustPupilDetector(nn.Module):
    """
    Two-stream feature extractor that fuses an RGB image with an IR image.

    Each modality has its own encoder; their 256-channel feature maps are
    concatenated and reduced back to 256 channels by a 1x1 convolution. The
    article motivates this with sunglasses / infrared scenarios.
    """

    def __init__(self):
        super().__init__()

        # One encoder per modality: 3-channel RGB and 1-channel IR.
        self.rgb_encoder = self._build_encoder(3)
        self.ir_encoder = self._build_encoder(1)

        # 1x1 conv mixing the concatenated 256 + 256 channels down to 256.
        self.fusion = nn.Conv2d(512, 256, 1)

    def _build_encoder(self, in_channels: int) -> nn.Module:
        """Build a three-stage stride-2 conv encoder ending in 256 channels."""
        widths = (in_channels, 64, 128, 256)
        stages = []
        for c_in, c_out in zip(widths, widths[1:]):
            stages += [
                nn.Conv2d(c_in, c_out, 3, stride=2, padding=1),
                nn.ReLU(),
            ]
        return nn.Sequential(*stages)

    def forward(self, rgb: torch.Tensor, ir: torch.Tensor) -> Dict:
        """
        Encode both modalities and fuse them.

        Args:
            rgb: 3-channel image batch.
            ir: 1-channel image batch; must produce the same feature-map size
                as ``rgb`` so the two can be concatenated along channels.

        Returns:
            dict with a single key 'features' holding the fused feature map.
        """
        stacked = torch.cat(
            (self.rgb_encoder(rgb), self.ir_encoder(ir)),
            dim=1,
        )

        # Downstream heads are not part of this class.
        return {'features': self.fusion(stacked)}

2. 头部姿态补偿

def compensate_head_pose(gaze: np.ndarray,
                         head_pose: np.ndarray,
                         yaw_gain: float = 0.7,
                         pitch_gain: float = 0.5) -> np.ndarray:
    """
    Subtract a head-pose-dependent offset from a gaze estimate.

    The input array is left untouched; a corrected copy is returned.

    Args:
        gaze: raw gaze values; element 0 is corrected with the head yaw and
            element 1 with the head pitch.
        head_pose: three values ordered [pitch, yaw, roll]. They are passed
            through ``np.radians`` before use. Roll is not used.
        yaw_gain: fraction of the (radian) head yaw removed from gaze[0].
        pitch_gain: fraction of the (radian) head pitch removed from gaze[1].

    Returns:
        A new floating-point array holding the compensated gaze.
    """
    # NOTE(review): the head pose is converted to radians, while other code
    # in this file documents gaze angles in degrees. If ``gaze`` is in
    # degrees the correction is mixed-unit -- confirm the intended unit.
    pitch, yaw, _roll = np.radians(head_pose)

    # Work on a copy so the caller's array is not modified, and promote
    # integer input to a float dtype so the in-place subtraction is legal.
    gaze_arr = np.asarray(gaze)
    compensated = gaze_arr.astype(np.result_type(gaze_arr.dtype, np.float32))

    compensated[0] -= yaw * yaw_gain
    compensated[1] -= pitch * pitch_gain

    return compensated

IMS 开发启示

1. 模型选型

场景	推荐模型	原因
边缘部署	MobileNetV3 + 轻量头	低延迟
高精度	ResNet50 + Attention	高精度
墨镜场景	RGB-IR 双流	鲁棒性

2. 数据集

数据集	大小	用途
MPIIGaze	213K	视线估计
GazeCapture	2.5M	视线估计
TEyeD	20M	瞳孔分割

3. 部署优化

# ONNX 导出
def export_to_onnx(model,
                   input_shape=(1, 1, 64, 64),
                   output_path="pupil_detector.onnx",
                   opset_version=11):
    """
    Export ``model`` to an ONNX file by tracing it with a random input.

    Args:
        model: the module to export. The fixed input/output names below are
            written for the pupil detector.
        input_shape: shape of the random dummy input used for tracing.
        output_path: destination file; defaults to the previously hard-coded
            "pupil_detector.onnx".
        opset_version: ONNX opset to target; defaults to the previously
            hard-coded 11.
    """
    dummy_input = torch.randn(*input_shape)

    # NOTE(review): the pupil detector in this file returns four tensors
    # (center, ellipse, segmentation, heatmap) but only three names are
    # listed -- confirm how the exporter names the remaining output.
    torch.onnx.export(
        model,
        dummy_input,
        output_path,
        opset_version=opset_version,
        input_names=['eye_image'],
        output_names=['center', 'ellipse', 'segmentation']
    )

总结

深度学习显著提升眼动追踪精度：

- 瞳孔检测：CNN 分割优于传统方法
- 视线估计：外观法 + 头部姿态融合
- 鲁棒性：RGB-IR 融合解决墨镜场景
- 实时性：轻量化模型满足车载需求

参考来源：

论文解读 > DMS

#DMS #眼动追踪 #视线估计 #论文解读 #瞳孔检测

深度学习眼动追踪：瞳孔检测与视线估计论文解读

https://dapalm.com/2026/06/15/2026-06-15-Deep-Learning-Eye-Tracking-Pupil-Detection-Gaze-Estimation/

作者

Mars

发布于

2026年6月15日

许可协议

上一篇：Euro NCAP 2027 路线图：从疲劳检测到认知分心

下一篇：Euro NCAP 2026 无响应驾驶员干预：ESF 紧急停车功能技术详解