驾驶员视线追踪技术详解:注意力机制实现高精度分心检测

驾驶员视线追踪技术详解:注意力机制实现高精度分心检测

来源: Springer Nature + ScienceDirect
发布时间: 2026年4月
核心价值: 视线追踪是Euro NCAP 2026分心检测的核心技术


核心洞察

视线追踪技术指标:

指标 传统方案 注意力机制方案
角度精度 5-10° 1-3°
头部自由度 限制大 高自由度
实时性 50-100ms <30ms
遮挡鲁棒 中等 强

Euro NCAP 2026分心检测要求:

  • 视线偏离道路≥3秒触发警告
  • 手机使用检测(腿部/耳边)
  • 需要眼动追踪直接监测

一、技术原理

1.1 视线估计流程

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
视线估计流程

├── 输入:面部图像

├── 1. 人脸检测
│ └── 关键点定位(68点)

├── 2. 眼部区域提取
│ ├── 左眼ROI
│ └── 右眼ROI

├── 3. 头部姿态估计
│ └── (yaw, pitch, roll)

├── 4. 视线方向估计
│ ├── 瞳孔检测
│ ├── 虹膜边缘
│ └── 视线向量

└── 输出:(yaw, pitch) 或 视线区域

1.2 视线区域定义

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
"""
驾驶员视线区域定义
Euro NCAP标准
"""

from enum import Enum
from dataclasses import dataclass
from typing import Tuple, Optional
import numpy as np

class GazeZone(Enum):
"""视线区域"""
FORWARD = "forward" # 前方道路
LEFT_MIRROR = "left_mirror" # 左后视镜
RIGHT_MIRROR = "right_mirror" # 右后视镜
REAR_MIRROR = "rear_mirror" # 车内后视镜
INSTRUMENT = "instrument" # 仪表盘
CENTER_CONSOLE = "console" # 中控
PASSENGER = "passenger" # 副驾驶侧
DOWN = "down" # 下方
UNKNOWN = "unknown" # 未识别

@dataclass
class GazeVector:
"""视线向量"""
yaw: float # 水平角度 (-90° to 90°)
pitch: float # 垂直角度 (-90° to 90°)
confidence: float # 置信度 (0-1)

class GazeZoneClassifier:
"""
视线区域分类器

基于视线角度和头部姿态判断视线区域
"""

def __init__(self):
# 区域定义(角度范围)
self.zone_ranges = {
GazeZone.FORWARD: {
'yaw': (-15, 15),
'pitch': (-10, 10)
},
GazeZone.LEFT_MIRROR: {
'yaw': (-45, -15),
'pitch': (-5, 15)
},
GazeZone.RIGHT_MIRROR: {
'yaw': (15, 45),
'pitch': (-5, 15)
},
GazeZone.REAR_MIRROR: {
'yaw': (-15, 15),
'pitch': (10, 30)
},
GazeZone.INSTRUMENT: {
'yaw': (-15, 15),
'pitch': (-30, -10)
},
GazeZone.CENTER_CONSOLE: {
'yaw': (-30, 30),
'pitch': (-45, -30)
},
GazeZone.PASSENGER: {
'yaw': (30, 60),
'pitch': (-20, 20)
},
GazeZone.DOWN: {
'yaw': (-30, 30),
'pitch': (-60, -45)
},
}

def classify(self, gaze: GazeVector,
head_yaw: float = 0,
head_pitch: float = 0) -> Tuple[GazeZone, float]:
"""
分类视线区域

Args:
gaze: 视线向量
head_yaw: 头部水平角度
head_pitch: 头部垂直角度

Returns:
(zone, confidence)
"""
# 补偿头部姿态
compensated_yaw = gaze.yaw + head_yaw * 0.5
compensated_pitch = gaze.pitch + head_pitch * 0.5

# 匹配区域
best_zone = GazeZone.UNKNOWN
best_score = 0

for zone, ranges in self.zone_ranges.items():
yaw_in = ranges['yaw'][0] <= compensated_yaw <= ranges['yaw'][1]
pitch_in = ranges['pitch'][0] <= compensated_pitch <= ranges['pitch'][1]

if yaw_in and pitch_in:
# 计算距离中心的得分
yaw_center = (ranges['yaw'][0] + ranges['yaw'][1]) / 2
pitch_center = (ranges['pitch'][0] + ranges['pitch'][1]) / 2

yaw_dist = abs(compensated_yaw - yaw_center)
pitch_dist = abs(compensated_pitch - pitch_center)

score = 1 / (1 + yaw_dist + pitch_dist)

if score > best_score:
best_score = score
best_zone = zone

return best_zone, best_score * gaze.confidence

def is_eyes_on_road(self, zone: GazeZone) -> bool:
"""判断是否注视道路"""
road_zones = {
GazeZone.FORWARD,
GazeZone.LEFT_MIRROR,
GazeZone.RIGHT_MIRROR,
GazeZone.REAR_MIRROR
}
return zone in road_zones


# 实际测试
if __name__ == "__main__":
classifier = GazeZoneClassifier()

# 测试场景
test_cases = [
GazeVector(yaw=0, pitch=0, confidence=0.9), # 正前方
GazeVector(yaw=-30, pitch=5, confidence=0.85), # 左后视镜
GazeVector(yaw=30, pitch=5, confidence=0.85), # 右后视镜
GazeVector(yaw=0, pitch=-40, confidence=0.8), # 中控
GazeVector(yaw=40, pitch=0, confidence=0.75), # 副驾驶侧
]

print("=== 视线区域分类测试 ===")
for gaze in test_cases:
zone, conf = classifier.classify(gaze)
is_road = classifier.is_eyes_on_road(zone)
print(f"视线({gaze.yaw:+.0f}°, {gaze.pitch:+.0f}°) → {zone.value:15s} "
f"置信度:{conf:.2f} 道路:{is_road}")

1.3 注意力机制视线估计

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""
基于注意力机制的视线估计网络
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Tuple

class AttentionGazeNet(nn.Module):
"""
注意力机制视线估计网络

架构:
1. 面部特征提取(ResNet)
2. 眼部特征提取(专用分支)
3. 跨模态注意力融合
4. 视线回归头
"""

def __init__(self,
backbone: str = 'resnet18',
pretrained: bool = True):
super().__init__()

# 面部特征提取
if backbone == 'resnet18':
self.face_encoder = torch.hub.load(
'pytorch/vision:v0.10.0',
'resnet18',
pretrained=pretrained
)
feature_dim = 512
else:
raise ValueError(f"Unsupported backbone: {backbone}")

# 移除分类头
self.face_encoder.fc = nn.Identity()

# 眼部特征提取
self.eye_encoder = nn.Sequential(
nn.Conv2d(1, 32, kernel_size=3, stride=2, padding=1),
nn.BatchNorm2d(32),
nn.ReLU(),
nn.MaxPool2d(2),

nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.MaxPool2d(2),

nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
nn.BatchNorm2d(128),
nn.ReLU(),
nn.AdaptiveAvgPool2d(1),
)

# 跨模态注意力
self.cross_attention = nn.MultiheadAttention(
embed_dim=128,
num_heads=4,
batch_first=True
)

# 特征投影
self.face_proj = nn.Linear(feature_dim, 128)
self.eye_proj = nn.Linear(128, 128)

# 视线回归头
self.gaze_regressor = nn.Sequential(
nn.Linear(128 * 3, 256),
nn.ReLU(),
nn.Dropout(0.3),
nn.Linear(256, 64),
nn.ReLU(),
nn.Linear(64, 2) # (yaw, pitch)
)

# 置信度头
self.confidence_head = nn.Sequential(
nn.Linear(128 * 3, 64),
nn.ReLU(),
nn.Linear(64, 1),
nn.Sigmoid()
)

def forward(self,
face_image: torch.Tensor,
left_eye: torch.Tensor,
right_eye: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
"""
前向传播

Args:
face_image: 面部图像 (B, 3, 224, 224)
left_eye: 左眼图像 (B, 1, 64, 64)
right_eye: 右眼图像 (B, 1, 64, 64)

Returns:
gaze: (B, 2) 视线角度 (yaw, pitch)
confidence: (B, 1) 置信度
"""
batch_size = face_image.size(0)

# 1. 提取特征
face_feat = self.face_encoder(face_image) # (B, 512)
left_feat = self.eye_encoder(left_eye).squeeze(-1).squeeze(-1) # (B, 128)
right_feat = self.eye_encoder(right_eye).squeeze(-1).squeeze(-1) # (B, 128)

# 2. 投影到统一维度
face_feat = self.face_proj(face_feat) # (B, 128)
left_feat = self.eye_proj(left_feat)
right_feat = self.eye_proj(right_feat)

# 3. 堆叠为序列
features = torch.stack([face_feat, left_feat, right_feat], dim=1) # (B, 3, 128)

# 4. 跨模态注意力
attended, _ = self.cross_attention(features, features, features)

# 5. 展平
fused = attended.view(batch_size, -1) # (B, 384)

# 6. 回归视线角度
gaze = self.gaze_regressor(fused) # (B, 2)

# 7. 置信度
confidence = self.confidence_head(fused) # (B, 1)

return gaze, confidence

def get_gaze_vector(self,
face_image: torch.Tensor,
left_eye: torch.Tensor,
right_eye: torch.Tensor) -> GazeVector:
"""获取视线向量"""
with torch.no_grad():
gaze, conf = self.forward(face_image, left_eye, right_eye)

return GazeVector(
yaw=float(gaze[0, 0]),
pitch=float(gaze[0, 1]),
confidence=float(conf[0, 0])
)


# 实际测试
if __name__ == "__main__":
model = AttentionGazeNet(backbone='resnet18', pretrained=False)

# 模拟输入
face = torch.randn(4, 3, 224, 224)
left_eye = torch.randn(4, 1, 64, 64)
right_eye = torch.randn(4, 1, 64, 64)

# 前向传播
gaze, conf = model(face, left_eye, right_eye)

print(f"面部图像: {face.shape}")
print(f"眼部图像: {left_eye.shape}")
print(f"视线输出: {gaze.shape}")
print(f"置信度: {conf.shape}")

# 参数统计
total_params = sum(p.numel() for p in model.parameters())
print(f"总参数量: {total_params/1e6:.2f}M")

二、分心检测系统

2.1 Euro NCAP分心场景

场景 描述 检测时限 警告类型
D-01 视线短暂偏离(≤3s) — 不触发
D-02 视线长时间偏离(3-4s) ≤3s 一级警告
D-03 视线极端偏离(≥4s) ≤3s 二级警告
D-04 手机使用(腿部) ≤3s 一级警告
D-05 手机使用(耳边) ≤3s 二级警告

2.2 分心检测器实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
"""
Euro NCAP 2026分心检测器
"""

from dataclasses import dataclass
from typing import Tuple, Optional, List
from collections import deque
from enum import Enum
import time

class DistractionType(Enum):
"""分心类型"""
NONE = "none"
VISUAL = "visual" # 视觉分心
PHONE_LAP = "phone_lap" # 手机(腿部)
PHONE_EAR = "phone_ear" # 手机(耳边)
MANUAL = "manual" # 手动操作
COGNITIVE = "cognitive" # 认知分心

class WarningLevel(Enum):
"""警告等级"""
NONE = 0
LEVEL_1 = 1 # 一级警告
LEVEL_2 = 2 # 二级警告
EMERGENCY = 3 # 紧急

@dataclass
class DistractionEvent:
"""分心事件"""
type: DistractionType
start_time: float
duration: float
gaze_zone: GazeZone
phone_detected: bool
hand_position: Optional[str]

class DistractionDetector:
"""
Euro NCAP 2026分心检测器

检测规则:
1. 视线偏离道路≥3秒触发
2. 手机使用检测
3. 前置4秒道路注视条件
"""

def __init__(self):
# Euro NCAP阈值
self.WARNING_THRESHOLD = 3.0 # 秒
self.CRITICAL_THRESHOLD = 4.0 # 秒
self.FORWARD_REQUIREMENT = 4.0 # 秒

# 状态跟踪
self.forward_gaze_time = 0.0
self.distraction_start: Optional[float] = None
self.current_distraction: Optional[DistractionEvent] = None

# 历史记录
self.event_history: List[DistractionEvent] = []
self.gaze_history = deque(maxlen=300) # 10秒@30fps

# 手机检测状态
self.phone_detected = False
self.hand_position = None

def update(self,
gaze_zone: GazeZone,
phone_detected: bool = False,
hand_position: Optional[str] = None,
dt: float = 0.033) -> Tuple[WarningLevel, Optional[str]]:
"""
更新检测状态

Args:
gaze_zone: 当前视线区域
phone_detected: 是否检测到手机
hand_position: 手部位置
dt: 时间步长

Returns:
(warning_level, warning_type)
"""
current_time = time.time()

# 更新前方注视时间
if gaze_zone == GazeZone.FORWARD:
self.forward_gaze_time += dt

# 如果有正在进行的分心事件,结束它
if self.current_distraction is not None:
self.current_distraction.duration = current_time - self.current_distraction.start_time
self.event_history.append(self.current_distraction)
self.current_distraction = None

self.distraction_start = None
else:
# 检查是否满足前置条件
if self.forward_gaze_time >= self.FORWARD_REQUIREMENT:
# 开始计时分心
if self.distraction_start is None:
self.distraction_start = current_time
self.current_distraction = DistractionEvent(
type=DistractionType.VISUAL,
start_time=current_time,
duration=0.0,
gaze_zone=gaze_zone,
phone_detected=phone_detected,
hand_position=hand_position
)

self.forward_gaze_time = 0.0

# 记录历史
self.gaze_history.append(gaze_zone)

# 手机使用检测
if phone_detected:
return self._handle_phone_use(hand_position, current_time)

# 视觉分心检测
if self.distraction_start is not None:
duration = current_time - self.distraction_start

if duration >= self.CRITICAL_THRESHOLD:
return WarningLevel.LEVEL_2, "CRITICAL_DISTRACTION"
elif duration >= self.WARNING_THRESHOLD:
return WarningLevel.LEVEL_1, "PROLONGED_DISTRACTION"

return WarningLevel.NONE, None

def _handle_phone_use(self,
hand_position: Optional[str],
current_time: float) -> Tuple[WarningLevel, str]:
"""处理手机使用检测"""
if hand_position == 'ear':
# 打电话:立即二级警告
return WarningLevel.LEVEL_2, "PHONE_CALL"
elif hand_position in ['lap', 'texting']:
# 手机使用:一级警告
if self.distraction_start is not None:
duration = current_time - self.distraction_start
if duration >= self.WARNING_THRESHOLD:
return WarningLevel.LEVEL_1, "PHONE_USE"

return WarningLevel.NONE, None

def get_statistics(self) -> dict:
"""获取统计信息"""
if len(self.event_history) == 0:
return {
'total_events': 0,
'total_distraction_time': 0.0,
'avg_duration': 0.0,
}

total_time = sum(e.duration for e in self.event_history)
avg_duration = total_time / len(self.event_history)

return {
'total_events': len(self.event_history),
'total_distraction_time': total_time,
'avg_duration': avg_duration,
}


# 实际测试
if __name__ == "__main__":
detector = DistractionDetector()

print("=== 分心检测测试 ===")

# 场景1:正常驾驶
print("\n[场景1:正常驾驶 5秒]")
for i in range(150): # 5秒
level, warning = detector.update(GazeZone.FORWARD, dt=0.033)
if i % 50 == 0:
print(f" {i//30}s: 前方注视时间={detector.forward_gaze_time:.1f}s")

# 场景2:视线偏离
print("\n[场景2:视线偏离中控 5秒]")
for i in range(150):
level, warning = detector.update(GazeZone.CENTER_CONSOLE, dt=0.033)
if warning:
print(f" {i//30}s: ⚠️ 警告={warning}, 等级={level.name}")

# 场景3:手机使用
print("\n[场景3:手机打电话]")
detector = DistractionDetector()
detector.forward_gaze_time = 5.0 # 满足前置条件
level, warning = detector.update(GazeZone.UNKNOWN,
phone_detected=True,
hand_position='ear',
dt=0.033)
print(f" 结果: 警告={warning}, 等级={level.name}")

# 统计
print("\n=== 统计信息 ===")
stats = detector.get_statistics()
print(f"总事件数: {stats['total_events']}")
print(f"总分心时间: {stats['total_distraction_time']:.1f}s")

三、手机检测

3.1 检测方案

方案 准确率 实时性 鲁棒性
纯视觉 85% 30ms 中等
视觉+姿态 92% 40ms 较好
多模态融合 95% 50ms 最好

3.2 手机检测网络

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
手机使用检测网络
"""

import torch
import torch.nn as nn

class PhoneDetectionNet(nn.Module):
"""
手机使用检测网络

输入:
- 面部图像
- 手部关键点

输出:
- 手机存在概率
- 手部位置类别
"""

def __init__(self, num_hand_keypoints: int = 21):
super().__init__()

# 面部特征提取
self.face_encoder = nn.Sequential(
nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1),
nn.BatchNorm2d(32),
nn.ReLU(),
nn.MaxPool2d(2),

nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.MaxPool2d(2),

nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
nn.BatchNorm2d(128),
nn.ReLU(),
nn.AdaptiveAvgPool2d(1),
)

# 手部关键点编码
self.hand_encoder = nn.Sequential(
nn.Linear(num_hand_keypoints * 3, 128), # x, y, conf
nn.ReLU(),
nn.Dropout(0.3),
nn.Linear(128, 64),
nn.ReLU(),
)

# 融合层
self.fusion = nn.Sequential(
nn.Linear(128 + 64, 128),
nn.ReLU(),
nn.Dropout(0.3),
)

# 手机检测头
self.phone_head = nn.Sequential(
nn.Linear(128, 32),
nn.ReLU(),
nn.Linear(32, 1),
nn.Sigmoid()
)

# 手部位置分类头
self.position_head = nn.Sequential(
nn.Linear(128, 32),
nn.ReLU(),
nn.Linear(32, 4), # [none, lap, ear, texting]
)

def forward(self,
face_image: torch.Tensor,
hand_keypoints: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
"""
前向传播

Args:
face_image: (B, 3, H, W)
hand_keypoints: (B, 21, 3)

Returns:
phone_prob: (B, 1)
position_logits: (B, 4)
"""
# 提取特征
face_feat = self.face_encoder(face_image).squeeze(-1).squeeze(-1)
hand_feat = self.hand_encoder(hand_keypoints.view(hand_keypoints.size(0), -1))

# 融合
fused = torch.cat([face_feat, hand_feat], dim=1)
fused = self.fusion(fused)

# 输出
phone_prob = self.phone_head(fused)
position_logits = self.position_head(fused)

return phone_prob, position_logits


# 实际测试
if __name__ == "__main__":
model = PhoneDetectionNet(num_hand_keypoints=21)

# 模拟输入
face = torch.randn(4, 3, 128, 128)
hand = torch.randn(4, 21, 3)

phone_prob, pos_logits = model(face, hand)

print(f"手机概率: {phone_prob.shape}")
print(f"位置分类: {pos_logits.shape}")

四、IMS开发建议

4.1 技术选型

需求 推荐方案 理由
高精度视线 注意力机制网络 精度高、鲁棒性强
实时部署 MobileNet骨干+INT8量化 低延迟
手机检测 视觉+手部关键点融合 准确率高

4.2 Euro NCAP合规检查

检查项 要求 验证方法
[ ] 视线偏离检测 ≥3秒触发 台架测试
[ ] 手机使用检测 ≤3秒警告 实车测试
[ ] 误报率 <5% 长期测试
[ ] 遮挡鲁棒 墨镜/口罩 极端场景测试

五、总结

5.1 核心要点

  1. 视线追踪是Euro NCAP 2026核心技术
  2. 注意力机制提升精度和鲁棒性
  3. 手机检测需多模态融合
  4. 实时部署需要模型优化

5.2 技术趋势

  • 3D视线估计
  • 多任务学习(疲劳+分心+视线)
  • 跨域泛化

参考链接:

  • Springer: Driver Cognitive Distraction Detection
  • ScienceDirect: Multi-task driver gaze estimation
  • Euro NCAP 2026 Protocol

驾驶员视线追踪技术详解:注意力机制实现高精度分心检测
https://dapalm.com/2026/04/24/2026-04-24-gaze-tracking-distraction-detection/
作者
Mars
发布于
2026年4月24日
许可协议