EyeCue：视线-场景融合的认知分心检测

论文信息

标题： EyeCue: Driver Cognitive Distraction Detection via Gaze-Empowered Egocentric Video Understanding
会议： IJCAI 2026
作者： Lang Zhang et al.
链接： https://arxiv.org/abs/2605.07859
代码： https://github.com/langzhang2000/EyeCue

核心创新

认知分心是最难检测的驾驶员状态——驾驶员“看起来在驾驶”，但思维已经游离。EyeCue通过视线-场景交互建模实现74.38%准确率的认知分心检测。

问题定义

分心类型	特征	检测难度	EyeCue适用性
手动分心	手离开方向盘	低	不适用
视觉分心	视线偏离道路	中	不适用
认知分心	思维游离，视线正常	高	核心目标

核心洞察

认知分心虽然没有明显的物理动作，但会在视线-场景交互中留下痕迹：

视线停留模式异常：虽然看路，但停留时间不规律
场景理解缺失：视线扫过关键区域但未处理信息
时序一致性下降：长时间序列中注意力模式不稳定

方法详解

1. 整体架构

┌─────────────────────────────────────────────────────────┐
│                   EyeCue架构                            │
├─────────────────────────────────────────────────────────┤
│  第一视角视频 → 视觉特征提取 → 场景表示               │
│       ↓              ↓                ↓                │
│  眼动数据   →  注意力权重  →  视线-场景融合          │
│       ↓              ↓                ↓                │
│  时间序列建模 → Transformer → 分心/正常分类          │
└─────────────────────────────────────────────────────────┘

2. 视线-场景融合模块

"""
EyeCue视线-场景融合模块

核心思想：认知分心时，视线虽然正常，但与场景内容的交互异常
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Tuple


class GazeSceneFusion(nn.Module):
    """
    Gaze-scene fusion module.

    The 2-D gaze coordinates are embedded by a small MLP, the scene features
    attend over that gaze embedding, and the attended result is concatenated
    with the scene features and projected back to ``feature_dim``.

    Inputs (shapes as documented by the original author):
    - scene_features: visual scene features [B, T, D]
    - gaze_sequence: gaze track [B, T, 2] (x, y coordinates)

    Output:
    - fused features [B, T, D]
    """

    def __init__(self, feature_dim: int = 256, num_heads: int = 8):
        super().__init__()

        # Gaze encoder: (x, y) -> 64 -> feature_dim.
        gaze_layers = [
            nn.Linear(2, 64),
            nn.ReLU(),
            nn.Linear(64, feature_dim),
        ]
        self.gaze_encoder = nn.Sequential(*gaze_layers)

        # Cross-modal attention operating on batch-first tensors.
        self.cross_attention = nn.MultiheadAttention(
            feature_dim, num_heads, batch_first=True
        )

        # Fusion head: [scene ; attended] (2 * D) -> D, then LayerNorm + ReLU.
        fusion_layers = [
            nn.Linear(2 * feature_dim, feature_dim),
            nn.LayerNorm(feature_dim),
            nn.ReLU(),
        ]
        self.fusion_layer = nn.Sequential(*fusion_layers)

    def forward(
        self,
        scene_features: torch.Tensor,
        gaze_sequence: torch.Tensor
    ) -> torch.Tensor:
        """
        Fuse scene features with the gaze track.

        Args:
            scene_features: scene features [B, T, D]
            gaze_sequence: gaze coordinates [B, T, 2]

        Returns:
            Fused features [B, T, D]
        """
        gaze_embedding = self.gaze_encoder(gaze_sequence)  # [B, T, D]

        # Scene features are the queries; the gaze embedding serves as both
        # keys and values. The attention weights (second item) are not used.
        attended = self.cross_attention(
            scene_features, gaze_embedding, gaze_embedding
        )[0]

        stacked = torch.cat((scene_features, attended), dim=-1)
        return self.fusion_layer(stacked)


class TemporalAttentionModel(nn.Module):
    """
    Temporal attention model.

    Pipeline: gaze-scene fusion -> Transformer encoder over the time axis ->
    mean pooling over time -> MLP classification head.

    NOTE(review): no positional encoding is added before the Transformer, so
    together with the mean pooling the logits look insensitive to frame
    order — confirm whether that is intended.
    """

    def __init__(
        self,
        feature_dim: int = 256,
        num_layers: int = 4,
        num_heads: int = 8,
        num_classes: int = 2
    ):
        super().__init__()

        # Gaze-scene fusion front end.
        self.gaze_scene_fusion = GazeSceneFusion(feature_dim, num_heads)

        # Temporal Transformer; the feed-forward width is 4x the model width.
        feedforward_dim = feature_dim * 4
        layer_template = nn.TransformerEncoderLayer(
            d_model=feature_dim,
            nhead=num_heads,
            dim_feedforward=feedforward_dim,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(
            layer_template, num_layers=num_layers
        )

        # Classification head: D -> 128 -> num_classes with dropout 0.3.
        head_layers = [
            nn.Linear(feature_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(
        self,
        scene_features: torch.Tensor,
        gaze_sequence: torch.Tensor
    ) -> torch.Tensor:
        """
        Forward pass.

        Args:
            scene_features: scene features [B, T, D]
            gaze_sequence: gaze track [B, T, 2]

        Returns:
            Classification logits [B, num_classes]
        """
        fused = self.gaze_scene_fusion(scene_features, gaze_sequence)
        encoded = self.transformer(fused)

        # Collapse the time axis by averaging: [B, T, D] -> [B, D].
        clip_embedding = torch.mean(encoded, dim=1)

        return self.classifier(clip_embedding)


# Smoke test: build the model and push one random batch through it.
if __name__ == "__main__":
    # 30 frames correspond to 1 second at 30 fps.
    batch_size, seq_len, feature_dim = 4, 30, 256

    model = TemporalAttentionModel(
        feature_dim=feature_dim, num_layers=4, num_heads=8
    )

    # Synthetic inputs; torch.rand draws the gaze coordinates from [0, 1).
    scene_features = torch.randn(batch_size, seq_len, feature_dim)
    gaze_sequence = torch.rand(batch_size, seq_len, 2)

    logits = model(scene_features, gaze_sequence)

    rule = "=" * 60
    total_params = sum(p.numel() for p in model.parameters())

    print(rule)
    print("EyeCue模型配置")
    print(rule)
    print(f"输入序列长度: {seq_len}帧")
    print(f"特征维度: {feature_dim}")
    print(f"输出形状: {logits.shape}")
    print(f"参数量: {total_params/1e6:.2f}M")

3. CogDrive数据集

"""
CogDrive数据集

EyeCue创建的认知分心数据集
"""

import json
from dataclasses import dataclass
from typing import Dict, List, Tuple


@dataclass
class CogDriveSample:
    """One CogDrive sample: a video path, its gaze sequence, a label and scene metadata."""
    video_path: str  # path to the sample's video
    gaze_sequence: List[Tuple[float, float]]  # gaze sequence as (float, float) pairs
    label: int  # 0 = normal, 1 = cognitively distracted
    scenario: str  # scenario type
    weather: str
    time_of_day: str
    road_type: str


class CogDriveDataset:
    """
    CogDrive dataset wrapper.

    Characteristics listed by the original author:
    1. Multi-scenario coverage (different roads / weather / time of day)
    2. Cognitive-distraction labels
    3. Synchronised gaze data

    Attributes:
        samples: list returned by ``_load_samples``.
        scenario_stats: number of samples per ``scenario`` value.
        weather_stats: number of samples per ``weather`` value.
    """

    # Categories that are always reported, even with a count of zero.
    _SCENARIO_KEYS = ('highway', 'urban', 'rural')
    _WEATHER_KEYS = ('sunny', 'cloudy', 'rainy', 'night')

    def __init__(self, data_path: str):
        """Load the samples from ``data_path`` and tally per-condition counts."""
        self.samples = self._load_samples(data_path)

        # Previously these dictionaries were created with zeros and never
        # updated; they are now derived from the loaded samples.
        # NOTE(review): assumes scenario_stats is keyed by ``sample.scenario``
        # and weather_stats by ``sample.weather`` — confirm against the data.
        self.scenario_stats = self._count_by('scenario', self._SCENARIO_KEYS)
        self.weather_stats = self._count_by('weather', self._WEATHER_KEYS)

    def _count_by(self, attribute: str, known_keys) -> Dict[str, int]:
        """Count ``self.samples`` by the value of ``attribute``.

        Every key in ``known_keys`` is present in the result (zero if unseen);
        values outside ``known_keys`` are counted under their own key.
        """
        counts = {key: 0 for key in known_keys}
        for sample in self.samples:
            value = getattr(sample, attribute)
            counts[value] = counts.get(value, 0) + 1
        return counts

    def _load_samples(self, path: str) -> "List[CogDriveSample]":
        """Load the samples.

        Placeholder: a real implementation has to read the data files under
        ``path``. For now it always returns an empty list.
        """
        return []

    def get_class_distribution(self) -> Dict[str, int]:
        """Return the number of normal (label 0) and distracted (label 1) samples.

        Samples carrying any other label are ignored.
        """
        distribution = {'normal': 0, 'distracted': 0}
        for sample in self.samples:
            if sample.label == 0:
                distribution['normal'] += 1
            elif sample.label == 1:
                distribution['distracted'] += 1
        return distribution

    def get_cross_scenario_accuracy(self) -> Dict[str, float]:
        """
        Return cross-scenario accuracies in percent.

        The figures are hard-coded; they are not computed from
        ``self.samples``. The original note attributes "70%+ cross-scenario
        accuracy" to the EyeCue paper.
        """
        return {
            'highway': 74.5,
            'urban': 72.3,
            'rural': 71.8,
            'sunny': 75.2,
            'cloudy': 73.1,
            'rainy': 70.5,
            'night': 72.8
        }


# Dataset statistics report.
if __name__ == "__main__":
    cogdrive = CogDriveDataset("/path/to/cogdrive")

    rule = "=" * 60
    header_lines = (
        rule,
        "CogDrive数据集统计",
        rule,
        "来源数据集：4个驾驶数据集 + 认知分心标注",
        "场景覆盖：高速/城市/乡村",
        "天气条件：晴/阴/雨/夜",
    )
    for line in header_lines:
        print(line)

    # Per-condition accuracies, one indented line each.
    per_condition = cogdrive.get_cross_scenario_accuracy()
    print("\n跨场景准确率:")
    for condition in per_condition:
        print(f"  {condition}: {per_condition[condition]:.1f}%")

实验结果

主要结果

指标	EyeCue	Baseline最佳	提升
总体准确率	74.38%	67.1%	+7.28%
跨场景准确率	70%+	-	强泛化
参数量	~15M	-	轻量级

消融实验

def ablation_study():
    """Print the ablation results and return them.

    Returns:
        dict mapping each configuration name to its accuracy in percent.
    """

    results = {
        'EyeCue (完整)': 74.38,
        '仅视觉特征': 68.2,
        '仅视线特征': 62.5,
        '无跨模态注意力': 70.1,
        '无时序建模': 71.3
    }

    print("消融实验结果:")
    for config, acc in results.items():
        print(f"  {config}: {acc:.2f}%")

    # Gain of the full model over the vision-only variant. It is derived from
    # the table so the printed conclusion cannot drift from the numbers above.
    fusion_gain = results['EyeCue (完整)'] - results['仅视觉特征']
    print(f"\n结论：视线-场景融合贡献最大 ({fusion_gain:+.2f}%)")

    return results

ablation_study()

IMS开发启示

1. 认知分心检测方案

# 认知分心检测系统设计

class CognitiveDistractionSystem:
    """
    Cognitive-distraction detection system.

    Integrates the EyeCue idea into an IMS: a visual encoder produces scene
    features, which are classified together with the gaze data.
    """

    def __init__(self):
        # Visual feature extraction (lightweight); defined outside this snippet.
        self.visual_encoder = LightweightVisualEncoder()

        # Gaze tracking; defined outside this snippet.
        self.gaze_tracker = GazeTracker()

        # End-to-end classifier. GazeSceneFusion on its own only returns fused
        # [B, T, D] features, which can neither be thresholded into a single
        # decision nor converted to one float. TemporalAttentionModel wraps
        # the fusion module and adds the temporal encoder and the
        # classification head, returning per-clip logits.
        self.classifier = TemporalAttentionModel()
        # Inference only: disable the dropout inside the model.
        self.classifier.eval()

    def detect(self, video_frame, gaze_data) -> dict:
        """
        Detect cognitive distraction for one clip.

        Args:
            video_frame: input passed to the visual encoder.
                NOTE(review): the encoder output is assumed to be [1, T, D],
                matching what the classifier expects — TODO confirm.
            gaze_data: gaze input; assumed to be [1, T, 2] — TODO confirm.

        Returns:
            {
                'is_distracted': bool,
                'confidence': float,  # probability of the distracted class
                'indicators': list
            }

        Raises:
            ValueError: if the classifier yields more than one prediction,
                i.e. the input was not a single clip.
        """
        import torch  # local import keeps this snippet self-contained

        # Extract visual features.
        scene_features = self.visual_encoder(video_frame)

        # Encode gaze.
        gaze_features = self._encode_gaze(gaze_data)

        # Fuse and classify without tracking gradients.
        with torch.no_grad():
            logits = self.classifier(scene_features, gaze_features)
            # Class index 1 is "cognitively distracted", following the
            # 0 = normal / 1 = distracted convention of CogDriveSample.label.
            distracted_prob = torch.softmax(logits, dim=-1)[..., 1]

        if distracted_prob.numel() != 1:
            raise ValueError(
                "detect() expects a single clip (batch size 1), got "
                f"{distracted_prob.numel()} predictions"
            )
        confidence = float(distracted_prob)

        # Distraction indicators.
        indicators = self._analyze_indicators(gaze_data)

        return {
            'is_distracted': confidence > 0.5,
            'confidence': confidence,
            'indicators': indicators
        }

    def _encode_gaze(self, gaze_data):
        """Encode gaze features; currently returns ``gaze_data`` unchanged."""
        return gaze_data

    def _analyze_indicators(self, gaze_data) -> list:
        """
        Analyse cognitive-distraction indicators.

        Planned indicators (none is implemented yet; an empty list is returned):
        1. Abnormal saccade pattern
        2. Abnormal fixation duration
        3. Change in blink rate
        """
        indicators = []

        # Simplified placeholder; a real implementation needs more elaborate
        # feature engineering.

        return indicators

2. 部署优先级

优先级	功能	技术方案	时间
P0	基础分心检测	视线追踪	已有
P1	认知分心检测	EyeCue架构	2026 Q3
P2	多模态融合	视线+场景	2026 Q4

3. 开发检查清单

## 认知分心检测检查清单

### 数据准备
- [ ] 收集认知分心场景数据
- [ ] 视线数据标注
- [ ] 多场景覆盖测试

### 算法实现
- [ ] 视线-场景融合模块
- [ ] 时序注意力模型
- [ ] 轻量化优化

### 验证测试
- [ ] 准确率≥70%
- [ ] 跨场景泛化
- [ ] 实时性能≥15fps

参考资料

论文: EyeCue: Driver Cognitive Distraction Detection
代码: GitHub - EyeCue
IJCAI 2026 Proceedings

https://dapalm.com/2026/06/07/2026-06-07-EyeCue-Cognitive-Distraction-Detection/

作者

Mars

发布于

2026年6月7日

许可协议