EyeCue:视线-场景融合的认知分心检测

论文信息

核心创新

认知分心是最难检测的驾驶员状态——驾驶员“看起来在驾驶”,但思维已经游离。EyeCue通过视线-场景交互建模实现74.38%准确率的认知分心检测。

问题定义

| 分心类型 | 特征 | 检测难度 | EyeCue适用性 |
| --- | --- | --- | --- |
| 手动分心 | 手离开方向盘 | (原文缺失) | 不适用 |
| 视觉分心 | 视线偏离道路 | (原文缺失) | 不适用 |
| 认知分心 | 思维游离,视线正常 | (原文缺失) | 核心目标 |

核心洞察

认知分心虽然没有明显的物理动作,但会在视线-场景交互中留下痕迹:

  1. 视线停留模式异常:虽然看路,但停留时间不规律
  2. 场景理解缺失:视线扫过关键区域但未处理信息
  3. 时序一致性下降:长时间序列中注意力模式不稳定

方法详解

1. 整体架构

1
2
3
4
5
6
7
8
9
┌─────────────────────────────────────────────────────────┐
│ EyeCue架构 │
├─────────────────────────────────────────────────────────┤
│ 第一视角视频 → 视觉特征提取 → 场景表示 │
│ ↓ ↓ ↓ │
│ 眼动数据 → 注意力权重 → 视线-场景融合 │
│ ↓ ↓ ↓ │
│ 时间序列建模 → Transformer → 分心/正常分类 │
└─────────────────────────────────────────────────────────┘

2. 视线-场景融合模块

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
"""
EyeCue视线-场景融合模块

核心思想:认知分心时,视线虽然正常,但与场景内容的交互异常
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Tuple


class GazeSceneFusion(nn.Module):
    """Fuse scene features with gaze coordinates through cross-attention.

    Gaze (x, y) points are lifted to the scene feature width by a small MLP,
    the scene features attend over the encoded gaze sequence, and the attended
    result is concatenated with the scene features and projected back to
    ``feature_dim``.

    Inputs:
        scene_features: scene visual features, shape [B, T, D].
        gaze_sequence: gaze coordinates, shape [B, T, 2] (x, y).

    Output:
        fused_features: fused features, shape [B, T, D].
    """

    def __init__(self, feature_dim: int = 256, num_heads: int = 8):
        super().__init__()

        # Gaze encoder: (x, y) -> feature_dim.
        self.gaze_encoder = nn.Sequential(
            nn.Linear(2, 64),
            nn.ReLU(),
            nn.Linear(64, feature_dim),
        )

        # Cross-modal attention between the scene and gaze streams.
        self.cross_attention = nn.MultiheadAttention(
            embed_dim=feature_dim,
            num_heads=num_heads,
            batch_first=True,
        )

        # Fusion layer: [scene ; attended gaze] -> feature_dim.
        self.fusion_layer = nn.Sequential(
            nn.Linear(feature_dim * 2, feature_dim),
            nn.LayerNorm(feature_dim),
            nn.ReLU(),
        )

    def forward(
        self,
        scene_features: torch.Tensor,
        gaze_sequence: torch.Tensor,
    ) -> torch.Tensor:
        """Fuse the gaze sequence into the scene features.

        Args:
            scene_features: scene features, shape [B, T, D].
            gaze_sequence: gaze coordinates, shape [B, T, 2].

        Returns:
            Fused features, shape [B, T, D].
        """
        # Encode the gaze points.
        gaze_features = self.gaze_encoder(gaze_sequence)  # [B, T, D]

        # Scene is the query; gaze supplies keys and values. The attention
        # weights are never used, so skip computing them (need_weights=False
        # returns None in their place and leaves the output unchanged).
        attended, _ = self.cross_attention(
            query=scene_features,
            key=gaze_features,
            value=gaze_features,
            need_weights=False,
        )

        # Concatenate along the channel axis and project back to D.
        stacked = torch.cat([scene_features, attended], dim=-1)
        return self.fusion_layer(stacked)


class TemporalAttentionModel(nn.Module):
    """Temporal attention model over fused gaze-scene features.

    Pipeline: gaze-scene fusion -> positional encoding -> Transformer encoder
    -> mean pooling over time -> classification head.

    A fixed sinusoidal positional encoding is added before the encoder. Without
    it, every stage (cross-attention, self-attention, mean pooling) treats the
    frames as an unordered set, so the logits would not change if the frames
    were shuffled. The encoding has no learnable parameters, so the parameter
    count and ``state_dict`` keys are unaffected.
    """

    def __init__(
        self,
        feature_dim: int = 256,
        num_layers: int = 4,
        num_heads: int = 8,
        num_classes: int = 2,
    ):
        super().__init__()

        # Gaze-scene fusion.
        self.gaze_scene_fusion = GazeSceneFusion(feature_dim, num_heads)

        # Temporal Transformer encoder.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=feature_dim,
            nhead=num_heads,
            dim_feedforward=feature_dim * 4,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Classification head.
        self.classifier = nn.Sequential(
            nn.Linear(feature_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes),
        )

    @staticmethod
    def _positional_encoding(
        seq_len: int,
        dim: int,
        device: torch.device,
        dtype: torch.dtype,
    ) -> torch.Tensor:
        """Return a [seq_len, dim] sinusoidal positional encoding."""
        positions = torch.arange(seq_len, device=device, dtype=torch.float32).unsqueeze(1)
        even_idx = torch.arange(0, dim, 2, device=device, dtype=torch.float32)
        # Frequencies 1 / 10000^(2i / dim), one per sin/cos channel pair.
        inv_freq = torch.pow(10000.0, -even_idx / dim)
        angles = positions * inv_freq  # [seq_len, ceil(dim / 2)]

        encoding = torch.zeros(seq_len, dim, device=device, dtype=torch.float32)
        encoding[:, 0::2] = torch.sin(angles)
        # For odd dim there is one fewer cos channel than sin channels.
        encoding[:, 1::2] = torch.cos(angles[:, : dim // 2])
        return encoding.to(dtype)

    def forward(
        self,
        scene_features: torch.Tensor,
        gaze_sequence: torch.Tensor,
    ) -> torch.Tensor:
        """Classify a window of frames.

        Args:
            scene_features: scene features, shape [B, T, D].
            gaze_sequence: gaze sequence, shape [B, T, 2].

        Returns:
            Classification logits, shape [B, num_classes].
        """
        # Gaze-scene fusion.
        fused_features = self.gaze_scene_fusion(scene_features, gaze_sequence)

        # Inject frame order; broadcasts over the batch dimension.
        fused_features = fused_features + self._positional_encoding(
            fused_features.size(-2),
            fused_features.size(-1),
            fused_features.device,
            fused_features.dtype,
        )

        # Temporal modelling.
        temporal_features = self.transformer(fused_features)

        # Temporal pooling.
        pooled_features = temporal_features.mean(dim=1)  # [B, D]

        # Classification.
        return self.classifier(pooled_features)


# Smoke-test the model with random inputs.
if __name__ == "__main__":
    model = TemporalAttentionModel(feature_dim=256, num_layers=4, num_heads=8)
    # Inference check: disable dropout so the forward pass is deterministic.
    model.eval()

    # Simulated inputs.
    batch_size = 4
    seq_len = 30  # 1 second at 30 fps
    feature_dim = 256

    scene_features = torch.randn(batch_size, seq_len, feature_dim)
    gaze_sequence = torch.rand(batch_size, seq_len, 2)  # normalised to [0, 1]

    # Forward pass; no gradients are needed for a shape check.
    with torch.no_grad():
        logits = model(scene_features, gaze_sequence)

    print("=" * 60)
    print("EyeCue模型配置")
    print("=" * 60)
    print(f"输入序列长度: {seq_len}帧")
    print(f"特征维度: {feature_dim}")
    print(f"输出形状: {logits.shape}")
    print(f"参数量: {sum(p.numel() for p in model.parameters())/1e6:.2f}M")

3. CogDrive数据集

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
"""
CogDrive数据集

EyeCue创建的认知分心数据集
"""

import json
from dataclasses import dataclass
from typing import Dict, List, Tuple


@dataclass
class CogDriveSample:
    """One CogDrive sample: a video clip, its gaze track and its labels."""

    video_path: str
    gaze_sequence: List[Tuple[float, float]]  # gaze sequence
    label: int  # 0 = normal, 1 = cognitively distracted
    scenario: str  # scenario type
    weather: str
    time_of_day: str
    road_type: str


class CogDriveDataset:
    """CogDrive dataset container.

    Features described by the source article:
    1. Multi-scenario coverage (different roads / weather / time of day).
    2. Cognitive-distraction labels.
    3. Synchronised gaze data.
    """

    def __init__(self, data_path: str):
        self.samples = self._load_samples(data_path)

        # Per-scenario and per-weather sample counts. The known categories are
        # always present (zero when no sample has them).
        self.scenario_stats = {
            'highway': 0,
            'urban': 0,
            'rural': 0
        }

        self.weather_stats = {
            'sunny': 0,
            'cloudy': 0,
            'rainy': 0,
            'night': 0
        }

        self._refresh_stats()

    def _refresh_stats(self) -> None:
        """Recount ``scenario_stats`` and ``weather_stats`` from ``self.samples``.

        Categories not in the predefined key sets are added as they are seen.
        """
        scenario_counts = {'highway': 0, 'urban': 0, 'rural': 0}
        weather_counts = {'sunny': 0, 'cloudy': 0, 'rainy': 0, 'night': 0}
        for sample in self.samples:
            scenario_counts[sample.scenario] = scenario_counts.get(sample.scenario, 0) + 1
            weather_counts[sample.weather] = weather_counts.get(sample.weather, 0) + 1
        self.scenario_stats = scenario_counts
        self.weather_stats = weather_counts

    def _load_samples(self, path: str) -> "List[CogDriveSample]":
        """Load samples from ``path``.

        Placeholder: a real implementation must read the data files. It
        currently ignores ``path`` and returns an empty list.
        """
        return []

    def get_class_distribution(self) -> Dict[str, int]:
        """Return the number of normal (label 0) and distracted (label 1) samples."""
        counts = {'normal': 0, 'distracted': 0}
        for sample in self.samples:
            if sample.label == 0:
                counts['normal'] += 1
            elif sample.label == 1:
                counts['distracted'] += 1
        return counts

    def get_cross_scenario_accuracy(self) -> Dict[str, float]:
        """Return cross-scenario accuracies in percent.

        These are fixed figures quoted from the article (EyeCue paper: 70%+
        cross-scenario accuracy); they are not computed from ``self.samples``.
        """
        return {
            'highway': 74.5,
            'urban': 72.3,
            'rural': 71.8,
            'sunny': 75.2,
            'cloudy': 73.1,
            'rainy': 70.5,
            'night': 72.8
        }


# Dataset statistics report.
if __name__ == "__main__":
    dataset = CogDriveDataset("/path/to/cogdrive")

    print("=" * 60)
    print("CogDrive数据集统计")
    print("=" * 60)
    print("来源数据集:4个驾驶数据集 + 认知分心标注")
    print("场景覆盖:高速/城市/乡村")
    print("天气条件:晴/阴/雨/夜")

    # Accuracy per scenario / weather condition, in percent.
    acc = dataset.get_cross_scenario_accuracy()
    print("\n跨场景准确率:")
    for scenario, accuracy in acc.items():
        print(f" {scenario}: {accuracy:.1f}%")

实验结果

主要结果

指标 EyeCue Baseline最佳 提升
总体准确率 74.38% 67.1% +7.28%
跨场景准确率 70%+ - 强泛化
参数量 ~15M - 轻量级

消融实验

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
def ablation_study() -> None:
    """Print the reported ablation accuracies and the headline gain.

    The gain in the conclusion line is the full model's accuracy minus the
    visual-features-only accuracy, taken from the table so the two cannot
    drift apart.
    """
    full_model = 'EyeCue (完整)'
    visual_only = '仅视觉特征'

    # Accuracy in percent for each configuration.
    results = {
        full_model: 74.38,
        visual_only: 68.2,
        '仅视线特征': 62.5,
        '无跨模态注意力': 70.1,
        '无时序建模': 71.3
    }

    print("消融实验结果:")
    for config, acc in results.items():
        print(f" {config}: {acc:.2f}%")

    fusion_gain = results[full_model] - results[visual_only]
    print(f"\n结论:视线-场景融合贡献最大 (+{fusion_gain:.2f}%)")


ablation_study()

IMS开发启示

1. 认知分心检测方案

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# Cognitive distraction detection system design

class CognitiveDistractionSystem:
    """Cognitive distraction detection system.

    Integrates the EyeCue idea into an IMS: a visual encoder produces scene
    features, and a classifier turns scene features plus gaze data into class
    logits, from which a distraction probability is derived.

    The classifier must return logits whose last axis indexes the classes,
    with class 1 meaning "cognitively distracted". A feature-fusion module
    alone is not enough here: it returns a feature map, which cannot be
    thresholded or converted to a single confidence value.
    """

    def __init__(
        self,
        visual_encoder=None,
        gaze_tracker=None,
        classifier=None,
        threshold: float = 0.5,
    ):
        """Build the system, creating default components where none are given.

        Args:
            visual_encoder: callable mapping a video frame/window to scene
                features; defaults to ``LightweightVisualEncoder()``.
            gaze_tracker: gaze tracking component; defaults to ``GazeTracker()``.
            classifier: callable ``(scene_features, gaze_features) -> logits``;
                defaults to ``TemporalAttentionModel()``.
            threshold: distraction probability above which the driver is
                reported as distracted.
        """
        # Visual feature extraction (lightweight).
        self.visual_encoder = (
            LightweightVisualEncoder() if visual_encoder is None else visual_encoder
        )

        # Gaze tracking.
        self.gaze_tracker = GazeTracker() if gaze_tracker is None else gaze_tracker

        # Fusion + classification.
        self.classifier = TemporalAttentionModel() if classifier is None else classifier
        # Put module-like classifiers in inference mode so dropout is disabled.
        switch_to_eval = getattr(self.classifier, "eval", None)
        if callable(switch_to_eval):
            switch_to_eval()

        self.threshold = threshold

    def detect(self, video_frame, gaze_data) -> dict:
        """Detect cognitive distraction for one observation window.

        Args:
            video_frame: input passed to the visual encoder.
                NOTE(review): the default classifier expects the encoder to
                yield [1, T, D] features -- confirm against the encoder.
            gaze_data: gaze input passed to the classifier and to the
                indicator analysis.

        Returns:
            {
                'is_distracted': bool,
                'confidence': float,   # probability of class 1 (distracted)
                'indicators': list
            }

        Raises:
            ValueError: if the classifier output covers more than one sample.
        """
        import torch  # local import: this snippet has no module-level torch import

        # Extract visual features.
        scene_features = self.visual_encoder(video_frame)

        # Encode gaze.
        gaze_features = self._encode_gaze(gaze_data)

        # Classify without tracking gradients.
        with torch.no_grad():
            logits = self.classifier(scene_features, gaze_features)

        # Probability of the "distracted" class (index 1).
        distracted_prob = torch.softmax(logits, dim=-1)[..., 1]
        if distracted_prob.numel() != 1:
            raise ValueError(
                "detect() expects a single sample; got classifier output "
                f"with shape {tuple(logits.shape)}"
            )
        confidence = float(distracted_prob)

        # Distraction indicators.
        indicators = self._analyze_indicators(gaze_data)

        return {
            'is_distracted': confidence > self.threshold,
            'confidence': confidence,
            'indicators': indicators
        }

    def _encode_gaze(self, gaze_data):
        """Encode gaze features; currently returns ``gaze_data`` unchanged."""
        return gaze_data

    def _analyze_indicators(self, gaze_data) -> list:
        """Analyse cognitive-distraction indicators.

        Intended indicators:
        1. Abnormal saccade patterns
        2. Abnormal fixation durations
        3. Blink-rate changes

        Placeholder: currently returns an empty list. A real implementation
        needs more elaborate feature engineering.
        """
        indicators = []

        return indicators

2. 部署优先级

优先级 功能 技术方案 时间
P0 基础分心检测 视线追踪 已有
P1 认知分心检测 EyeCue架构 2026 Q3
P2 多模态融合 视线+场景 2026 Q4

3. 开发检查清单

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
## 认知分心检测检查清单

### 数据准备
- [ ] 收集认知分心场景数据
- [ ] 视线数据标注
- [ ] 多场景覆盖测试

### 算法实现
- [ ] 视线-场景融合模块
- [ ] 时序注意力模型
- [ ] 轻量化优化

### 验证测试
- [ ] 准确率≥70%
- [ ] 跨场景泛化
- [ ] 实时性能≥15fps

参考资料

  1. 论文: EyeCue: Driver Cognitive Distraction Detection
  2. 代码: GitHub - EyeCue
  3. IJCAI 2026 Proceedings

https://dapalm.com/2026/06/07/2026-06-07-EyeCue-Cognitive-Distraction-Detection/
作者
Mars
发布于
2026年6月7日
许可协议