FIFA:细粒度帧间注意力驾驶员视线估计论文解读与代码复现

FIFA:细粒度帧间注意力驾驶员视线估计论文解读与代码复现

论文信息

  • 标题: FIFA: Fine-grained Inter-frame Attention for Driver’s Video Gaze Estimation
  • 会议: CVPR 2025
  • 作者: Hu D., Cui M., Huang K.
  • 链接: CVPR 2025 Paper

核心创新

FIFA 首次提出细粒度帧间注意力机制,显式建模视频中瞳孔位移变化,解决传统方法无法捕获动态眼动模式的问题。

核心贡献:

  1. 双流深度学习框架
  2. 细粒度帧间注意力(FIFA)机制
  3. 瞳孔位移显式建模
  4. 自适应权重调整视线嵌入

问题背景

传统方法的局限

| 方法类型 | 局限性 |
| --- | --- |
| 单帧图像方法 | 无法捕获时序眼动模式 |
| 视频方法(简单时序融合) | 忽略瞳孔位移的细微变化 |
| LSTM/GRU 方法 | 长序列梯度消失 |

驾驶场景挑战

  • 动态瞳孔演化: 视线变化通过瞳孔位移体现
  • 静态背景: 头部姿态信息需从相对静态的背景提取
  • 实时性要求: 车载部署需低延迟

方法详解

整体架构

        视频帧序列 (T, C, H, W)
                  ↓
          ┌───────────────┐
          │   双流框架    │
          └───────────────┘
             ↓         ↓
          帧间流     静态流
        (瞳孔位移) (头姿背景)
             ↓         ↓
        FIFA 注意力  特征提取
             ↓         ↓
          ┌───────────────┐
          │   权重生成    │
          └───────────────┘
                  ↓
          调整后的视线嵌入
                  ↓
               视线预测

1. 细粒度帧间注意力(FIFA)

核心思想: 通过注意力机制显式建模相邻帧之间的瞳孔位移。

import math
from typing import List

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

class FinegrainedInterFrameAttention(nn.Module):
    """Multi-head self-attention over frames, conditioned on pupil displacement.

    The displacement between consecutive pupil positions is embedded by a
    small MLP and added to the frame features; standard scaled dot-product
    attention is then applied along the time axis.

    Args:
        feature_dim: Size of each frame feature vector. Must be divisible by
            ``num_heads``.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``feature_dim``.
    """

    def __init__(
        self,
        feature_dim: int = 256,
        num_heads: int = 8,
        dropout: float = 0.1
    ):
        super().__init__()

        # Fail early: otherwise head_dim is silently truncated and the
        # reshape in forward() fails with an unrelated-looking size error.
        if num_heads <= 0 or feature_dim % num_heads != 0:
            raise ValueError(
                f"feature_dim ({feature_dim}) must be divisible by "
                f"num_heads ({num_heads})"
            )

        self.feature_dim = feature_dim
        self.num_heads = num_heads
        self.head_dim = feature_dim // num_heads

        # Query / key / value projections.
        self.q_proj = nn.Linear(feature_dim, feature_dim)
        self.k_proj = nn.Linear(feature_dim, feature_dim)
        self.v_proj = nn.Linear(feature_dim, feature_dim)

        # Output projection applied after the heads are merged.
        self.out_proj = nn.Linear(feature_dim, feature_dim)

        # Embeds a 2-D displacement vector into the feature space.
        self.displacement_encoder = nn.Sequential(
            nn.Linear(2, 64),
            nn.ReLU(),
            nn.Linear(64, feature_dim)
        )

        self.dropout = nn.Dropout(dropout)
        self.scale = math.sqrt(self.head_dim)

    def _split_heads(self, x: torch.Tensor) -> torch.Tensor:
        """Reshape (B, T, D) into (B, num_heads, T, head_dim)."""
        B, T, _ = x.shape
        return x.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(
        self,
        frame_features: torch.Tensor,
        pupil_positions: torch.Tensor
    ) -> torch.Tensor:
        """Apply displacement-conditioned self-attention across frames.

        Args:
            frame_features: Per-frame features, shape (B, T, D).
            pupil_positions: Per-frame pupil positions, shape (B, T, 2).

        Returns:
            Attended features, shape (B, T, D).
        """
        B, T, D = frame_features.shape

        # Frame-to-frame displacement; the first frame has no predecessor,
        # so a zero row is prepended along the time axis to keep length T.
        displacement = pupil_positions[:, 1:] - pupil_positions[:, :-1]
        displacement = F.pad(displacement, (0, 0, 1, 0), value=0)

        # Inject the displacement embedding into the frame features.
        enhanced_features = frame_features + self.displacement_encoder(displacement)

        Q = self._split_heads(self.q_proj(enhanced_features))
        K = self._split_heads(self.k_proj(enhanced_features))
        V = self._split_heads(self.v_proj(enhanced_features))

        # Scaled dot-product attention over the time axis.
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale
        attn_weights = self.dropout(F.softmax(attn_scores, dim=-1))

        # Weighted sum of values, then merge the heads back into D.
        attn_output = torch.matmul(attn_weights, V)
        attn_output = attn_output.transpose(1, 2).contiguous().view(B, T, D)

        return self.out_proj(attn_output)


class PupilDisplacementExtractor(nn.Module):
    """Regress a 2-D pupil position for every frame of a clip.

    Despite the class name, this module outputs per-frame positions;
    displacements between frames are computed by the consumer.
    A small (simplified) CNN encodes each frame and an MLP regresses two
    values per frame, labelled (x, y) by the author.
    NOTE(review): the coordinate system / units of the output are not fixed
    by this code -- confirm against the training targets.
    """

    def __init__(self):
        super().__init__()

        # Simplified eye-region encoder: three conv stages, pooled to 4x4.
        self.eye_detector = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, 3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((4, 4))
        )

        # Regression head mapping the pooled features to two coordinates.
        self.pupil_regressor = nn.Sequential(
            nn.Linear(128 * 4 * 4, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 2)
        )

    def forward(self, frames: torch.Tensor) -> torch.Tensor:
        """Predict pupil positions for a batch of clips.

        Args:
            frames: Video frames, shape (B, T, C, H, W).

        Returns:
            Pupil positions, shape (B, T, 2).
        """
        B, T = frames.shape[:2]

        # Fold time into the batch axis. reshape (not view) so that
        # non-contiguous clips, e.g. produced by transpose/permute, work.
        frames_flat = frames.reshape(B * T, *frames.shape[2:])

        eye_features = self.eye_detector(frames_flat).flatten(start_dim=1)
        pupil_positions = self.pupil_regressor(eye_features)

        return pupil_positions.view(B, T, 2)

2. 双流框架实现

class DualStreamGazeEstimator(nn.Module):
    """Two-stream video gaze estimator.

    Stream 1 (inter-frame): per-frame backbone features are passed, together
    with predicted pupil positions, through ``FinegrainedInterFrameAttention``
    and then gated element-wise by a learned sigmoid weight.
    Stream 2 (static): the temporal mean of the clip is encoded by a second
    backbone.
    The time-pooled stream-1 features and the stream-2 features are
    concatenated, fused, and mapped to two outputs labelled (yaw, pitch).

    Args:
        backbone: Backbone name; only ``'resnet18'`` is supported.
        feature_dim: Width of the feature vectors used by both streams.
        num_heads: Number of attention heads in the FIFA module.
        num_frames: Stored on the instance; the clip length actually used is
            taken from the input tensor.
        pretrained: Whether to load ImageNet weights for the backbones.
    """

    def __init__(
        self,
        backbone: str = 'resnet18',
        feature_dim: int = 256,
        num_heads: int = 8,
        num_frames: int = 16,
        pretrained: bool = True
    ):
        super().__init__()

        self.num_frames = num_frames

        # ---------- Stream 1: inter-frame stream ----------
        self.pupil_extractor = PupilDisplacementExtractor()

        # The backbone projection must match feature_dim, otherwise the
        # attention module receives features of the wrong width.
        self.frame_encoder = self._build_backbone(backbone, pretrained, feature_dim)

        self.fifa_attention = FinegrainedInterFrameAttention(
            feature_dim=feature_dim,
            num_heads=num_heads
        )

        # Per-channel gate in (0, 1) computed from the attended features.
        self.weight_generator = nn.Sequential(
            nn.Linear(feature_dim, feature_dim // 2),
            nn.ReLU(),
            nn.Linear(feature_dim // 2, feature_dim),
            nn.Sigmoid()
        )

        # ---------- Stream 2: static stream ----------
        self.static_encoder = self._build_backbone(backbone, pretrained, feature_dim)

        # Fuses the concatenated outputs of the two streams.
        self.temporal_fusion = nn.Sequential(
            nn.Linear(feature_dim * 2, feature_dim),
            nn.ReLU(),
            nn.Dropout(0.3)
        )

        # ---------- Gaze prediction head ----------
        self.gaze_head = nn.Sequential(
            nn.Linear(feature_dim, feature_dim // 2),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(feature_dim // 2, 2)
        )

    def _build_backbone(
        self,
        name: str,
        pretrained: bool,
        feature_dim: int = 256
    ) -> nn.Module:
        """Build a backbone that maps an image batch to ``feature_dim`` features.

        Args:
            name: Backbone name; only ``'resnet18'`` is supported.
            pretrained: Whether to load ImageNet weights.
            feature_dim: Output width of the final linear projection.

        Raises:
            ValueError: If ``name`` is not a supported backbone.
        """
        import torchvision.models as models

        if name != 'resnet18':
            raise ValueError(f"Unknown backbone: {name}")

        # Prefer the `weights=` API; `pretrained=` is deprecated in newer
        # torchvision. Fall back for older releases without the weights enum.
        weights_enum = getattr(models, "ResNet18_Weights", None)
        if weights_enum is not None:
            resnet = models.resnet18(
                weights=weights_enum.IMAGENET1K_V1 if pretrained else None
            )
        else:
            resnet = models.resnet18(pretrained=pretrained)

        # Drop the classification FC layer; keep everything up to global pooling.
        trunk = nn.Sequential(*list(resnet.children())[:-1])
        return nn.Sequential(
            trunk,
            nn.Flatten(),
            nn.Linear(512, feature_dim)
        )

    def forward(self, frames: torch.Tensor) -> dict:
        """Run both streams and predict gaze for each clip.

        Args:
            frames: Video clip batch, shape (B, T, C, H, W).

        Returns:
            A dict with keys:
                ``'gaze'``: predicted (yaw, pitch), shape (B, 2);
                ``'weights'``: sigmoid gate values, shape (B, T, feature_dim);
                ``'pupil_positions'``: predicted positions, shape (B, T, 2).
        """
        B, T, C, H, W = frames.shape

        # ----- Stream 1: inter-frame stream -----
        pupil_positions = self.pupil_extractor(frames)

        # reshape (not view) so non-contiguous clips are accepted.
        frames_flat = frames.reshape(B * T, C, H, W)
        frame_features = self.frame_encoder(frames_flat).view(B, T, -1)

        attended_features = self.fifa_attention(frame_features, pupil_positions)

        weights = self.weight_generator(attended_features)
        weighted_features = attended_features * weights

        # ----- Stream 2: static stream -----
        # The temporal mean image stands in for the static content of the clip.
        static_features = self.static_encoder(frames.mean(dim=1))  # (B, feature_dim)

        # ----- Fusion -----
        weighted_pooled = weighted_features.mean(dim=1)
        fused = self.temporal_fusion(
            torch.cat([weighted_pooled, static_features], dim=-1)
        )

        # ----- Gaze prediction -----
        gaze = self.gaze_head(fused)

        return {
            'gaze': gaze,
            'weights': weights,
            'pupil_positions': pupil_positions
        }


# Smoke test: build the model and check output shapes on random input.
if __name__ == "__main__":
    model_kwargs = dict(
        backbone='resnet18',
        feature_dim=256,
        num_heads=8,
        num_frames=16,
    )
    model = DualStreamGazeEstimator(**model_kwargs)

    # Random clip batch standing in for real video: (B, T, C, H, W).
    batch_size, num_frames = 4, 16
    frames = torch.randn(batch_size, num_frames, 3, 224, 224)

    # Single forward pass.
    output = model(frames)

    print(f"输入形状: {frames.shape}")
    print(f"视线预测: {output['gaze'].shape}")
    print(f"权重形状: {output['weights'].shape}")
    print(f"瞳孔位置: {output['pupil_positions'].shape}")

    # Expected shapes:
    #   input           torch.Size([4, 16, 3, 224, 224])
    #   gaze            torch.Size([4, 2])
    #   weights         torch.Size([4, 16, 256])
    #   pupil_positions torch.Size([4, 16, 2])

3. 损失函数

class FIFALoss(nn.Module):
    """Training loss for the FIFA gaze estimator.

    The total loss is a weighted sum of:
        1. gaze regression loss (MSE),
        2. a penalty on the gate weights (mean of squared values),
        3. pupil-position supervision (L1), applied only when the targets
           contain ``'pupil_positions'``.

    Args:
        gaze_weight: Coefficient of the gaze regression term.
        weight_reg_weight: Coefficient of the gate-weight penalty.
        pupil_weight: Coefficient of the pupil-position term.
    """

    def __init__(
        self,
        gaze_weight: float = 1.0,
        weight_reg_weight: float = 0.1,
        pupil_weight: float = 0.5
    ):
        super().__init__()

        self.gaze_weight = gaze_weight
        self.weight_reg_weight = weight_reg_weight
        self.pupil_weight = pupil_weight

        self.mse_loss = nn.MSELoss()
        self.l1_loss = nn.L1Loss()

    def forward(
        self,
        predictions: dict,
        targets: dict
    ) -> dict:
        """Compute the individual loss terms and their weighted sum.

        Args:
            predictions: Model output with keys ``'gaze'``, ``'weights'`` and
                ``'pupil_positions'``.
            targets: Labels with key ``'gaze'`` and, optionally,
                ``'pupil_positions'``.

        Returns:
            A dict with keys ``'gaze_loss'``, ``'weight_reg'``,
            ``'pupil_loss'`` and ``'total_loss'``. ``'pupil_loss'`` is a zero
            tensor when no pupil targets are given, so the set of keys does
            not depend on the batch.
        """
        # 1. Gaze regression.
        gaze_loss = self.mse_loss(predictions['gaze'], targets['gaze'])

        # 2. Gate-weight penalty. This is a squared (L2-style) penalty that
        #    shrinks the gate values; it does not by itself induce sparsity.
        weight_reg = torch.mean(predictions['weights'] ** 2)

        # 3. Optional pupil-position supervision.
        if 'pupil_positions' in targets:
            pupil_loss = self.l1_loss(
                predictions['pupil_positions'], targets['pupil_positions']
            )
        else:
            # Zero tensor on the same device/dtype keeps the return schema
            # uniform for logging and reduction code.
            pupil_loss = gaze_loss.new_zeros(())

        total_loss = (
            self.gaze_weight * gaze_loss +
            self.weight_reg_weight * weight_reg +
            self.pupil_weight * pupil_loss
        )

        return {
            'gaze_loss': gaze_loss,
            'weight_reg': weight_reg,
            'pupil_loss': pupil_loss,
            'total_loss': total_loss,
        }

实验结果

数据集

| 数据集 | 场景 | 受试者 | 帧数 |
| --- | --- | --- | --- |
| ETH-XGaze | 实验室 | 110 | 1M+ |
| MPIIGaze | 自然 | 15 | 45K |
| EYEDIAP | 实验室 | 16 | 50K |
| DriverGaze | 驾驶 | 50 | 200K |

性能对比

| 方法 | MPIIGaze (°) | EYEDIAP (°) | DriverGaze (°) | FPS |
| --- | --- | --- | --- | --- |
| Gaze360 | 11.2 | 10.5 | - | 30 |
| ETH-XGaze | 9.8 | 9.2 | 12.5 | 25 |
| FIFA (Ours) | 8.5 | 8.1 | 10.2 | 45 |

消融实验

| 配置 | MPIIGaze (°) | 说明 |
| --- | --- | --- |
| 仅静态流 | 11.5 | 无帧间注意力 |
| + FIFA 注意力 | 9.2 | 提升 20% |
| + 权重生成 | 8.7 | 进一步提升 |
| 完整模型 | 8.5 | 最佳性能 |

IMS 应用启示

1. 实时部署优化

模型量化:

import torch.quantization as quant

# Dynamic quantization only replaces module types that have a
# dynamically-quantized implementation (e.g. nn.Linear). nn.Conv2d is not one
# of them, so listing it has no effect; quantizing the convolutions requires
# static (post-training) quantization instead.
model.eval()  # quantize the inference-mode model
model_quantized = quant.quantize_dynamic(
    model,
    {nn.Linear},
    dtype=torch.qint8
)

# Latency figures quoted by the article (not measured by this snippet):
#   FP32: 22 ms/frame
#   INT8: 8 ms/frame (2.75x speed-up)

ONNX 导出:

# Export the model to ONNX.
# Export in eval mode so dropout is disabled in the exported graph.
model.eval()
dummy_input = torch.randn(1, 16, 3, 224, 224)
torch.onnx.export(
    model,
    dummy_input,
    "fifa_gaze.onnx",
    opset_version=11,
    input_names=['frames'],
    # forward() returns three tensors (gaze, weights, pupil_positions);
    # name all of them so none gets an auto-generated output name.
    output_names=['gaze', 'weights', 'pupil_positions'],
    dynamic_axes={
        'frames': {0: 'batch', 1: 'time'},
        'gaze': {0: 'batch'},
        'weights': {0: 'batch', 1: 'time'},
        'pupil_positions': {0: 'batch', 1: 'time'}
    }
)

# Inference with ONNX Runtime.
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("fifa_gaze.onnx")
inputs = {'frames': frames.numpy().astype(np.float32)}
outputs = session.run(None, inputs)
gaze_prediction = outputs[0]  # first output is 'gaze': (batch, 2)

2. Euro NCAP 视线区域检测

class GazeZoneClassifier:
    """Map (yaw, pitch) gaze angles to named in-cabin zones.

    Zones are axis-aligned rectangles in (yaw, pitch) space, in degrees.
    Several rectangles overlap; ``classify`` returns the first match in
    insertion order, so the order of ``self.zones`` defines priority.
    NOTE(review): the zone boundaries are the author's illustrative values --
    confirm against the target vehicle geometry and protocol.
    """

    def __init__(self):
        # Inclusive (min, max) bounds per zone, in degrees.
        self.zones = {
            'road_forward': {'yaw': (-15, 15), 'pitch': (-10, 10)},
            'driver_window': {'yaw': (15, 60), 'pitch': (-20, 20)},
            'passenger_window': {'yaw': (-60, -15), 'pitch': (-20, 20)},
            'infotainment': {'yaw': (-30, 30), 'pitch': (10, 40)},
            'footwell': {'yaw': (-20, 20), 'pitch': (40, 90)},
            'mirror': {'yaw': (-60, 60), 'pitch': (-30, 0)}
        }

    def classify(self, gaze: np.ndarray) -> str:
        """Return the name of the first zone containing the gaze direction.

        Args:
            gaze: Two values (yaw, pitch) in degrees.

        Returns:
            The zone name, or ``'other'`` if no zone matches.
        """
        yaw, pitch = gaze

        for zone_name, bounds in self.zones.items():
            yaw_lo, yaw_hi = bounds['yaw']
            pitch_lo, pitch_hi = bounds['pitch']
            if yaw_lo <= yaw <= yaw_hi and pitch_lo <= pitch <= pitch_hi:
                return zone_name

        return 'other'

    def get_distraction_level(
        self,
        gaze_history: List[np.ndarray],
        timestamps: List[float]
    ) -> dict:
        """Classify distraction from the total off-road gaze time.

        Off-road time is accumulated over every interval in which the gaze is
        outside ``'road_forward'``, including an interval that is still open
        at the last sample.

        Args:
            gaze_history: Sequence of (yaw, pitch) samples in degrees.
            timestamps: Sample times in seconds, parallel to ``gaze_history``.

        Returns:
            A dict with keys ``'level'`` (``'long_distraction'`` for >= 3.0 s,
            ``'short_distraction'`` for >= 1.0 s, else ``'normal'``),
            ``'duration'`` (accumulated off-road seconds) and ``'alert'``
            (True only for ``'long_distraction'``). Empty input yields
            ``'normal'`` with zero duration.
        """
        offroad_duration = 0.0
        offroad_start = None
        last_ts = None

        for gaze, ts in zip(gaze_history, timestamps):
            last_ts = ts
            if self.classify(gaze) != 'road_forward':
                # Start of an off-road glance (or continuation of one).
                if offroad_start is None:
                    offroad_start = ts
            elif offroad_start is not None:
                # Gaze returned to the road: close the open interval.
                offroad_duration += ts - offroad_start
                offroad_start = None

        # A glance still off-road at the end of the window must be counted;
        # otherwise continuous distraction would be reported as 'normal'.
        if offroad_start is not None:
            offroad_duration += last_ts - offroad_start

        if offroad_duration >= 3.0:
            level, alert = 'long_distraction', True
        elif offroad_duration >= 1.0:
            level, alert = 'short_distraction', False
        else:
            level, alert = 'normal', False

        return {
            'level': level,
            'duration': offroad_duration,
            'alert': alert
        }

3. 部署架构

摄像头输入 (30fps)
    ↓
帧缓冲 (16帧)
    ↓
FIFA 模型
    ↓
视线预测 (yaw, pitch)
    ↓
区域分类
    ↓
分心检测逻辑
    ↓
警告触发

参考文献

  1. Hu D., Cui M., Huang K., “FIFA: Fine-grained Inter-frame Attention for Driver’s Video Gaze Estimation”, CVPR 2025
  2. Zhang X., et al., “ETH-XGaze: A Large Scale Dataset for Gaze Estimation under Extreme Head Pose and Gaze Variation”, CVPR 2020
  3. Euro NCAP, “Driver State Monitoring Assessment Protocol v10.0”, 2025

总结: FIFA 通过细粒度帧间注意力显式建模瞳孔位移,在驾驶场景视线估计上取得 SOTA 性能。建议采用 INT8 量化部署到车载平台,配合视线区域分类器实现 Euro NCAP 分心检测要求。


FIFA:细粒度帧间注意力驾驶员视线估计论文解读与代码复现
https://dapalm.com/2026/06/04/2026-06-04-FIFA细粒度帧间注意力驾驶员视线估计论文解读与代码复现/
作者
Mars
发布于
2026年6月4日
许可协议