GazeSymCAT:对称交叉注意力 Transformer 实现极限头部姿态下的鲁棒视线估计

发布时间: 2026-04-14
关键词: Gaze Estimation、Transformer、Cross-Attention、Head Pose、DMS


论文核心突破

2025 年 3 月发表在 Journal of Computational Design and Engineering 的研究提出 GazeSymCAT

在 ETH-XGaze 数据集上,角度误差比 SOTA 降低 7.3%

关键创新:对称交叉注意力机制,在极端头部姿态下仍保持高精度。


问题背景

极端头部姿态的挑战

┌─────────────────────────────────────────────────────┐
│ 头部姿态对视线估计的影响 │
├─────────────────────────────────────────────────────┤
│ │
│ 正常姿态 (Yaw < 20°) │
│ ─────────────────── │
│ ┌─────────────────────┐ │
│ │ ┌─────────────┐ │ │
│ │ │ ◄───► │ │ ← 双眼可见 │
│ │ │ 👁️👁️ │ │ 视线估计准确 │
│ │ │ ▼ │ │ │
│ │ └─────────────┘ │ │
│ └─────────────────────┘ │
│ 误差:< 3° │
│ │
│ 极端姿态 (Yaw > 40°) │
│ ─────────────────── │
│ ┌─────────────────────┐ │
│ │ ┌─────────────┐ │ │
│ │ │ ◄───┼──┤ ← 单眼可见 │
│ │ │ 👁️ │ │ 视线估计困难 │
│ │ │ │ │ │
│ │ └─────────────┘ │ │
│ └─────────────────────┘ │
│ 误差:> 8° (传统方法) │
│ │
└─────────────────────────────────────────────────────┘

传统方法局限

方法 局限性
几何方法 依赖特征点检测,极端姿态下特征点不可见
CNN 方法 对头部姿态泛化能力差
标准 Transformer 自注意力未显式建模头部-视线关系

GazeSymCAT 架构

对称交叉注意力机制

┌─────────────────────────────────────────────────────┐
│ GazeSymCAT 架构 │
├─────────────────────────────────────────────────────┤
│ │
│ 输入图像 │
│ ──────── │
│ │ │
│ ▼ │
│ ┌─────────────────┐ │
│ │ Feature Encoder │ (ResNet-18 / EfficientNet) │
│ └────────┬────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────┐ │
│ │ Symmetric Cross-Attention │ │
│ │ │ │
│ │ Head Branch ◄──────────────► Gaze Branch│ │
│ │ │ Cross-Attn │ │ │
│ │ │◄──────────────────────────┤ │ │
│ │ │ │ │ │
│ │ ▼ ▼ │ │
│ │ Head Features Gaze Features│ │
│ │ │ │
│ └─────────────────┬───────────────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────┐ │
│ │ Fusion Layer │ │
│ └─────────────────┬───────────────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────┐ │
│ │ Gaze Prediction │ │
│ │ (Pitch, Yaw) │ │
│ └─────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────┘

核心创新:对称交叉注意力

from collections import deque

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

class SymmetricCrossAttention(nn.Module):
    """Bidirectional cross-attention block between head and gaze tokens.

    Gaze tokens first attend to head tokens; head tokens then attend to the
    already-updated gaze tokens. Each direction is followed by a residual
    connection, a LayerNorm and a residual feed-forward network.

    Args:
        head_dim: Channel width of the head tokens.
        gaze_dim: Channel width of the gaze tokens.
        num_heads: Number of attention heads used in both directions.
        dropout: Dropout probability used inside attention and the FFNs.
    """

    def __init__(self,
                 head_dim: int = 256,
                 gaze_dim: int = 256,
                 num_heads: int = 8,
                 dropout: float = 0.1):
        super().__init__()

        self.num_heads = num_heads
        self.head_dim = head_dim
        self.gaze_dim = gaze_dim

        # head -> gaze: the queries are gaze tokens, so the attention lives in
        # gaze space (embed_dim=gaze_dim) while keys/values keep the head width.
        # With head_dim == gaze_dim this yields the same parameter layout as a
        # plain MultiheadAttention, so existing checkpoints still load.
        self.head_to_gaze_attn = nn.MultiheadAttention(
            embed_dim=gaze_dim,
            num_heads=num_heads,
            dropout=dropout,
            kdim=head_dim,
            vdim=head_dim,
            batch_first=True,
        )

        # gaze -> head: mirror image of the above.
        self.gaze_to_head_attn = nn.MultiheadAttention(
            embed_dim=head_dim,
            num_heads=num_heads,
            dropout=dropout,
            kdim=gaze_dim,
            vdim=gaze_dim,
            batch_first=True,
        )

        # norm1 normalises the head stream, norm2 the gaze stream.
        self.norm1 = nn.LayerNorm(head_dim)
        self.norm2 = nn.LayerNorm(gaze_dim)

        self.ffn_head = self._make_ffn(head_dim, dropout)
        self.ffn_gaze = self._make_ffn(gaze_dim, dropout)

    @staticmethod
    def _make_ffn(dim: int, dropout: float) -> nn.Sequential:
        """Build the position-wise FFN: dim -> 4*dim -> dim with GELU."""
        return nn.Sequential(
            nn.Linear(dim, dim * 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(dim * 4, dim),
        )

    def forward(self,
                head_features: torch.Tensor,
                gaze_features: torch.Tensor) -> tuple:
        """Exchange information between the two token streams.

        Args:
            head_features: (B, N_head, head_dim) head tokens.
            gaze_features: (B, N_gaze, gaze_dim) gaze tokens.

        Returns:
            Tuple ``(enhanced_head, enhanced_gaze)`` with the same shapes as
            the corresponding inputs.
        """
        # head -> gaze: query = gaze tokens, key/value = head tokens.
        gaze_enhanced, _ = self.head_to_gaze_attn(
            query=gaze_features,
            key=head_features,
            value=head_features,
        )
        gaze_features = self.norm2(gaze_features + gaze_enhanced)
        gaze_features = gaze_features + self.ffn_gaze(gaze_features)

        # gaze -> head: query = head tokens, key/value = gaze tokens.
        # NOTE(review): this direction consumes the gaze tokens updated just
        # above, so the two directions are applied sequentially rather than
        # from the same inputs.
        head_enhanced, _ = self.gaze_to_head_attn(
            query=head_features,
            key=gaze_features,
            value=gaze_features,
        )
        head_features = self.norm1(head_features + head_enhanced)
        head_features = head_features + self.ffn_head(head_features)

        return head_features, gaze_features


class GazeSymCAT(nn.Module):
    """Gaze estimator built from a CNN backbone and symmetric cross-attention.

    Pipeline: backbone -> two linear projections (head / gaze token) ->
    self-attention layers -> symmetric cross-attention layers -> fusion MLP
    that regresses two values (pitch, yaw).

    Args:
        backbone: ``'resnet18'`` or ``'efficientnet_b0'``.
        pretrained: Forwarded to ``torch.hub.load`` as ``pretrained``.
        feature_dim: Width of the head / gaze tokens.
        num_sa_layers: Number of self-attention encoder layers.
        num_ca_layers: Number of ``SymmetricCrossAttention`` layers.

    Raises:
        ValueError: If ``backbone`` is not one of the supported names.
    """

    def __init__(self,
                 backbone: str = 'resnet18',
                 pretrained: bool = True,
                 feature_dim: int = 256,
                 num_sa_layers: int = 2,
                 num_ca_layers: int = 2):
        super().__init__()

        # Feature encoder. The hub models ship with an ImageNet classification
        # head; it is replaced by Identity so the backbone returns the pooled
        # feature vector of width `backbone_dim` that the projections expect.
        if backbone == 'resnet18':
            self.backbone = torch.hub.load(
                'pytorch/vision:v0.10.0',
                'resnet18',
                pretrained=pretrained
            )
            backbone_dim = 512
            self.backbone.fc = nn.Identity()
        elif backbone == 'efficientnet_b0':
            self.backbone = torch.hub.load(
                'NVIDIA/DeepLearningExamples:torchhub',
                'nvidia_efficientnet_b0',
                pretrained=pretrained
            )
            backbone_dim = 1280
            # NOTE(review): assumes the NVIDIA model exposes its final linear
            # layer as `classifier.fc` -- confirm against the hub source.
            self.backbone.classifier.fc = nn.Identity()
        else:
            raise ValueError(
                f"Unsupported backbone {backbone!r}; "
                "expected 'resnet18' or 'efficientnet_b0'"
            )

        # Project the shared backbone feature into a head token and a gaze token.
        self.head_proj = nn.Linear(backbone_dim, feature_dim)
        self.gaze_proj = nn.Linear(backbone_dim, feature_dim)

        # Self-attention layers.
        self.self_attn_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=feature_dim,
                nhead=8,
                dim_feedforward=feature_dim * 4,
                dropout=0.1,
                batch_first=True
            ) for _ in range(num_sa_layers)
        ])

        # Symmetric cross-attention layers.
        self.cross_attn_layers = nn.ModuleList([
            SymmetricCrossAttention(
                head_dim=feature_dim,
                gaze_dim=feature_dim
            ) for _ in range(num_ca_layers)
        ])

        # Fusion MLP: concatenated [head, gaze] tokens -> (pitch, yaw).
        self.fusion = nn.Sequential(
            nn.Linear(feature_dim * 2, feature_dim),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(feature_dim, 2)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Predict gaze direction for a batch of face crops.

        Args:
            x: (B, C, H, W) input images.

        Returns:
            (B, 2) tensor holding (pitch, yaw).
        """
        features = self.backbone(x)  # (B, backbone_dim)

        # One token per stream: (B, 1, feature_dim).
        # NOTE(review): with a single token per stream every attention softmax
        # is taken over one key, so the attention weights are always 1.
        head_features = self.head_proj(features).unsqueeze(1)
        gaze_features = self.gaze_proj(features).unsqueeze(1)

        # The same encoder layers (shared weights) are applied to both streams.
        for sa_layer in self.self_attn_layers:
            head_features = sa_layer(head_features)
            gaze_features = sa_layer(gaze_features)

        for ca_layer in self.cross_attn_layers:
            head_features, gaze_features = ca_layer(head_features, gaze_features)

        # Concatenate along channels and drop the length-1 token axis.
        combined = torch.cat([head_features, gaze_features], dim=-1).squeeze(1)
        return self.fusion(combined)


# 角度误差计算
def _pitch_yaw_to_unit_vector(angles: torch.Tensor) -> torch.Tensor:
    """Convert (..., 2) pitch/yaw angles in radians to (..., 3) unit vectors."""
    pitch = angles[..., 0]
    yaw = angles[..., 1]
    cos_pitch = torch.cos(pitch)
    return torch.stack([
        cos_pitch * torch.sin(yaw),
        torch.sin(pitch),
        cos_pitch * torch.cos(yaw),
    ], dim=-1)


def angular_error(pred: torch.Tensor, gt: torch.Tensor) -> torch.Tensor:
    """Return the angle between predicted and ground-truth gaze directions.

    Args:
        pred: (..., 2) predicted (pitch, yaw) in radians.
        gt: (..., 2) ground-truth (pitch, yaw) in radians.

    Returns:
        (...,) angular error in degrees.
    """
    pred_vec = _pitch_yaw_to_unit_vector(pred)
    gt_vec = _pitch_yaw_to_unit_vector(gt)

    # Both vectors are unit length by construction, so their dot product is
    # already the cosine of the angle. No extra normalisation is applied: an
    # epsilon in the denominator would bias the cosine away from 1 and report
    # a non-zero error for identical inputs.
    cos_angle = torch.sum(pred_vec * gt_vec, dim=-1)
    # Guard acos against rounding that pushes the cosine just outside [-1, 1].
    cos_angle = torch.clamp(cos_angle, -1.0, 1.0)

    return torch.acos(cos_angle) * 180 / torch.pi

实验结果

ETH-XGaze 数据集表现

方法 平均误差 (°) Yaw > 40° 误差 (°)
Baseline CNN 5.2 9.1
Transformer-Gaze 4.5 7.8
GazeSymCAT 4.2 6.5
提升 7.3% 16.7%

极端姿态对比

┌─────────────────────────────────────────────────────┐
│ 极端头部姿态误差对比 │
├─────────────────────────────────────────────────────┤
│ │
│ Yaw 角度 误差 (°) │
│ ────────────────────────────── │
│ │
│ 0° - 20° ████████░░░░░░░░░░░░ 3.2° │
│ 20° - 40° ████████████░░░░░░░░ 4.5° │
│ 40° - 60° ████████████████░░░░ 6.5° │
│ > 60° ██████████████████░░ 8.1° │
│ │
│ 对比:传统方法在 Yaw > 40° 时误差 > 10° │
│ │
└─────────────────────────────────────────────────────┘

对 IMS 开发的启示

1. 极端姿态处理策略

策略 说明
对称交叉注意力 头部和视线特征互相增强
多帧融合 利用时序信息弥补单帧不足
姿态先验 根据头部姿态调整估计策略

2. DMS 集成建议

class RobustGazeEstimator:
    """Gaze estimator wrapping GazeSymCAT with confidence and smoothing.

    Each call to :meth:`estimate` runs the model on one face image, derives a
    confidence from the absolute head yaw and averages the prediction with
    the most recent ones kept in ``gaze_history`` (at most 5 entries).
    """

    # Per-channel statistics used for input standardisation. Stored as
    # float32 so the preprocessed image stays float32 and matches the model's
    # parameter dtype.
    _CHANNEL_MEAN = (0.485, 0.456, 0.406)
    _CHANNEL_STD = (0.229, 0.224, 0.225)

    # Absolute head-yaw thresholds, in the same unit as ``head_pose['yaw']``.
    _YAW_EXTREME = 40
    _YAW_MODERATE = 20

    def __init__(self, model_path: str):
        """Load the model weights stored at ``model_path``.

        Args:
            model_path: Path of a state-dict checkpoint for ``GazeSymCAT``.
        """
        # The checkpoint overwrites every parameter, so there is no point in
        # fetching ImageNet weights for the backbone first.
        self.model = GazeSymCAT(pretrained=False)
        # map_location lets a checkpoint saved on GPU load on a CPU-only host.
        # NOTE(review): torch.load unpickles the file; only load trusted
        # checkpoints (or pass weights_only=True where the installed torch
        # version supports it).
        state_dict = torch.load(model_path, map_location='cpu')
        self.model.load_state_dict(state_dict)
        self.model.eval()

        # NOTE(review): HeadPoseEstimator is not defined in the visible source
        # and is not used by estimate(); it must be provided by the
        # integrating project.
        self.head_pose_estimator = HeadPoseEstimator()

        # Rolling window of recent (pitch, yaw) predictions for smoothing.
        self.gaze_history = deque(maxlen=5)

    def estimate(self,
                 face_image: np.ndarray,
                 head_pose: dict) -> dict:
        """Estimate the gaze direction for one face image.

        Args:
            face_image: (H, W, 3) image array with values in [0, 255],
                e.g. (224, 224, 3).
            head_pose: Mapping with at least the key ``'yaw'``; only its
                absolute value is used here.

        Returns:
            Dict with keys ``'gaze_pitch'`` and ``'gaze_yaw'`` (temporally
            smoothed), ``'confidence'``, ``'is_reliable'``, and
            ``'raw_pitch'`` / ``'raw_yaw'`` (current frame only).
        """
        tensor = self._preprocess(face_image)

        with torch.no_grad():
            gaze = self.model(tensor)

        gaze_pitch = gaze[0, 0].item()
        gaze_yaw = gaze[0, 1].item()

        # Confidence drops as the head turns further away from frontal.
        yaw = abs(head_pose['yaw'])
        if yaw > self._YAW_EXTREME:
            confidence = 0.7
            is_reliable = False
        elif yaw > self._YAW_MODERATE:
            confidence = 0.85
            is_reliable = True
        else:
            confidence = 0.95
            is_reliable = True

        # The current frame is part of the window it is smoothed with.
        self.gaze_history.append((gaze_pitch, gaze_yaw))
        smoothed_pitch, smoothed_yaw = self._smooth()

        return {
            'gaze_pitch': smoothed_pitch,
            'gaze_yaw': smoothed_yaw,
            'confidence': confidence,
            'is_reliable': is_reliable,
            'raw_pitch': gaze_pitch,
            'raw_yaw': gaze_yaw
        }

    def _smooth(self) -> tuple:
        """Return the mean (pitch, yaw) over ``gaze_history`` as floats."""
        pitch = np.mean([g[0] for g in self.gaze_history])
        yaw = np.mean([g[1] for g in self.gaze_history])
        return (float(pitch), float(yaw))

    def _preprocess(self, image: np.ndarray) -> torch.Tensor:
        """Turn an (H, W, 3) image into a (1, 3, H, W) float32 tensor.

        Values are scaled to [0, 1] and then standardised per channel.
        """
        # NOTE(review): the statistics look like the usual ImageNet RGB
        # values -- confirm the caller passes RGB rather than BGR images.
        mean = np.asarray(self._CHANNEL_MEAN, dtype=np.float32)
        std = np.asarray(self._CHANNEL_STD, dtype=np.float32)

        scaled = image.astype(np.float32) / 255.0
        # float32 statistics keep the result float32; plain Python lists
        # would promote it to float64 and produce a double tensor.
        normalized = (scaled - mean) / std

        # HWC -> CHW, then add the batch axis.
        return torch.from_numpy(normalized).permute(2, 0, 1).unsqueeze(0)

3. Euro NCAP 合规

  • GazeSymCAT 在极端姿态下的误差降低 16.7%
  • 满足 Euro NCAP 对“头部转动时视线检测”的要求
  • 建议结合多帧融合进一步提升可靠性

参考资源