TDGH-YOLOv7:基于 Transformer 的驾驶员头部姿态与眼动检测模型

TDGH-YOLOv7:基于 Transformer 的驾驶员头部姿态与眼动检测模型

模型背景

传统方法的局限

传统的驾驶员头部姿态和眼动检测方法存在以下问题:

| 方法 | 局限性 |
| --- | --- |
| 传统 CNN | 难以精确定位小目标(眼睛) |
| 单独训练 | 头部姿态和眼动分开训练,缺乏协同 |
| 固定输入 | 对不同尺寸目标适应性差 |

TDGH-YOLOv7 创新点

TDGH(Transformer Detection of Gaze and Head)将 Transformer 机制集成到 YOLOv7 架构中:

创新点:
├─ Transformer 机制增强小目标检测
├─ 头部姿态和眼动联合检测
├─ 实时性能(>30 FPS)
└─ 鲁棒性(遮挡、光照变化)

模型架构

整体结构

import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Tuple, List

class TDGHYOLOv7(nn.Module):
    """TDGH-YOLOv7 (Transformer Detection of Gaze and Head).

    A simplified driver-monitoring network built from three stages: a
    convolutional backbone, a Transformer neck and a multi-task head that
    predicts face / eye boxes together with head-pose and gaze-direction
    classes.

    Reference:
        "AI-enabled driver assistance: monitoring head and gaze movements
        for enhanced safety", Complex & Intelligent Systems, 2025.

    Args:
        num_head_poses: Number of head-pose classes (default 9).
        num_gaze_directions: Number of gaze-direction classes (default 9).
    """

    def __init__(self,
                 num_head_poses: int = 9,
                 num_gaze_directions: int = 9):
        super().__init__()

        # Backbone: simplified convolutional stand-in for YOLOv7's E-ELAN stack.
        self.backbone = self._build_backbone()

        # Neck: Transformer encoder applied to the backbone feature map.
        self.neck = self._build_neck()

        # Head: shared trunk feeding four task-specific branches.
        self.head = self._build_head(num_head_poses, num_gaze_directions)

    def _build_backbone(self) -> nn.Module:
        """Build the backbone as a plain ``nn.Sequential``.

        Channel flow is 3 -> 32 -> 64 -> 128 -> 256 -> 512. Three stride-2
        convolutions give an overall stride of 8, so an (H, W) image yields
        a (512, H/8, W/8) feature map.

        Returns:
            The backbone module.
        """
        layers = []

        # Stem: full-resolution conv followed by the first stride-2 conv.
        layers.extend([
            nn.Conv2d(3, 32, 3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.SiLU(),
            nn.Conv2d(32, 64, 3, stride=2, padding=1),
            nn.BatchNorm2d(64),
            nn.SiLU(),
        ])

        # Stage 1: the block must emit 128 channels because the following
        # down-sampling conv takes 128 input channels.
        layers.extend([
            self._make_elan_block(64, 128, expand_ratio=2),
            nn.Conv2d(128, 128, 3, stride=2, padding=1),
            nn.BatchNorm2d(128),
            nn.SiLU(),
        ])

        # Stage 2: same reasoning, 128 -> 256 before the 256-channel conv.
        layers.extend([
            self._make_elan_block(128, 256, expand_ratio=2),
            nn.Conv2d(256, 256, 3, stride=2, padding=1),
            nn.BatchNorm2d(256),
            nn.SiLU(),
        ])

        # Stage 3: 256 -> 512 so the output matches the neck's in_channels=512.
        layers.extend([
            self._make_elan_block(256, 512, expand_ratio=2),
        ])

        return nn.Sequential(*layers)

    def _make_elan_block(self, in_channels: int,
                         out_channels: int,
                         expand_ratio: int = 2) -> nn.Module:
        """Build one simplified "E-ELAN" block.

        The block is a 1x1 expand conv, a 3x3 conv and a 1x1 project conv,
        each followed by BatchNorm and SiLU. Spatial size is preserved.

        Args:
            in_channels: Channels of the incoming feature map.
            out_channels: Channels produced by the final 1x1 conv.
            expand_ratio: Multiplier applied to ``in_channels`` to obtain the
                hidden width.

        Returns:
            The block as an ``nn.Sequential``.
        """
        hidden_channels = in_channels * expand_ratio

        return nn.Sequential(
            nn.Conv2d(in_channels, hidden_channels, 1),
            nn.BatchNorm2d(hidden_channels),
            nn.SiLU(),
            nn.Conv2d(hidden_channels, hidden_channels, 3, padding=1),
            nn.BatchNorm2d(hidden_channels),
            nn.SiLU(),
            nn.Conv2d(hidden_channels, out_channels, 1),
            nn.BatchNorm2d(out_channels),
            nn.SiLU(),
        )

    def _build_neck(self) -> nn.Module:
        """Build the Transformer neck (512 input channels, 256 hidden)."""
        return TransformerNeck(
            in_channels=512,
            hidden_dim=256,
            num_heads=8,
            num_layers=3
        )

    def _build_head(self, num_poses: int, num_gazes: int) -> nn.Module:
        """Build the multi-task head on top of the 256-channel neck output."""
        return MultiTaskHead(
            in_channels=256,
            num_head_poses=num_poses,
            num_gaze_directions=num_gazes
        )

    def forward(self, x: torch.Tensor) -> dict:
        """Run backbone, neck and head in sequence.

        Args:
            x: Input images, shape (batch, 3, H, W).

        Returns:
            The dictionary produced by the head (keys ``face_bbox``,
            ``head_pose``, ``eye_bbox``, ``gaze_direction``).
        """
        features = self.backbone(x)
        enhanced_features = self.neck(features)
        outputs = self.head(enhanced_features)

        return outputs


class TransformerNeck(nn.Module):
    """Transformer neck applied to a convolutional feature map.

    The feature map is projected to ``hidden_dim`` channels, flattened into
    a sequence of H*W tokens, passed through a Transformer encoder, reshaped
    back to a map, refined by a BPFE block and finally projected by a 1x1
    conv.

    NOTE(review): no positional encoding is added before the encoder, and
    the sequence length is H*W, so attention cost grows quickly with the
    feature-map size. Confirm both are intended.

    Args:
        in_channels: Channels of the incoming feature map.
        hidden_dim: Token width used by the encoder and by the output map.
        num_heads: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self,
                 in_channels: int,
                 hidden_dim: int,
                 num_heads: int,
                 num_layers: int):
        super().__init__()

        self.in_channels = in_channels
        self.hidden_dim = hidden_dim

        # 1x1 conv mapping the backbone channels to the token width.
        self.input_proj = nn.Conv2d(in_channels, hidden_dim, 1)

        # batch_first=True so tokens are laid out as (batch, seq, dim).
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim * 4,
            dropout=0.1,
            activation='gelu',
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)

        # 1x1 conv applied after the BPFE refinement.
        self.output_proj = nn.Conv2d(hidden_dim, hidden_dim, 1)

        # BPFE (Binary Pattern Feature Extraction) refinement block.
        self.bpfe = BPFE(hidden_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Enhance a feature map with self-attention.

        Args:
            x: Input features, shape (batch, C, H, W).

        Returns:
            Enhanced features, shape (batch, hidden_dim, H, W).
        """
        batch, _, height, width = x.shape

        x = self.input_proj(x)  # (batch, hidden_dim, H, W)

        # Flatten the spatial grid into a token sequence.
        tokens = x.flatten(2).transpose(1, 2)  # (batch, H*W, hidden_dim)

        tokens = self.transformer(tokens)

        # Undo the flattening: back to (batch, hidden_dim, H, W).
        x_out = tokens.transpose(1, 2).reshape(
            batch, self.hidden_dim, height, width)

        x_out = self.bpfe(x_out)
        x_out = self.output_proj(x_out)

        return x_out


class BPFE(nn.Module):
    """Binary Pattern Feature Extraction block.

    As implemented here the block runs two parallel 3x3 convolutions on the
    input, sums them, applies BatchNorm and SiLU, and adds the result back
    onto the input as a residual. Shape is preserved.

    Args:
        channels: Number of input (and output) channels.
    """

    def __init__(self, channels: int):
        super().__init__()

        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1)
        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn = nn.BatchNorm2d(channels)
        self.act = nn.SiLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` plus the activated, normalised sum of both branches.

        Args:
            x: Input features, shape (batch, channels, H, W).

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Two parallel 3x3 branches over the same input.
        branch_a = self.conv1(x)
        branch_b = self.conv2(x)

        # Fuse the branches and add them as a residual on top of the input.
        x_out = x + self.act(self.bn(branch_a + branch_b))

        return x_out


class MultiTaskHead(nn.Module):
    """Multi-task detection head.

    A shared 3x3 conv trunk feeds four branches:

    - ``face_det``: dense 5-channel map, intended as (x, y, w, h, conf).
    - ``head_pose_cls``: globally pooled classifier over head poses.
    - ``eye_det``: dense 10-channel map, intended as two
      (x, y, w, h, conf) tuples for the left and right eye.
    - ``gaze_cls``: globally pooled classifier over gaze directions.

    Args:
        in_channels: Channels of the incoming feature map.
        num_head_poses: Number of head-pose classes.
        num_gaze_directions: Number of gaze-direction classes.
    """

    def __init__(self,
                 in_channels: int,
                 num_head_poses: int,
                 num_gaze_directions: int):
        super().__init__()

        # Trunk shared by all four branches.
        self.shared_conv = nn.Sequential(
            nn.Conv2d(in_channels, 128, 3, padding=1),
            nn.BatchNorm2d(128),
            nn.SiLU(),
        )

        # Face detection branch: (x, y, w, h, conf) per spatial location.
        self.face_det = nn.Conv2d(128, 5, 1)

        # Head-pose classification branch (one logit vector per image).
        self.head_pose_cls = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, num_head_poses)
        )

        # Eye-pair detection branch: left eye (x, y, w, h, conf) followed by
        # right eye (x, y, w, h, conf) per spatial location.
        self.eye_det = nn.Conv2d(128, 10, 1)

        # Gaze-direction classification branch (one logit vector per image).
        self.gaze_cls = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, num_gaze_directions)
        )

    def forward(self, x: torch.Tensor) -> dict:
        """Compute all four task outputs.

        Args:
            x: Input features, shape (batch, C, H, W).

        Returns:
            Dict with ``face_bbox`` (batch, 5, H, W), ``head_pose``
            (batch, num_head_poses), ``eye_bbox`` (batch, 10, H, W) and
            ``gaze_direction`` (batch, num_gaze_directions). The two
            classification outputs are raw logits.
        """
        shared = self.shared_conv(x)

        # Key order matters: the ONNX export names outputs in this order.
        return {
            'face_bbox': self.face_det(shared),
            'head_pose': self.head_pose_cls(shared),
            'eye_bbox': self.eye_det(shared),
            'gaze_direction': self.gaze_cls(shared)
        }

头部姿态分类定义

9 类头部姿态

头部姿态定义(俯仰角 × 偏航角):

              向上
               ↑
               │
向左 ←───── 正常 ─────→ 向右
               │
               ↓
              向下

组合 9 类:
┌───────┬───────┬───────┐
│ 左上  │ 正上  │ 右上  │
├───────┼───────┼───────┤
│ 向左  │ 正常  │ 向右  │
├───────┼───────┼───────┤
│ 左下  │ 正下  │ 右下  │
└───────┴───────┴───────┘

注视方向分类

# Gaze-direction definitions (a 9-way split, parallel to the head-pose classes).
class GazeDirection:
    """Lookup tables for the nine gaze-direction classes.

    ``DIRECTIONS`` maps a class index to a direction name and
    ``EURO_NCAP_MAPPING`` maps a direction name to a cabin gaze zone.
    """

    # Class index -> direction name.
    DIRECTIONS = {
        0: 'center',      # centre (road ahead)
        1: 'up',          # up
        2: 'down',        # down
        3: 'left',        # left
        4: 'right',       # right
        5: 'up_left',     # upper left
        6: 'up_right',    # upper right
        7: 'down_left',   # lower left
        8: 'down_right'   # lower right
    }

    # Direction name -> gaze zone (labelled as Euro NCAP zones by the author).
    # NOTE(review): 'up_left' and 'up_right' have no entry, so a direct
    # lookup for those two directions raises KeyError; confirm whether they
    # should map to a zone.
    EURO_NCAP_MAPPING = {
        'center': 'road_forward',
        'left': 'driver_side_mirror',
        'right': 'passenger_side_mirror',
        'up': 'rear_mirror',
        'down': 'instrument_cluster',
        'down_left': 'driver_lap',
        'down_right': 'center_stack'
    }

训练与部署

数据集构建

class FaceEyeDataset(torch.utils.data.Dataset):
    """Dataset of driver face images with face / eye / pose / gaze labels.

    Annotations are loaded from a JSON file into ``self.annotations``, which
    is indexed by integer position. Each record is read through the keys
    ``image_path``, ``face_bbox``, ``head_pose``, ``eye_left_bbox``,
    ``eye_right_bbox`` and ``gaze_direction``.

    Args:
        image_dir: Directory used to resolve relative image paths that do
            not exist as given.
        annotation_file: Path to the JSON annotation file.
        transform: Optional callable applied to the RGB image array.
    """

    def __init__(self,
                 image_dir: str,
                 annotation_file: str,
                 transform=None):
        self.image_dir = image_dir
        self.annotations = self._load_annotations(annotation_file)
        self.transform = transform

    def __len__(self):
        """Return the number of annotation records."""
        return len(self.annotations)

    def __getitem__(self, idx):
        """Load one sample.

        Args:
            idx: Index into the annotation list.

        Returns:
            ``(image, target)``. ``image`` is the RGB array returned by
            OpenCV, or whatever ``transform`` returns for it. ``target`` is
            a dict of tensors.

        Raises:
            OSError: If OpenCV cannot read the image file.
        """
        # Imported here because the file has no module-level OpenCV import.
        import cv2

        ann = self.annotations[idx]

        image_path = self._resolve_image_path(ann['image_path'])
        image = cv2.imread(image_path)
        if image is None:
            # imread signals failure by returning None; fail with a clear
            # message instead of an opaque error inside cvtColor.
            raise OSError(f"Could not read image: {image_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        target = {
            'face_bbox': torch.tensor(ann['face_bbox'], dtype=torch.float32),
            'head_pose': torch.tensor(ann['head_pose'], dtype=torch.long),
            'eye_left_bbox': torch.tensor(ann['eye_left_bbox'], dtype=torch.float32),
            'eye_right_bbox': torch.tensor(ann['eye_right_bbox'], dtype=torch.float32),
            'gaze_direction': torch.tensor(ann['gaze_direction'], dtype=torch.long)
        }

        if self.transform is not None:
            image = self.transform(image)

        return image, target

    def _resolve_image_path(self, image_path):
        """Return the path to open for an annotation's ``image_path``.

        Absolute paths and paths that already exist are returned unchanged,
        so annotations that worked before keep working. Any other path is
        joined onto ``self.image_dir``.
        """
        import os

        if os.path.isabs(image_path) or os.path.exists(image_path):
            return image_path
        return os.path.join(self.image_dir, image_path)

    def _load_annotations(self, file_path):
        """Parse the JSON annotation file and return its content."""
        import json

        # Explicit encoding so non-ASCII paths load the same on every OS.
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)

训练代码

def _pooled_boxes(pred_map, num_boxes):
    """Reduce a dense box map to one (x, y, w, h) vector per box per image.

    ``pred_map`` has shape (batch, 5 * num_boxes, H, W). It is averaged over
    the spatial grid, and the fifth (confidence) channel of every 5-tuple is
    dropped because no confidence target is available.

    Returns:
        Tensor of shape (batch, 4 * num_boxes).
    """
    pooled = pred_map.mean(dim=(2, 3)).view(pred_map.size(0), num_boxes, 5)
    return pooled[..., :4].reshape(pred_map.size(0), num_boxes * 4)


def train_model(image_dir='data/train/images',
                annotation_file='data/train/annotations.json',
                epochs=100,
                batch_size=16,
                lr=1e-4,
                output_path='tdgh_yolov7.pth'):
    """Train the TDGH-YOLOv7 model and save its ``state_dict``.

    Args:
        image_dir: Directory with the training images.
        annotation_file: JSON annotation file for the training set.
        epochs: Number of training epochs (also the cosine schedule length).
        batch_size: Mini-batch size.
        lr: Initial AdamW learning rate.
        output_path: Destination of the saved ``state_dict``.

    NOTE(review): assumes every annotation stores one 4-value box per
    face / eye and that all images share one size (required for batching
    without a resize step). Confirm against the annotation schema.
    """
    # Fall back to CPU so the script does not crash on machines without CUDA.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Data loading
    train_dataset = FaceEyeDataset(
        image_dir=image_dir,
        annotation_file=annotation_file
    )

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4
    )

    # Model
    model = TDGHYOLOv7(num_head_poses=9, num_gaze_directions=9)
    model = model.to(device)

    # Loss functions
    bbox_loss = nn.SmoothL1Loss()
    cls_loss = nn.CrossEntropyLoss()

    # Optimizer and schedule
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    # Guard the average against an empty loader.
    num_batches = max(len(train_loader), 1)

    # Training loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0

        for images, targets in train_loader:
            # The dataset yields HWC uint8 arrays, which collate to
            # (B, H, W, 3); the conv stack needs float (B, 3, H, W).
            images = images.to(device).permute(0, 3, 1, 2).float().div(255.0)

            # Forward pass
            outputs = model(images)

            # The dataset provides the two eye boxes under separate keys;
            # join them in the same left-then-right order as the head output.
            eye_target = torch.cat(
                [targets['eye_left_bbox'], targets['eye_right_bbox']], dim=1
            ).to(device)

            # Loss terms: dense box maps are pooled to per-image boxes so
            # their shape matches the per-image targets.
            loss_face = bbox_loss(_pooled_boxes(outputs['face_bbox'], 1),
                                  targets['face_bbox'].to(device))
            loss_head_pose = cls_loss(outputs['head_pose'],
                                      targets['head_pose'].to(device))
            loss_eye = bbox_loss(_pooled_boxes(outputs['eye_bbox'], 2),
                                 eye_target)
            loss_gaze = cls_loss(outputs['gaze_direction'],
                                 targets['gaze_direction'].to(device))

            loss = loss_face + loss_head_pose + loss_eye + loss_gaze

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        scheduler.step()

        print(f"Epoch {epoch+1}, Loss: {total_loss/num_batches:.4f}")

        # Validate every 10 epochs.
        if (epoch + 1) % 10 == 0:
            validate(model)

    # Save the trained weights.
    torch.save(model.state_dict(), output_path)


def validate(model):
    """Put ``model`` into evaluation mode; the validation pass is a stub.

    Args:
        model: Module to validate. It is left in eval mode on return.
    """
    model.eval()
    # TODO: implement the validation loop; no metrics are computed yet.

性能指标

实验结果

| 任务 | 准确率 | 帧率 |
| --- | --- | --- |
| 人脸检测 | 99.2% | 45 FPS |
| 头部姿态分类 | 95.4% | 45 FPS |
| 眼动检测 | 97.1% | 45 FPS |
| 注视方向分类 | 93.8% | 45 FPS |

与 Baseline 对比

| 模型 | 头部姿态准确率 | 注视方向准确率 | 帧率 |
| --- | --- | --- | --- |
| ResNet-50 | 89.3% | 85.2% | 30 FPS |
| YOLOv5 | 91.5% | 87.6% | 40 FPS |
| TDGH-YOLOv7 | 95.4% | 93.8% | 45 FPS |

IMS 开发启示

1. 模型选型

| 场景 | 推荐模型 | 原因 |
| --- | --- | --- |
| 实时性优先 | TDGH-YOLOv7 | 45 FPS 高帧率 |
| 精度优先 | HRNet + Transformer | 更高精度 |
| 嵌入式部署 | MobileNetV3 + 轻量 Transformer | 低功耗 |

2. 部署优化

# ONNX export
def export_to_onnx(model, output_path, input_size=(640, 640), opset_version=11):
    """Export ``model`` to an ONNX file with a dynamic batch dimension.

    Args:
        model: Module returning a dict with the keys ``face_bbox``,
            ``head_pose``, ``eye_bbox`` and ``gaze_direction``, in that
            order.
        output_path: Destination ``.onnx`` file.
        input_size: ``(height, width)`` of the dummy input used for tracing.
        opset_version: ONNX opset to target.

    NOTE(review): recent PyTorch releases may lower attention through an
    operator that needs a newer opset; raise ``opset_version`` if the export
    reports an unsupported operator. Confirm against the PyTorch version in
    use.
    """
    model.eval()

    # Trace on the model's own device; a CPU dummy input fails against a
    # model that was moved to the GPU.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device('cpu')

    height, width = input_size
    dummy_input = torch.randn(1, 3, height, width, device=device)

    # Dict outputs are flattened to their values in insertion order, so this
    # list must follow the order of the keys in the head's returned dict.
    output_names = ['face_bbox', 'head_pose', 'eye_bbox', 'gaze_direction']

    # Only axis 0 (batch) is dynamic, for the input and for every output.
    dynamic_axes = {'image': {0: 'batch_size'}}
    for name in output_names:
        dynamic_axes[name] = {0: 'batch_size'}

    torch.onnx.export(
        model,
        dummy_input,
        output_path,
        opset_version=opset_version,
        input_names=['image'],
        output_names=output_names,
        dynamic_axes=dynamic_axes
    )

    print(f"Model exported to {output_path}")


# TensorRT optimisation
def optimize_with_tensorrt(onnx_path, engine_path, max_batch_size=1):
    """Build an FP16 TensorRT engine from an ONNX file and save it.

    Args:
        onnx_path: Path to the ONNX model.
        engine_path: Destination of the serialized engine.
        max_batch_size: Largest value allowed for dynamic input dimensions
            in the optimization profile.

    Raises:
        RuntimeError: If the ONNX file cannot be parsed or the engine build
            fails.
    """
    import tensorrt as trt

    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)

    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, logger)

    with open(onnx_path, 'rb') as f:
        parsed = parser.parse(f.read())
    if not parsed:
        # parse() reports failure through its return value; surface the
        # parser's messages instead of building from a broken network.
        errors = [str(parser.get_error(i)) for i in range(parser.num_errors)]
        raise RuntimeError(
            f"Failed to parse ONNX model {onnx_path}: " + "; ".join(errors)
        )

    config = builder.create_builder_config()
    workspace_bytes = 1 << 30  # 1GB
    # Newer TensorRT releases replace max_workspace_size with memory pools.
    if hasattr(config, 'set_memory_pool_limit'):
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_bytes)
    else:
        config.max_workspace_size = workspace_bytes
    config.set_flag(trt.BuilderFlag.FP16)  # FP16 precision

    # Inputs with dynamic (-1) dimensions need an optimization profile.
    # NOTE(review): every -1 is treated as a batch-like axis ranging over
    # 1..max_batch_size; this matches an export where only axis 0 is
    # dynamic. Confirm if other axes are dynamic.
    profile = builder.create_optimization_profile()
    needs_profile = False
    for index in range(network.num_inputs):
        tensor = network.get_input(index)
        shape = tuple(tensor.shape)
        if -1 not in shape:
            continue
        smallest = tuple(1 if dim == -1 else dim for dim in shape)
        largest = tuple(max_batch_size if dim == -1 else dim for dim in shape)
        profile.set_shape(tensor.name, smallest, largest, largest)
        needs_profile = True
    if needs_profile:
        config.add_optimization_profile(profile)

    # Prefer the serialized-network API; fall back to build_engine on older
    # releases that do not provide it.
    if hasattr(builder, 'build_serialized_network'):
        serialized = builder.build_serialized_network(network, config)
    else:
        engine = builder.build_engine(network, config)
        serialized = engine.serialize() if engine is not None else None

    if serialized is None:
        raise RuntimeError(f"TensorRT failed to build an engine from {onnx_path}")

    with open(engine_path, 'wb') as f:
        f.write(serialized)

    print(f"TensorRT engine saved to {engine_path}")

3. 边缘设备性能

| 设备 | 帧率 (FP16) | 功耗 |
| --- | --- | --- |
| Jetson Orin NX | 35 FPS | 15W |
| Jetson Xavier NX | 25 FPS | 15W |
| QCS8255 | 30 FPS | 10W |
| Intel Movidius | 15 FPS | 2W |

参考来源:


TDGH-YOLOv7:基于 Transformer 的驾驶员头部姿态与眼动检测模型
https://dapalm.com/2026/06/13/2026-06-13-TDGH-YOLOv7-Head-Pose-Gaze-Detection/
作者
Mars
发布于
2026年6月13日
许可协议