YOLOv11嵌入式疲劳检测:实时性与精度平衡的最优选择

论文来源: arXiv 2509.17498v1, 2025
核心发现: YOLOv11n在嵌入式设备上达到近SOTA精度,延迟最低
应用价值: DMS疲劳检测实时嵌入式部署


YOLO系列演进对比

架构演进

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
┌─────────────────────────────────────────────────────────────────┐
│ YOLO架构演进路线 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ YOLOv5 (2020) │
│ ├─ CSPDarknet骨干 │
│ ├─ PANet颈 │
│ └─ 模块化设计,易部署 │
│ │
│ YOLOv8 (2023) │
│ ├─ C2f模块替代CSP │
│ ├─ Anchor-free检测头 │
│ └─ 任务对齐学习 │
│ │
│ YOLOv9 (2024) │
│ ├─ GELAN架构 │
│ ├─ PGI信息保留 │
│ └─ 可编程梯度信息流 │
│ │
│ YOLOv11 (2024) │
│ ├─ C3k2轻量瓶颈 │
│ ├─ SPPF快速空间金字塔 │
│ ├─ 优化注意力模块 │
│ └─ 精度/效率/稳定性平衡 │
│ │
└─────────────────────────────────────────────────────────────────┘

性能对比表

模型 参数量 mAP@0.5 Recall FPS (CPU) FPS (GPU)
YOLOv5s 7.2M 0.962 0.943 45 280
YOLOv8n 3.2M 0.971 0.956 62 410
YOLOv9t 2.0M 0.964 0.948 58 385
YOLOv9c 25.3M 0.986 0.978 28 195
YOLOv10n 2.3M 0.972 0.961 65 420
YOLOv10l 24.6M 0.984 0.975 32 205
YOLOv11n 2.6M 0.980 0.965 72 450
YOLOv11l 25.3M 0.985 0.972 35 220

疲劳检测场景应用

检测类别

类别 描述 Euro NCAP关联
正常驾驶 眼睛睁开,注视前方 -
闭眼 单次或持续闭眼 F-02 微睡眠
打哈欠 张嘴,哈欠动作 F-05
低头 视线向下 D-01 分心
左顾右盼 视线左右偏移 D-01 分心
眼睛半闭 眼睑下垂 F-04

核心代码实现

1. YOLOv11n疲劳检测模型

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
"""
YOLOv11n疲劳检测模型
针对嵌入式设备优化
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Tuple, List, Optional
import numpy as np


class ConvBlock(nn.Module):
    """Standard convolution block: Conv2d -> BatchNorm2d -> SiLU."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 3,
        stride: int = 1,
        padding: Optional[int] = None,
        groups: int = 1
    ):
        super().__init__()

        # Default to "same"-style padding for odd kernel sizes.
        pad = kernel_size // 2 if padding is None else padding

        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=pad,
            groups=groups,
            bias=False,  # bias is redundant before BatchNorm
        )
        self.bn = nn.BatchNorm2d(out_channels)
        self.act = nn.SiLU(inplace=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply convolution, batch normalization, then SiLU."""
        out = self.conv(x)
        out = self.bn(out)
        return self.act(out)


class C3k2(nn.Module):
    """
    C3k2: YOLOv11 lightweight bottleneck module.

    CSP-style split: two parallel 1x1 projections, one branch runs
    through a stack of 3x3 bottleneck blocks (optionally with residual
    shortcuts), then both branches are concatenated and fused by a
    final 1x1 convolution.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        num_bottlenecks: int = 2,
        expansion: float = 0.5,
        shortcut: bool = True
    ):
        """
        Args:
            in_channels: number of input channels
            out_channels: number of output channels
            num_bottlenecks: how many stacked bottleneck blocks
            expansion: hidden-channel ratio relative to out_channels
            shortcut: add a residual connection around each bottleneck
        """
        super().__init__()

        hidden_channels = int(out_channels * expansion)

        # Fix: the original accepted `shortcut` but never used it.  The
        # residual add is now applied in forward() when enabled; each
        # bottleneck maps hidden_channels -> hidden_channels, so the
        # element-wise add is always shape-valid.
        self.shortcut = shortcut

        # Two parallel 1x1 projections (CSP split).
        self.cv1 = ConvBlock(in_channels, hidden_channels, 1, 1)
        self.cv2 = ConvBlock(in_channels, hidden_channels, 1, 1)

        # Stack of bottleneck blocks applied to the cv1 branch.
        self.bottlenecks = nn.ModuleList([
            self._make_bottleneck(hidden_channels, hidden_channels)
            for _ in range(num_bottlenecks)
        ])

        # Final 1x1 fusion of the two concatenated branches.
        self.cv3 = ConvBlock(hidden_channels * 2, out_channels, 1, 1)

    def _make_bottleneck(
        self,
        in_channels: int,
        out_channels: int
    ) -> nn.Module:
        """Create one bottleneck block: two stacked 3x3 ConvBlocks."""
        return nn.Sequential(
            ConvBlock(in_channels, out_channels, 3, 1),
            ConvBlock(out_channels, out_channels, 3, 1)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        y1 = self.cv1(x)
        y2 = self.cv2(x)

        # Run the first branch through the bottleneck stack, adding the
        # residual when shortcut is enabled.
        for bottleneck in self.bottlenecks:
            out = bottleneck(y1)
            y1 = y1 + out if self.shortcut else out

        # Concatenate both branches and fuse.
        return self.cv3(torch.cat([y1, y2], dim=1))


class SPPF(nn.Module):
    """
    SPPF: fast spatial pyramid pooling.

    Enlarges the receptive field cheaply by chaining three max-pool
    passes and concatenating all intermediate results.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 5
    ):
        super().__init__()

        mid = in_channels // 2
        # 1x1 reduce before pooling; 1x1 fuse after concatenating the
        # input plus three pooled maps (hence mid * 4 input channels).
        self.cv1 = ConvBlock(in_channels, mid, 1, 1)
        self.cv2 = ConvBlock(mid * 4, out_channels, 1, 1)

        # Stride-1 max pooling with "same" padding.
        self.m = nn.MaxPool2d(kernel_size=kernel_size, stride=1, padding=kernel_size // 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        reduced = self.cv1(x)

        # Each pass pools the previous result, growing the receptive field.
        pooled = [reduced]
        for _ in range(3):
            pooled.append(self.m(pooled[-1]))

        return self.cv2(torch.cat(pooled, dim=1))


class AttentionModule(nn.Module):
    """
    Lightweight attention module.

    Applies channel attention first, then spatial attention.
    """

    def __init__(
        self,
        channels: int,
        reduction: int = 16
    ):
        super().__init__()

        squeezed = channels // reduction

        # Channel attention: global average pool -> bottleneck MLP ->
        # sigmoid gate, one scalar per channel.
        self.channel_attention = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(channels, squeezed, 1),
            nn.SiLU(inplace=True),
            nn.Conv2d(squeezed, channels, 1),
            nn.Sigmoid()
        )

        # Spatial attention: 7x7 conv collapses channels to one gate map.
        self.spatial_attention = nn.Sequential(
            nn.Conv2d(channels, 1, 7, padding=3),
            nn.Sigmoid()
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Gate channels first, then gate spatial positions of the result.
        gated = x * self.channel_attention(x)
        return gated * self.spatial_attention(gated)


class DetectHead(nn.Module):
    """
    Detection head (anchor-free design).

    One small convolutional tower per input scale; each tower emits
    (num_classes + 5) * num_anchors channels per cell: 4 box values,
    1 objectness score and num_classes class scores per anchor.
    """

    def __init__(
        self,
        in_channels: List[int],
        num_classes: int,
        num_anchors: int = 1
    ):
        """
        Args:
            in_channels: channel count of each input feature scale
            num_classes: number of detection classes
            num_anchors: anchors per cell (1 for anchor-free)
        """
        super().__init__()

        self.num_classes = num_classes
        self.num_anchors = num_anchors

        # One detection tower per scale: two 3x3 ConvBlocks followed by
        # a 1x1 projection to the raw prediction channels.
        self.heads = nn.ModuleList([
            nn.Sequential(
                ConvBlock(c, c, 3, 1),
                ConvBlock(c, c, 3, 1),
                nn.Conv2d(c, (num_classes + 5) * num_anchors, 1)
            )
            for c in in_channels
        ])

    def forward(
        self,
        features: List[torch.Tensor]
    ) -> List[torch.Tensor]:
        """
        Args:
            features: multi-scale feature maps, one per head

        Returns:
            detections: raw prediction map for each scale
        """
        # Idiom fix: the original enumerated the (feature, head) pairs
        # but never used the index; a zip comprehension is the direct form.
        return [head(feat) for feat, head in zip(features, self.heads)]


class YOLOv11nFatigue(nn.Module):
    """
    YOLOv11n fatigue-detection model.

    Lightweight detection network sized for embedded devices: a
    stride-32 backbone with C3k2 blocks and SPPF, feeding an
    anchor-free multi-scale detection head.
    """

    def __init__(
        self,
        num_classes: int = 6,  # number of fatigue classes
        width_mult: float = 0.25,  # channel-width scaling factor
        depth_mult: float = 0.34  # depth scaling factor; NOTE(review): accepted but never used below
    ):
        super().__init__()

        # Base channel count after width scaling (64 * 0.25 = 16 by default).
        base_channels = int(64 * width_mult)

        # Backbone: five stride-2 stages; the P3/P4/P5 outputs are
        # collected in forward() by layer index.
        self.backbone = nn.Sequential(
            # P1/2
            ConvBlock(3, base_channels, 3, 2),
            # P2/4
            ConvBlock(base_channels, base_channels * 2, 3, 2),
            C3k2(base_channels * 2, base_channels * 2),
            # P3/8
            ConvBlock(base_channels * 2, base_channels * 4, 3, 2),
            C3k2(base_channels * 4, base_channels * 4),
            # P4/16
            ConvBlock(base_channels * 4, base_channels * 8, 3, 2),
            C3k2(base_channels * 8, base_channels * 8),
            # P5/32
            ConvBlock(base_channels * 8, base_channels * 16, 3, 2),
            C3k2(base_channels * 16, base_channels * 16),
            SPPF(base_channels * 16, base_channels * 16)
        )

        # Neck (FPN-style upsampling path).
        # NOTE(review): this neck is constructed but never called in
        # forward(); as a plain Sequential it also could not perform the
        # skip-connection concatenation its C3k2 input widths imply.
        self.neck = nn.Sequential(
            # upsample
            nn.Upsample(scale_factor=2, mode='nearest'),
            C3k2(base_channels * 16 + base_channels * 8, base_channels * 8),

            nn.Upsample(scale_factor=2, mode='nearest'),
            C3k2(base_channels * 8 + base_channels * 4, base_channels * 4),
        )

        # Detection head over the P3/P4/P5 backbone channel widths.
        self.detect = DetectHead(
            [base_channels * 4, base_channels * 8, base_channels * 16],
            num_classes
        )

    def forward(
        self,
        x: torch.Tensor
    ) -> List[torch.Tensor]:
        """
        Args:
            x: (batch, 3, H, W) input image tensor

        Returns:
            detections: list of raw per-scale detection maps
        """
        # Backbone: collect intermediate feature maps by layer index.
        features = []
        for i, layer in enumerate(self.backbone):
            x = layer(x)
            if i in [4, 6, 9]:  # P3, P4, P5
                features.append(x)

        # Neck
        # FPN + PAN
        # NOTE(review): self.neck is never applied here; the backbone
        # features go straight to the head.  Confirm whether the
        # FPN/PAN fusion was intentionally omitted.

        # Detection head on the raw backbone features.
        outputs = self.detect(features)

        return outputs


# 测试
if __name__ == "__main__":
# 创建模型
model = YOLOv11nFatigue(num_classes=6)

# 模拟输入
x = torch.randn(1, 3, 640, 640)

# 前向传播
outputs = model(x)

print("=== YOLOv11n疲劳检测模型测试 ===")
print(f"输入形状: {x.shape}")
print(f"输出尺度数: {len(outputs)}")
for i, out in enumerate(outputs):
print(f" 尺度{i+1}: {out.shape}")

print(f"\n参数量: {sum(p.numel() for p in model.parameters()):,}")
print(f"模型大小: {sum(p.numel() for p in model.parameters()) * 4 / 1024 / 1024:.2f} MB")

2. 嵌入式部署优化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
"""
嵌入式部署优化
针对Qualcomm/ARM平台优化
"""

import time
from typing import Dict, List, Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class EmbeddedYOLOv11n(nn.Module):
    """
    Embedded-optimized YOLOv11n.

    Optimizations:
    1. Quantization-friendly design (ReLU6 in place of SiLU)
    2. Memory efficiency
    3. Fusion-friendly Conv + BN + activation stages
    """

    def __init__(
        self,
        num_classes: int = 6,
        input_size: Tuple[int, int] = (320, 320)
    ):
        super().__init__()

        self.input_size = input_size
        self.num_classes = num_classes

        # ReLU6 is used throughout as a quantization-friendly
        # replacement for SiLU.
        self.backbone = self._build_backbone()
        self.head = self._build_head()

    def _build_backbone(self) -> nn.Module:
        """Build the quantization-friendly backbone: five stride-2 stages."""
        stage_channels = [3, 32, 64, 128, 256, 512]
        layers = []
        for c_in, c_out in zip(stage_channels[:-1], stage_channels[1:]):
            # Each stage halves the spatial resolution.
            layers.append(nn.Conv2d(c_in, c_out, 3, 2, 1, bias=False))
            layers.append(nn.BatchNorm2d(c_out))
            layers.append(nn.ReLU6(inplace=True))  # ReLU6 quantizes well
        return nn.Sequential(*layers)

    def _build_head(self) -> nn.Module:
        """Build the lightweight detection head."""
        return nn.Sequential(
            nn.Conv2d(512, 256, 1, 1, 0, bias=False),
            nn.BatchNorm2d(256),
            nn.ReLU6(inplace=True),
            # Single output map: (num_classes + 5) channels per cell.
            nn.Conv2d(256, (self.num_classes + 5), 1, 1, 0)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run backbone then head; returns a raw (B, C+5, H/32, W/32) map."""
        features = self.backbone(x)
        return self.head(features)


class FatigueDetector:
    """
    Real-time fatigue detector.

    Bundles preprocessing, inference (TorchScript or ONNX Runtime with
    the QNN execution provider), and postprocessing into one pipeline.
    """

    # Class names, index-aligned with the model's class score outputs.
    CLASS_NAMES = [
        'normal', # normal driving
        'closed_eyes', # eyes closed
        'yawning', # yawning
        'looking_down', # head down
        'looking_away', # looking left/right
        'half_closed' # eyes half closed
    ]

    def __init__(
        self,
        model_path: str,
        device: str = 'cpu',
        conf_threshold: float = 0.5
    ):
        """
        Args:
            model_path: path to the model file (TorchScript or ONNX)
            device: execution device ('cpu', 'cuda', 'qnn')
            conf_threshold: default confidence threshold for detections
        """
        self.device = device
        self.conf_threshold = conf_threshold

        # Select the inference backend at construction time.
        if device == 'qnn':
            # QNN backend via ONNX Runtime; falls back to the CPU
            # execution provider if QNN is unavailable.
            import onnxruntime as ort
            self.session = ort.InferenceSession(
                model_path,
                providers=['QNNExecutionProvider', 'CPUExecutionProvider']
            )
            self.inference_fn = self._infer_onnx
        else:
            # PyTorch path: load a TorchScript module.
            self.model = torch.jit.load(model_path, map_location=device)
            self.model.eval()
            self.inference_fn = self._infer_torch

        # Fixed network input size used by preprocess/postprocess.
        self.input_size = (320, 320)

    def preprocess(
        self,
        image: np.ndarray
    ) -> np.ndarray:
        """
        Preprocess a frame for the network.

        Args:
            image: BGR image (OpenCV convention)

        Returns:
            tensor: float32 array of shape (1, 3, H, W), scaled to [0, 1]
        """
        import cv2

        # Resize to the fixed network input size.
        image = cv2.resize(image, self.input_size)

        # BGR -> RGB
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Scale pixel values to [0, 1].
        image = image.astype(np.float32) / 255.0

        # HWC -> CHW
        image = image.transpose(2, 0, 1)

        # Add the batch dimension.
        tensor = np.expand_dims(image, 0)

        return tensor

    def _infer_torch(self, tensor: np.ndarray) -> np.ndarray:
        """Run inference through the TorchScript model."""
        with torch.no_grad():
            input_tensor = torch.from_numpy(tensor).to(self.device)
            output = self.model(input_tensor)
        return output.cpu().numpy()

    def _infer_onnx(self, tensor: np.ndarray) -> np.ndarray:
        """Run inference through ONNX Runtime."""
        input_name = self.session.get_inputs()[0].name
        output = self.session.run(None, {input_name: tensor})
        return output[0]

    def postprocess(
        self,
        output: np.ndarray,
        original_size: Tuple[int, int],
        conf_threshold: float
    ) -> list[dict]:
        """
        Decode the raw network output into detections.

        Args:
            output: raw model output, shape (1, num_classes+5, H, W)
            original_size: (height, width) of the original image
            conf_threshold: minimum class score to keep a prediction

        Returns:
            detections: list of dicts with bbox / class_id / class_name /
                confidence keys
        """
        detections = []

        # Output shape: (1, num_classes+5, H, W)
        # Reshape to (H*W, num_classes+5) — one prediction row per cell.
        output = output[0].transpose(1, 2, 0)
        output = output.reshape(-1, output.shape[-1])

        # Decode each cell's prediction.
        for pred in output:
            # Class scores start at index 5.
            # NOTE(review): the objectness score at pred[4] is ignored and
            # the scores are used raw (no sigmoid/softmax) — confirm this
            # matches the exported model's output format.
            class_scores = pred[5:]
            class_id = np.argmax(class_scores)
            confidence = class_scores[class_id]

            if confidence < conf_threshold:
                continue

            # Bounding box — assumes (cx, cy, w, h) in input-pixel units;
            # TODO confirm against the model export.
            x, y, w, h = pred[:4]

            # Scale back to the original image size.
            orig_h, orig_w = original_size
            scale_x = orig_w / self.input_size[1]
            scale_y = orig_h / self.input_size[0]

            x = int(x * scale_x)
            y = int(y * scale_y)
            w = int(w * scale_x)
            h = int(h * scale_y)

            detections.append({
                'bbox': [x - w // 2, y - h // 2, w, h],  # top-left x/y, w, h
                'class_id': int(class_id),
                'class_name': self.CLASS_NAMES[class_id],
                'confidence': float(confidence)
            })

        return detections

    def detect(
        self,
        image: np.ndarray
    ) -> list[dict]:
        """
        Full detection pipeline: preprocess -> infer -> postprocess.

        Args:
            image: input BGR image

        Returns:
            detections: list of detection dicts (see postprocess)
        """
        original_size = image.shape[:2]

        # Preprocess
        tensor = self.preprocess(image)

        # Inference via the backend chosen in __init__.
        output = self.inference_fn(tensor)

        # Postprocess
        detections = self.postprocess(
            output,
            original_size,
            self.conf_threshold
        )

        return detections

    def benchmark(
        self,
        image: np.ndarray,
        num_runs: int = 100
    ) -> Dict[str, float]:
        """
        Benchmark end-to-end detection latency.

        Args:
            image: input image used for every run
            num_runs: number of timed runs (after 10 warm-up runs)

        Returns:
            metrics: mean/std/p95 latency in ms and derived FPS
        """
        # Warm-up runs (populate caches, JIT, etc.).
        for _ in range(10):
            self.detect(image)

        # Timed runs.
        latencies = []

        for _ in range(num_runs):
            start = time.perf_counter()
            self.detect(image)
            end = time.perf_counter()
            latencies.append((end - start) * 1000)  # seconds -> ms

        return {
            'mean_ms': np.mean(latencies),
            'std_ms': np.std(latencies),
            'p95_ms': np.percentile(latencies, 95),
            'fps': 1000 / np.mean(latencies)
        }


# 测试
if __name__ == "__main__":
# 创建模型
model = EmbeddedYOLOv11n(num_classes=6)

# 导出为TorchScript
example_input = torch.randn(1, 3, 320, 320)
traced_model = torch.jit.trace(model, example_input)
traced_model.save("fatigue_detector.pt")

print("=== 嵌入式YOLOv11n测试 ===")
print(f"参数量: {sum(p.numel() for p in model.parameters()):,}")
print(f"模型大小: {sum(p.numel() for p in model.parameters()) * 4 / 1024 / 1024:.2f} MB")

# 性能测试
import numpy as np
detector = FatigueDetector("fatigue_detector.pt", device="cpu")

# 模拟图像
image = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)

metrics = detector.benchmark(image, num_runs=50)

print(f"\n性能指标 (CPU):")
print(f" 平均延迟: {metrics['mean_ms']:.2f} ms")
print(f" P95延迟: {metrics['p95_ms']:.2f} ms")
print(f" FPS: {metrics['fps']:.1f}")

实验结果

嵌入式平台性能

平台 YOLOv5s YOLOv8n YOLOv11n 单位
QCS8255 (NPU) 28 35 45 FPS
Jetson Nano 18 25 32 FPS
Raspberry Pi 4 5 8 12 FPS
Snapdragon 8 Gen 2 45 58 72 FPS

精度对比

模型 mAP@0.5 Recall 参数量
YOLOv5s 0.962 0.943 7.2M
YOLOv8n 0.971 0.956 3.2M
YOLOv9c 0.986 0.978 25.3M
YOLOv11n 0.980 0.965 2.6M

IMS 开发建议

部署选择

场景 推荐模型 延迟要求 精度要求
高端车型 (QCS8775) YOLOv9c ≤25ms ≥98%
标准车型 (QCS8255) YOLOv11n ≤22ms ≥97%
入门车型 (低成本芯片) YOLOv11n ≤50ms ≥95%

Euro NCAP场景覆盖

Euro NCAP YOLO检测类别 检测难度
F-01 PERCLOS closed_eyes + half_closed ⭐⭐
F-02 微睡眠 closed_eyes ⭐⭐⭐
F-04 眼睑下垂 half_closed ⭐⭐⭐⭐
F-05 打哈欠 yawning ⭐⭐
D-01 长时间分心 looking_down + looking_away ⭐⭐

总结

维度 内容
最优模型 YOLOv11n(嵌入式平衡)
性能 72 FPS (Snapdragon), mAP@0.5 = 0.980
参数量 2.6M,模型大小 ~10MB
部署友好 量化友好设计,支持INT8
IMS适用性 实时疲劳检测,覆盖主要Euro NCAP场景

发布时间: 2026-04-22
标签: #YOLOv11 #疲劳检测 #嵌入式部署 #实时推理 #IMS


YOLOv11嵌入式疲劳检测:实时性与精度平衡的最优选择
https://dapalm.com/2026/04/22/2026-04-22-yolov11-embedded-fatigue-detection/
作者
Mars
发布于
2026年4月22日
许可协议