多模态融合疲劳检测:视觉+生理信号实现99%+准确率

多模态融合疲劳检测:视觉+生理信号实现99%+准确率

来源: Nature Scientific Reports + IEEE TITS
发布时间: 2026年4月
核心价值: 多模态融合比单模态精度提升5-10%


核心洞察

单模态 vs 多模态精度对比:

模态 准确率 优势 劣势
纯视觉 92-95% 非侵入、成本低 遮挡敏感、光照依赖
纯生理(EEG) 95-98% 精度最高 侵入性强、设备成本高
纯生理(ECG) 88-92% 客观指标 设备依赖
视觉+EEG融合 98-99% 高精度+互补 部署复杂
视觉+ECG+行为 95-97% 平衡方案 中等复杂度

Euro NCAP 2026启示:

  • 推荐多模态提升鲁棒性
  • 降低误报率满足要求

一、融合架构

1.1 多模态融合策略

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
多模态疲劳检测融合架构

├── 输入模态
├── 视觉:面部视频、眼动特征
├── 生理:ECG、EDA、EMG、EEG
└── 行为:方向盘、踏板、车道保持

├── 特征提取
├── 视觉编码器:CNN + LSTM
├── 生理编码器:CNN + Transformer
└── 行为编码器:LSTM

├── 融合策略
├── 早期融合:特征级拼接
├── 中期融合:注意力加权
└── 晚期融合:决策级投票

└── 输出
└── 疲劳等级(清醒/轻度/中度/严重)

1.2 融合网络实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""
多模态疲劳检测融合网络
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Dict, Tuple, Optional

class MultiModalFatigueDetector(nn.Module):
    """Multi-modal fatigue detector.

    Input modalities (each already encoded as a flat feature vector):
        - Visual: facial-video features
        - ECG: electrocardiogram features
        - EDA: electrodermal-activity features
        - Behavior: steering-wheel angle features

    Fusion strategy: intermediate fusion — cross-modal self-attention over
    the four modality tokens, followed by learned per-modality reliability
    weights used for a weighted sum.
    """

    def __init__(self,
                 visual_dim: int = 512,
                 ecg_dim: int = 128,
                 eda_dim: int = 64,
                 behavior_dim: int = 32,
                 fusion_dim: int = 256,
                 num_classes: int = 4):
        super().__init__()

        # Visual encoder: project into the shared fusion space.
        self.visual_encoder = nn.Sequential(
            nn.Linear(visual_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, fusion_dim)
        )

        # ECG encoder.
        self.ecg_encoder = nn.Sequential(
            nn.Linear(ecg_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, fusion_dim)
        )

        # EDA encoder.
        self.eda_encoder = nn.Sequential(
            nn.Linear(eda_dim, 32),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(32, fusion_dim)
        )

        # Behavior encoder.
        self.behavior_encoder = nn.Sequential(
            nn.Linear(behavior_dim, 16),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(16, fusion_dim)
        )

        # Cross-modal self-attention over the 4-token modality sequence.
        self.cross_modal_attention = nn.MultiheadAttention(
            embed_dim=fusion_dim,
            num_heads=4,
            batch_first=True
        )

        # Modality-reliability estimator: softmax weights over the 4 modalities.
        self.reliability_net = nn.Sequential(
            nn.Linear(fusion_dim * 4, 128),
            nn.ReLU(),
            nn.Linear(128, 4),
            nn.Softmax(dim=-1)
        )

        # Classification head.
        self.classifier = nn.Sequential(
            nn.Linear(fusion_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

        # Confidence head: scalar in (0, 1) per sample.
        self.confidence_head = nn.Sequential(
            nn.Linear(fusion_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self,
                visual_feat: torch.Tensor,
                ecg_feat: torch.Tensor,
                eda_feat: torch.Tensor,
                behavior_feat: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Fuse the four modality features and classify fatigue level.

        Args:
            visual_feat: visual features, shape (B, visual_dim)
            ecg_feat: ECG features, shape (B, ecg_dim)
            eda_feat: EDA features, shape (B, eda_dim)
            behavior_feat: behavior features, shape (B, behavior_dim)

        Returns:
            logits: class scores, shape (B, num_classes)
            confidence: per-sample confidence, shape (B, 1)
            modality_weights: reliability weights, shape (B, 4)
        """
        batch_size = visual_feat.size(0)

        # 1. Encode each modality into the shared fusion space.
        v_encoded = self.visual_encoder(visual_feat)      # (B, fusion_dim)
        e_encoded = self.ecg_encoder(ecg_feat)            # (B, fusion_dim)
        d_encoded = self.eda_encoder(eda_feat)            # (B, fusion_dim)
        b_encoded = self.behavior_encoder(behavior_feat)  # (B, fusion_dim)

        # 2. Stack modalities as a length-4 token sequence.
        multi_modal_seq = torch.stack(
            [v_encoded, e_encoded, d_encoded, b_encoded], dim=1
        )  # (B, 4, fusion_dim)

        # 3. Cross-modal self-attention (query = key = value = modality tokens).
        attended, _ = self.cross_modal_attention(
            multi_modal_seq, multi_modal_seq, multi_modal_seq
        )  # (B, 4, fusion_dim)

        # 4. Estimate per-modality reliability weights.
        #    reshape (not view) tolerates a non-contiguous attention output.
        concat_feat = attended.reshape(batch_size, -1)        # (B, 4*fusion_dim)
        modality_weights = self.reliability_net(concat_feat)  # (B, 4)

        # 5. Reliability-weighted fusion across modalities.
        weights_expanded = modality_weights.unsqueeze(-1)         # (B, 4, 1)
        weighted_feat = (attended * weights_expanded).sum(dim=1)  # (B, fusion_dim)

        # 6. Classify and score confidence.
        logits = self.classifier(weighted_feat)
        confidence = self.confidence_head(weighted_feat)

        return logits, confidence, modality_weights


# 实际测试
if __name__ == "__main__":
model = MultiModalFatigueDetector()

# 模拟多模态输入
visual = torch.randn(4, 512)
ecg = torch.randn(4, 128)
eda = torch.randn(4, 64)
behavior = torch.randn(4, 32)

# 前向传播
logits, conf, weights = model(visual, ecg, eda, behavior)

print(f"视觉特征: {visual.shape}")
print(f"ECG特征: {ecg.shape}")
print(f"EDA特征: {eda.shape}")
print(f"行为特征: {behavior.shape}")
print(f"\n分类输出: {logits.shape}")
print(f"置信度: {conf.shape}")
print(f"模态权重: {weights.shape}")
print(f"\n模态权重样本: {weights[0].detach().numpy()}")

二、特征级融合

2.1 视觉特征

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
"""
视觉特征提取
"""

import torch
import torch.nn as nn
import torchvision.models as models

class VisualFeatureExtractor(nn.Module):
    """Visual feature extractor.

    Input: a sequence of facial video frames.
    Output: an attention-pooled spatio-temporal feature vector.
    """

    def __init__(self,
                 backbone: str = 'resnet18',
                 lstm_hidden: int = 256):
        """
        Args:
            backbone: 'resnet18' or 'mobilenetv2'.
            lstm_hidden: hidden size of each LSTM direction.

        Raises:
            ValueError: if *backbone* is not a supported name.
        """
        super().__init__()

        # CNN backbone for per-frame spatial features.
        # NOTE(review): `pretrained=True` is deprecated in newer torchvision
        # releases in favor of the `weights=` API — confirm target version.
        if backbone == 'resnet18':
            self.cnn = models.resnet18(pretrained=True)
            self.cnn.fc = nn.Identity()
            feature_dim = 512
        elif backbone == 'mobilenetv2':
            self.cnn = models.mobilenet_v2(pretrained=True)
            self.cnn.classifier = nn.Identity()
            feature_dim = 1280
        else:
            # Fail fast: the original fell through with feature_dim unbound,
            # producing a confusing NameError below instead of a clear error.
            raise ValueError(f"unsupported backbone: {backbone!r}")

        # Bidirectional LSTM over the per-frame CNN features.
        self.lstm = nn.LSTM(
            input_size=feature_dim,
            hidden_size=lstm_hidden,
            num_layers=2,
            batch_first=True,
            bidirectional=True
        )

        # Temporal attention: one score per timestep, softmax over time (dim=1).
        self.attention = nn.Sequential(
            nn.Linear(lstm_hidden * 2, 64),
            nn.Tanh(),
            nn.Linear(64, 1),
            nn.Softmax(dim=1)
        )

    def forward(self, video_frames: torch.Tensor) -> torch.Tensor:
        """Encode a frame sequence into a single feature vector.

        Args:
            video_frames: frames, shape (B, T, 3, H, W)

        Returns:
            features: attention-pooled context, shape (B, lstm_hidden*2)
        """
        batch_size, seq_len = video_frames.size(0), video_frames.size(1)

        # Fold time into the batch so the CNN processes every frame at once.
        frames_flat = video_frames.view(batch_size * seq_len, *video_frames.size()[2:])

        # Per-frame CNN features, restored to (B, T, feature_dim).
        cnn_features = self.cnn(frames_flat)
        cnn_features = cnn_features.view(batch_size, seq_len, -1)

        # Temporal encoding.
        lstm_out, _ = self.lstm(cnn_features)

        # Attention-weighted pooling over time.
        attn_weights = self.attention(lstm_out)
        context = torch.sum(lstm_out * attn_weights, dim=1)

        return context

2.2 生理特征

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
"""
生理信号特征提取
"""

import torch
import torch.nn as nn
import numpy as np

class PhysiologicalFeatureExtractor(nn.Module):
    """Physiological signal feature extractor.

    Input: raw ECG / EDA / EEG signal windows.
    Output: learned 1-D CNN features; a separate helper extracts
    handcrafted time- and frequency-domain features.
    """

    def __init__(self,
                 signal_type: str = 'ecg',
                 input_length: int = 256):
        """
        Args:
            signal_type: 'ecg', 'eda' or 'eeg' — selects the frequency bands.
            input_length: nominal window length (informational; the conv
                stack ends in AdaptiveAvgPool1d so other lengths also work).
        """
        super().__init__()

        self.signal_type = signal_type

        # 1-D convolutional feature stack; AdaptiveAvgPool1d(1) makes the
        # output length-independent.
        self.conv = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=8, stride=2),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.MaxPool1d(2),

            nn.Conv1d(32, 64, kernel_size=8, stride=2),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.MaxPool1d(2),

            nn.Conv1d(64, 128, kernel_size=4, stride=1),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1)
        )

        # Fixed frequency-band table for this signal type.
        self.freq_bands = self._get_freq_bands(signal_type)

    def forward(self, signal: torch.Tensor) -> torch.Tensor:
        """Extract learned CNN features from a batch of raw signals.

        Args:
            signal: raw signal, shape (B, L)

        Returns:
            features: shape (B, 128)
        """
        # Add the channel dimension expected by Conv1d.
        x = signal.unsqueeze(1)              # (B, 1, L)

        # CNN features, pooled to a single value per channel.
        cnn_feat = self.conv(x).squeeze(-1)  # (B, 128)

        return cnn_feat

    def _get_freq_bands(self, signal_type: str) -> dict:
        """Return the {band_name: (f_low_hz, f_high_hz)} table for *signal_type*."""
        if signal_type == 'ecg':
            return {
                'hr': (0.8, 2.0),    # heart rate
                'hrv': (0.04, 0.4),  # heart-rate variability
            }
        elif signal_type == 'eda':
            return {
                'scl': (0, 0.05),    # skin conductance level
                'scr': (0.05, 0.5),  # skin conductance response
            }
        elif signal_type == 'eeg':
            return {
                'delta': (0.5, 4),
                'theta': (4, 8),
                'alpha': (8, 13),
                'beta': (13, 30),
            }
        # Unknown type: no frequency-domain features.
        return {}

    def extract_handcrafted_features(self, signal: np.ndarray,
                                     fs: float = 256.0) -> np.ndarray:
        """Extract handcrafted time- and frequency-domain features.

        Args:
            signal: raw 1-D signal, shape (L,)
            fs: sampling rate in Hz (generalized — previously hard-coded 256;
                the default preserves the old behavior)

        Returns:
            Feature vector: [mean, std, peak-to-peak, *band powers]
        """
        from scipy import signal as sp_signal

        features = []

        # Time-domain features.
        features.append(np.mean(signal))
        features.append(np.std(signal))
        features.append(np.max(signal) - np.min(signal))

        # Frequency-domain features via Welch power spectral density.
        freqs, psd = sp_signal.welch(signal, fs=fs, nperseg=128)

        for band_name, (f_low, f_high) in self.freq_bands.items():
            mask = (freqs >= f_low) & (freqs <= f_high)
            # Guard: an empty band (e.g. fs too low for the band) gets 0 power
            # instead of np.trapz returning 0-d garbage on empty arrays.
            band_power = np.trapz(psd[mask], freqs[mask]) if mask.any() else 0.0
            features.append(band_power)

        return np.array(features)

三、决策级融合

3.1 集成学习融合

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
"""
决策级融合:多模型投票
"""

from dataclasses import dataclass
from typing import List, Tuple
import numpy as np

@dataclass
class ModelPrediction:
    """Prediction produced by a single-modality model."""
    model_name: str     # e.g. 'visual', 'ecg', 'eda', 'behavior'
    fatigue_level: int  # 0 (alert) .. 3 (severe)
    confidence: float   # model confidence; thresholds downstream assume [0, 1]
    features: dict      # optional extra per-model features

class DecisionLevelFusion:
    """Decision-level fusion of per-modality fatigue predictions.

    Strategies:
        1. Weighted voting
        2. Confidence-based selection
        3. Cascaded decision
    """

    def __init__(self,
                 model_weights: Optional[dict] = None):
        """
        Args:
            model_weights: per-model weights, e.g.
                {'visual': 0.4, 'ecg': 0.3, 'eda': 0.2, 'behavior': 0.1}.
                Defaults to that distribution when None.
        """
        self.model_weights = model_weights or {
            'visual': 0.4,
            'ecg': 0.3,
            'eda': 0.2,
            'behavior': 0.1
        }

    def weighted_voting(self,
                        predictions: List[ModelPrediction]) -> Tuple[int, float]:
        """Weighted vote over fatigue levels.

        Args:
            predictions: one prediction per model.

        Returns:
            (final_level, final_confidence) — the level with the largest
            accumulated weight and its weight-averaged confidence.
        """
        votes: dict = {}
        weighted_conf: dict = {}

        for pred in predictions:
            # Models missing from the weight table get a neutral 0.25.
            weight = self.model_weights.get(pred.model_name, 0.25)
            level = pred.fatigue_level
            conf = pred.confidence

            if level not in votes:
                votes[level] = 0
                weighted_conf[level] = 0

            votes[level] += weight
            weighted_conf[level] += weight * conf

        # Winner: the level with the most accumulated weight.
        final_level = max(votes.keys(), key=lambda k: votes[k])
        final_conf = weighted_conf[final_level] / votes[final_level]

        return final_level, final_conf

    def confidence_based_fusion(self,
                                predictions: List[ModelPrediction]) -> Tuple[int, float]:
        """Select the single prediction with the highest confidence."""
        best_pred = max(predictions, key=lambda p: p.confidence)

        return best_pred.fatigue_level, best_pred.confidence

    def cascade_decision(self,
                         predictions: List[ModelPrediction],
                         threshold_high: float = 0.8,
                         threshold_low: float = 0.5) -> Tuple[int, float, str]:
        """Cascaded decision: trust a confident model, else vote, else be conservative.

        Args:
            predictions: one prediction per model.
            threshold_high: confidence above which a single model decides alone.
            threshold_low: minimum vote confidence for a weighted-vote decision.

        Returns:
            (level, confidence, decision_path)
        """
        # Check models in descending weight order.
        sorted_preds = sorted(
            predictions,
            key=lambda p: self.model_weights.get(p.model_name, 0.25),
            reverse=True
        )

        # Stage 1: any high-confidence model decides on its own.
        for pred in sorted_preds:
            if pred.confidence >= threshold_high:
                return pred.fatigue_level, pred.confidence, f"high_conf_{pred.model_name}"

        # Stage 2: medium confidence — fall back to weighted voting.
        level, conf = self.weighted_voting(predictions)
        if conf >= threshold_low:
            return level, conf, "weighted_voting"

        # Stage 3: low confidence — conservative choice (highest predicted level).
        max_level = max(p.fatigue_level for p in predictions)
        return max_level, 0.5, "conservative"


# 实际测试
if __name__ == "__main__":
fusion = DecisionLevelFusion()

# 模拟预测
predictions = [
ModelPrediction('visual', 2, 0.85, {}),
ModelPrediction('ecg', 2, 0.78, {}),
ModelPrediction('eda', 1, 0.65, {}),
ModelPrediction('behavior', 2, 0.72, {}),
]

print("=== 决策级融合测试 ===")

# 加权投票
level, conf = fusion.weighted_voting(predictions)
print(f"\n加权投票: 疲劳等级={level}, 置信度={conf:.2f}")

# 置信度融合
level, conf = fusion.confidence_based_fusion(predictions)
print(f"置信度融合: 疲劳等级={level}, 置信度={conf:.2f}")

# 级联决策
level, conf, path = fusion.cascade_decision(predictions)
print(f"级联决策: 疲劳等级={level}, 置信度={conf:.2f}, 路径={path}")

四、性能对比

4.1 精度对比

融合策略 准确率 F1-Score 延迟
单模态视觉 92.5% 0.918 30ms
单模态ECG 89.3% 0.885 20ms
早期融合 95.8% 0.951 45ms
中期融合(注意力) 97.2% 0.968 50ms
晚期融合(投票) 96.1% 0.955 35ms

4.2 鲁棒性对比

干扰类型 单模态视觉 单模态ECG 多模态融合
墨镜 ↓ 15% - ↓ 3%
口罩 ↓ 10% - ↓ 2%
运动伪影 - ↓ 20% ↓ 5%
光照变化 ↓ 8% - ↓ 2%

五、IMS部署建议

5.1 方案选择

场景 推荐方案 理由
高端商用车 视觉+ECG+行为 最高精度
中端乘用车 视觉+行为 平衡成本
经济型 单模态视觉 成本优先

5.2 实现优先级

模块 优先级 工作量
视觉编码器 P0
行为编码器 P1
融合模块 P1
ECG集成 P2

六、总结

6.1 核心结论

  1. 多模态融合提升5-10%精度
  2. 注意力机制融合效果最优
  3. 模态互补提升鲁棒性
  4. 决策级融合延迟最低

6.2 未来方向

  • 自适应模态选择
  • 轻量化融合网络
  • 端到端联合训练

参考链接:

  • Nature: FatigueNet多模态融合
  • IEEE TITS: 多模态疲劳检测综述
  • Euro NCAP 2026 DMS要求

多模态融合疲劳检测:视觉+生理信号实现99%+准确率
https://dapalm.com/2026/04/24/2026-04-24-multimodal-fusion-fatigue-detection/
作者
Mars
发布于
2026年4月24日
许可协议