1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
| import torch import torch.nn as nn
class TemporalSelfAttention(nn.Module):
    """Self-attention over the sequence axis with a residual + LayerNorm.

    The input attends to itself (query = key = value), the attention output
    is added back to the input, and the sum is layer-normalized.
    """

    def __init__(self, d_model: int, n_heads: int = 8) -> None:
        """Build the attention and normalization sub-layers.

        Args:
            d_model: Size of the feature (last) dimension.
            n_heads: Number of attention heads passed to
                ``nn.MultiheadAttention``.
        """
        super().__init__()
        # batch_first is left at its default (False), so inputs are
        # sequence-first: [seq_len, batch, d_model].
        self.attention = nn.MultiheadAttention(d_model, n_heads)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply self-attention so each frame can attend to the others.

        Args:
            x: Tensor laid out as [seq_len, batch, d_model].

        Returns:
            Tensor with the same layout as ``x``.
        """
        # The attention weights are never used, so skip computing them;
        # with need_weights=False the second return value is None.
        attn_out, _ = self.attention(x, x, x, need_weights=False)
        # Residual connection followed by layer normalization.
        return self.norm(x + attn_out)
class DrowsinessTransformer(nn.Module):
    """Per-frame CNN features fed through a Transformer encoder and a linear head.

    Each frame of the clip is passed through ``CNNBackbone``; the per-frame
    features are stacked sequence-first, given positional encoding, encoded
    by a stack of ``nn.TransformerEncoderLayer`` and the encoding of the last
    time step is classified.

    NOTE(review): ``CNNBackbone`` and ``PositionalEncoding`` are defined
    outside this block. This class assumes the backbone maps one frame batch
    to features whose last dimension is ``d_model`` -- confirm.
    """

    def __init__(self, d_model=256, n_layers=6, n_heads=8, n_classes=4,
                 dim_feedforward=1024):
        """Build the backbone, encoder stack and classification head.

        Args:
            d_model: Feature width used by the encoder and the classifier.
            n_layers: Number of stacked encoder layers.
            n_heads: Attention heads per encoder layer.
            n_classes: Number of output logits (default 4, as before).
            dim_feedforward: Hidden width of each encoder layer's
                feed-forward sub-layer (default 1024, as before).
        """
        super().__init__()
        self.cnn_backbone = CNNBackbone()
        self.pos_encoding = PositionalEncoding(d_model)
        # batch_first is left at its default (False): the encoder consumes
        # sequence-first input, which is how forward() stacks the features.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=dim_feedforward,
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, n_layers)
        self.classifier = nn.Linear(d_model, n_classes)

    def forward(self, frames):
        """Classify a clip from the encoding of its last frame.

        Args:
            frames: Tensor laid out as [batch, seq_len, C, H, W].

        Returns:
            Output of the classifier applied to the last time step's
            encoding (one row of logits per batch element).
        """
        # unbind(dim=1) yields one [batch, C, H, W] slice per time step;
        # stacking on dim 0 produces a sequence-first feature tensor.
        features = torch.stack(
            [self.cnn_backbone(frame) for frame in frames.unbind(dim=1)],
            dim=0,
        )
        features = self.pos_encoding(features)
        encoded = self.transformer(features)
        # Index -1 on the leading (sequence) axis selects the final frame.
        return self.classifier(encoded[-1])
|