DMD 驾驶员监控数据集：最大规模开源多模态 DMS 训练数据

发表于 2026-04-15 更新于 2026-04-25 分类于技术研究

发布时间： 2026-04-15
关键词： DMD、Dataset、DMS、开源、多模态、RGB+IR+Depth

数据集概览

DMD（Driver Monitoring Dataset） 是目前最大规模的驾驶员监控开源数据集：

特点	数据
总时长	41 小时
多摄像头	身体、面部、手部同步
多模态	RGB + Depth + IR
场景	真车 + 驾驶模拟器
标注	分心、疲劳、视线、手部交互

数据集结构

场景分类

┌─────────────────────────────────────────────────────┐
│           DMD 数据集场景分类                        │
├─────────────────────────────────────────────────────┤
│                                                     │
│   分心场景 (Distraction)                           │
│   ─────────────────────                            │
│   • 手机使用（打电话、发短信）                     │
│   • 调整收音机/导航                                │
│   • 饮食/喝水                                      │
│   • 与乘客交谈                                     │
│   • 捡物品                                         │
│   • 化妆/整理仪容                                  │
│                                                     │
│   疲劳场景 (Drowsiness)                            │
│   ─────────────────────                            │
│   • 眼睛闭合                                       │
│   • 打哈欠                                         │
│   • 头部下垂                                       │
│   • 不同疲劳等级（0-3 级）                         │
│                                                     │
│   视线分配 (Gaze Allocation)                       │
│   ────────────────────────                         │
│   • 前方道路                                       │
│   • 左后视镜                                       │
│   • 右后视镜                                       │
│   • 仪表盘                                         │
│   • 中控屏                                         │
│   • 乘客侧                                         │
│                                                     │
│   手部交互 (Hands-Wheel Interaction)               │
│   ─────────────────────────────────               │
│   • 双手握方向盘                                   │
│   • 单手握方向盘                                   │
│   • 双手离开方向盘                                 │
│   • 手势操作                                       │
│                                                     │
└─────────────────────────────────────────────────────┘

数据格式

数据流	格式	分辨率	帧率
RGB 面部	MP4	1920×1080	30 fps
RGB 身体	MP4	1920×1080	30 fps
RGB 手部	MP4	640×480	30 fps
Depth	RAW	640×480	30 fps
IR	RAW	640×480	30 fps
标注	JSON	-	-

代码示例：数据加载

import json
import numpy as np
import cv2
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, List, Optional

@dataclass
class DMDFrame:
    """One DMD frame: its identifiers, annotation values and per-stream file paths."""
    frame_id: int
    timestamp: float
    
    # Annotations (the Optional ones are None when no value is available)
    distraction_label: Optional[str]
    drowsiness_level: int  # drowsiness level, 0-3
    gaze_zone: Optional[str]
    hands_state: Optional[str]
    
    # File paths of the individual data streams
    rgb_face_path: str
    rgb_body_path: str
    rgb_hands_path: str
    depth_path: Optional[str]
    ir_path: Optional[str]

class DMDLoader:
    """Loader for a DMD (Driver Monitoring Dataset) directory tree.

    Directory layout, as implied by the paths this class builds::

        <data_root>/annotations.json
        <data_root>/<scenario>/subject_XXX/{face,body,hands}/NNNNNN.jpg
        <data_root>/<scenario>/subject_XXX/{depth,ir}/NNNNNN.raw

    Attributes:
        data_root: Dataset root directory as a ``Path``.
        fps: Frame rate used to derive a timestamp from a frame id.
        raw_shape: ``(height, width)`` used to reshape raw depth / IR frames.
        annotations: Parsed ``annotations.json`` (``{}`` when the file is absent).
    """

    def __init__(self, data_root: str, fps: float = 30.0, raw_shape: tuple = (480, 640)):
        """Remember the dataset location and load the annotation file.

        Args:
            data_root: Root directory of the DMD dataset.
            fps: Frame rate of the recordings; ``timestamp = frame_id / fps``.
            raw_shape: ``(height, width)`` of the raw depth / IR frames.

        Raises:
            ValueError: If ``fps`` is not strictly positive.
        """
        if fps <= 0:
            raise ValueError(f'fps must be positive, got {fps!r}')

        self.data_root = Path(data_root)
        self.fps = float(fps)
        self.raw_shape = tuple(raw_shape)
        self.annotations = self._load_annotations()

    def _load_annotations(self) -> Dict:
        """Return the parsed ``annotations.json``, or ``{}`` if it does not exist."""
        annotation_file = self.data_root / 'annotations.json'

        try:
            # Explicit UTF-8: the platform default encoding is not guaranteed
            # to decode non-ASCII annotation text.
            with open(annotation_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            return {}

    def get_frame(self,
                  scenario: str,
                  subject_id: int,
                  frame_id: int) -> "DMDFrame":
        """Build the ``DMDFrame`` record for one frame.

        No image data is read here; only paths are assembled and the
        annotation dictionary is consulted.

        Args:
            scenario: 'distraction' | 'drowsiness' | 'gaze' | 'hands'
            subject_id: Subject identifier (formatted as ``subject_XXX``).
            frame_id: Frame identifier (formatted as a 6-digit file name).

        Returns:
            DMDFrame: Missing annotation fields are ``None``
            (``drowsiness_level`` falls back to 0). ``depth_path`` /
            ``ir_path`` are ``None`` when the subject has no ``depth`` /
            ``ir`` directory; the existence of the individual frame file
            is not checked.
        """
        subject_key = f'subject_{subject_id:03d}'
        subject_dir = self.data_root / scenario / subject_key
        frame_stem = f'{frame_id:06d}'

        subject_annotation = self.annotations.get(scenario, {}).get(subject_key, {})
        frame_annotation = subject_annotation.get('frames', {}).get(str(frame_id), {})

        depth_dir = subject_dir / 'depth'
        ir_dir = subject_dir / 'ir'

        return DMDFrame(
            frame_id=frame_id,
            timestamp=frame_id / self.fps,

            distraction_label=frame_annotation.get('distraction_label'),
            drowsiness_level=frame_annotation.get('drowsiness_level', 0),
            gaze_zone=frame_annotation.get('gaze_zone'),
            hands_state=frame_annotation.get('hands_state'),

            rgb_face_path=str(subject_dir / 'face' / f'{frame_stem}.jpg'),
            rgb_body_path=str(subject_dir / 'body' / f'{frame_stem}.jpg'),
            rgb_hands_path=str(subject_dir / 'hands' / f'{frame_stem}.jpg'),
            depth_path=str(depth_dir / f'{frame_stem}.raw') if depth_dir.exists() else None,
            ir_path=str(ir_dir / f'{frame_stem}.raw') if ir_dir.exists() else None,
        )

    @staticmethod
    def _read_raw(path: str, dtype, shape) -> np.ndarray:
        """Read a headerless raw frame and reshape it to ``shape``.

        Raises:
            ValueError: If the file does not hold exactly ``prod(shape)`` values.
        """
        data = np.fromfile(path, dtype=dtype)
        expected = int(np.prod(shape))
        if data.size != expected:
            raise ValueError(
                f'{path}: expected {expected} {np.dtype(dtype).name} values '
                f'for shape {tuple(shape)}, found {data.size}'
            )
        return data.reshape(shape)

    def load_images(self, frame: "DMDFrame") -> Dict[str, np.ndarray]:
        """Load the image data referenced by ``frame``.

        Returns:
            {
                'rgb_face': np.ndarray,
                'rgb_body': np.ndarray,
                'rgb_hands': np.ndarray,
                'depth': np.ndarray (only if frame.depth_path is set),
                'ir': np.ndarray (only if frame.ir_path is set)
            }

            The three camera images are returned exactly as ``cv2.imread``
            produces them: BGR channel order, and ``None`` when a file is
            missing or unreadable. Depth is read as uint16 and IR as uint8,
            both reshaped to ``self.raw_shape``.

        Raises:
            ValueError: If a raw depth / IR file has an unexpected size.
        """
        images = {
            'rgb_face': cv2.imread(frame.rgb_face_path),
            'rgb_body': cv2.imread(frame.rgb_body_path),
            'rgb_hands': cv2.imread(frame.rgb_hands_path),
        }

        if frame.depth_path:
            images['depth'] = self._read_raw(frame.depth_path, np.uint16, self.raw_shape)

        if frame.ir_path:
            images['ir'] = self._read_raw(frame.ir_path, np.uint8, self.raw_shape)

        return images

    def get_sequence(self,
                     scenario: str,
                     subject_id: int,
                     start_frame: int,
                     length: int) -> List["DMDFrame"]:
        """Return ``length`` consecutive frame records starting at ``start_frame``.

        Args:
            scenario: Scenario type.
            subject_id: Subject identifier.
            start_frame: Id of the first frame.
            length: Number of frames; a non-positive value gives an empty list.

        Returns:
            List[DMDFrame]: Records for ``start_frame`` .. ``start_frame + length - 1``.
        """
        return [
            self.get_frame(scenario, subject_id, start_frame + offset)
            for offset in range(length)
        ]


# PyTorch DataLoader 封装
import torch
from torch.utils.data import Dataset, DataLoader

class DMDDataset(Dataset):
    """PyTorch ``Dataset`` over the face-camera frames of one DMD scenario.

    Each item is a dict with the face image, an integer class label and the
    (subject_id, frame_id) pair it came from. Samples are discovered by
    scanning ``<data_root>/<scenario>/subject_*/face/*.jpg``.

    Note:
        The 'hands' scenario (and any unknown scenario) has no label map, so
        every sample gets label 0. Frames whose label is missing or not in
        the label map also fall back to 0.
    """

    def __init__(self,
                 data_root: str,
                 scenario: str = 'distraction',
                 transform=None):
        """
        Args:
            data_root: Dataset root directory.
            scenario: 'distraction' | 'drowsiness' | 'gaze' | 'hands'
            transform: Optional callable applied to the RGB face image.
        """
        self.loader = DMDLoader(data_root)
        self.scenario = scenario
        self.transform = transform

        # (subject_id, frame_id) pairs, one per face image found on disk
        self.samples = self._build_sample_list()

        # label name -> class index for the selected scenario
        self.label_map = self._build_label_map()

    def _build_sample_list(self) -> List[tuple]:
        """Collect (subject_id, frame_id) pairs for every face image on disk.

        Subject directories and image files whose numeric id cannot be
        parsed are skipped rather than aborting the whole scan.
        """
        samples = []
        scenario_dir = self.loader.data_root / self.scenario

        if not scenario_dir.exists():
            return samples

        for subject_dir in sorted(scenario_dir.glob('subject_*')):
            face_dir = subject_dir / 'face'
            if not face_dir.exists():
                continue

            # Parse once per subject instead of once per frame.
            try:
                subject_id = int(subject_dir.name.split('_')[1])
            except ValueError:
                continue

            for frame_file in sorted(face_dir.glob('*.jpg')):
                try:
                    frame_id = int(frame_file.stem)
                except ValueError:
                    continue  # not a numbered frame file
                samples.append((subject_id, frame_id))

        return samples

    def _build_label_map(self) -> Dict[str, int]:
        """Return the label-name -> class-index mapping for ``self.scenario``."""
        if self.scenario == 'distraction':
            return {
                'safe_driving': 0,
                'talking_phone': 1,
                'texting_phone': 2,
                'operating_radio': 3,
                'drinking': 4,
                'eating': 5,
                'reaching_behind': 6,
                'hair_makeup': 7,
                'talking_passenger': 8,
            }
        elif self.scenario == 'drowsiness':
            return {
                'alert': 0,
                'low_vigilance': 1,
                'drowsy': 2,
                'sleep': 3,
            }
        elif self.scenario == 'gaze':
            return {
                'road': 0,
                'left_mirror': 1,
                'right_mirror': 2,
                'dashboard': 3,
                'center_console': 4,
                'passenger': 5,
            }
        else:
            return {}

    def _label_to_index(self, label) -> int:
        """Map an annotation value to a class index.

        String labels are looked up in ``self.label_map``. Integer labels
        (the drowsiness level is stored as an int) are already class indices
        and are accepted when they match one of the map's values. Anything
        else, including ``None``, maps to 0.
        """
        if isinstance(label, str):
            return self.label_map.get(label, 0)
        if (isinstance(label, int) and not isinstance(label, bool)
                and label in self.label_map.values()):
            return label
        return 0

    def __len__(self):
        """Number of face frames discovered on disk."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the sample at ``idx``.

        Returns:
            dict with keys 'image' (RGB face image, transformed if a
            transform was given), 'label' (``torch.long`` scalar tensor),
            'subject_id' and 'frame_id'.

        Raises:
            OSError: If the face image could not be read.
        """
        subject_id, frame_id = self.samples[idx]

        frame = self.loader.get_frame(self.scenario, subject_id, frame_id)
        images = self.loader.load_images(frame)

        rgb_face = images['rgb_face']
        if rgb_face is None:
            raise OSError(f'failed to read face image: {frame.rgb_face_path}')

        # cv2.imread yields BGR; flip to RGB so the array matches its name and
        # the RGB statistics typically used by the transform. The copy makes
        # the result contiguous (no negative strides).
        if rgb_face.ndim == 3 and rgb_face.shape[2] == 3:
            rgb_face = np.ascontiguousarray(rgb_face[:, :, ::-1])

        if self.transform:
            rgb_face = self.transform(rgb_face)

        if self.scenario == 'distraction':
            label = frame.distraction_label
        elif self.scenario == 'drowsiness':
            label = frame.drowsiness_level
        elif self.scenario == 'gaze':
            label = frame.gaze_zone
        else:
            label = 0

        label_idx = self._label_to_index(label)

        return {
            'image': rgb_face,
            'label': torch.tensor(label_idx, dtype=torch.long),
            'subject_id': subject_id,
            'frame_id': frame_id,
        }


# 使用示例
if __name__ == '__main__':
    from torchvision import transforms

    # Preprocessing pipeline: ndarray -> PIL image -> 224x224 -> tensor -> normalised
    preprocessing_steps = [
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
    transform = transforms.Compose(preprocessing_steps)

    # Dataset over the distraction scenario
    dataset = DMDDataset(data_root='/path/to/dmd', scenario='distraction', transform=transform)

    # Batched, shuffled loading with four worker processes
    loader_options = dict(batch_size=32, shuffle=True, num_workers=4)
    dataloader = DataLoader(dataset, **loader_options)

    # Demo "training loop": inspect the first batch only, then stop
    for batch in dataloader:
        images, labels = batch['image'], batch['label']

        print(f"Batch size: {images.shape[0]}, Labels: {labels}")
        break

与其他数据集对比

主流 DMS 数据集

数据集	时长	受试者	模态	场景
DMD	41 h	多人	RGB+Depth+IR	分心+疲劳+视线+手部
NTHU-DDD	~10 h	36 人	RGB	疲劳检测
UTA-RLDD	~6 h	60 人	RGB	疲劳检测
DAD	~50 h	多人	RGB	分心检测
100-Driver	~20 h	100 人	RGB	分心+疲劳

DMD 优势

最大规模：41 小时，最多场景
多模态：RGB + Depth + IR
多摄像头：身体、面部、手部同步
真车 + 模拟器：覆盖多种场景
开源免费：学术和科研可免费使用

Euro NCAP 关联

训练数据要求

Euro NCAP 要求	DMD 支持
分心场景多样性	✅ 9 种分心类型
疲劳等级	✅ 0-3 级
多肤色公平性	⚠️ 需补充
太阳镜/口罩	⚠️ 需补充
夜间场景	⚠️ 需补充

数据增强建议

class DMDAugmentation:
    """Synthetic augmentations for conditions the dataset needs supplemented.

    Only the night simulation is implemented; the sunglasses, face-mask and
    backlight methods are placeholders that return the image unchanged.
    'different_skin_tones' is listed as a needed scenario but has no handler,
    so requesting it returns the input unchanged.
    """

    def __init__(self):
        # Scenarios that need to be supplemented
        self.augmentation_scenarios = [
            'sunglasses',
            'face_mask',
            'night',
            'backlight',
            'different_skin_tones',
        ]

    def augment_with_synthetic(self,
                               real_data: np.ndarray,
                               augmentation_type: str) -> np.ndarray:
        """Apply one synthetic augmentation to an image.

        Args:
            real_data: Real image data.
            augmentation_type: 'sunglasses' | 'face_mask' | 'night' | 'backlight'.

        Returns:
            The augmented image. An augmentation type without a handler
            returns ``real_data`` itself, unchanged.
        """
        handlers = {
            'sunglasses': self._add_sunglasses,
            'face_mask': self._add_face_mask,
            'night': self._simulate_night,
            'backlight': self._simulate_backlight,
        }
        handler = handlers.get(augmentation_type)
        if handler is None:
            return real_data
        return handler(real_data)

    def _add_sunglasses(self, image: np.ndarray) -> np.ndarray:
        """Placeholder for a sunglasses overlay; returns ``image`` unchanged.

        Intended approach: locate the eye region from face landmarks and
        paste a sunglasses texture over it.
        """
        return image

    def _add_face_mask(self, image: np.ndarray) -> np.ndarray:
        """Placeholder for a face-mask overlay; returns ``image`` unchanged.

        Intended approach: locate the mouth region from face landmarks and
        paste a mask texture over it.
        """
        return image

    def _simulate_night(self,
                        image: np.ndarray,
                        alpha: float = 0.3,
                        beta: float = -50.0,
                        noise_std: float = 10.0) -> np.ndarray:
        """Simulate a night scene: darken the image and add Gaussian noise.

        Computes ``clip(alpha * image + beta + noise, 0, 255)``. Negative
        intermediate values are clipped to 0 rather than folded back by an
        absolute value, so darker input pixels never come out brighter than
        lighter ones. With the defaults, inputs below about 167 become
        black before noise is added.

        Args:
            image: Input image; values are treated as 0-255 intensities.
            alpha: Gain applied to every pixel.
            beta: Offset added after the gain.
            noise_std: Standard deviation of the zero-mean Gaussian noise;
                0 disables the noise.

        Returns:
            A new uint8 array with the same shape as ``image``.
        """
        darkened = image.astype(np.float32) * alpha + beta
        noise = np.random.normal(0.0, noise_std, image.shape)
        return np.clip(np.rint(darkened + noise), 0, 255).astype(np.uint8)

    def _simulate_backlight(self, image: np.ndarray) -> np.ndarray:
        """Placeholder for a backlight (strong light) effect; returns ``image`` unchanged."""
        return image

下载与使用

下载方式

# Official download page
# https://dmd.vicomtech.org/

# Example download script
wget https://dmd.vicomtech.org/download/sample.zip

# The full dataset has to be requested:
# fill in the form to receive the download link

GitHub 仓库

git clone https://github.com/Vicomtech/DMD-Driver-Monitoring-Dataset

参考资源

DMD 官方网站：https://dmd.vicomtech.org/
DMD GitHub：https://github.com/Vicomtech/DMD-Driver-Monitoring-Dataset
DMD 论文（ECCV 2020）：https://arxiv.org/pdf/2008.12085
NTHU-DDD 数据集：http://cv.cs.nthu.edu.tw/php/callforpaper/datasets/DDD/
Roboflow DMD 模型：https://universe.roboflow.com/driver-monitoring/dmd-tfiw0