合成数据在 DMS/OMS 中的应用：Anyverse 与 Euro NCAP 合规路径

发表于 2026-04-20 · 分类于：技术分析、DMS

前言

Euro NCAP 2026 DSM/OMS 测试场景超过 30 个，涵盖各种光照、姿态、遮挡条件。传统实车数据采集面临以下挑战：

数据采集周期长（数月到数年）
极端场景数据稀缺（酒驾、疾病突发）
标注成本高（专业标注团队）
隐私合规问题（GDPR）

针对上述瓶颈，合成数据（Synthetic Data）成为突破性的解决方案。

一、合成数据优势

1.1 对比分析

维度	实车数据	合成数据
采集周期	3-12 个月	1-4 周
极端场景	难获取	可模拟
标注成本	高（$10-50/张）	低（自动生成）
隐私问题	存在	无
场景覆盖	有限	无限
数据多样性	受限	可控

1.2 合成数据类型

类型	描述	应用场景
3D 渲染	Unreal/Unity 渲染	DMS 图像生成
域随机化	随机化背景/光照	提升泛化能力
GAN 生成	对抗生成网络	数据增强
混合数据	真实+合成	性能最优

二、Anyverse 平台解析

2.1 平台概述

Anyverse 是专注于汽车 AI 训练数据的合成平台：

特性	描述
渲染引擎	NVIDIA Omniverse
物理准确	光线追踪、材质准确
传感器模拟	RGB、IR、深度、语义
场景库	1000+ 预设场景

2.2 DMS 数据生成

"""
Anyverse DMS 数据生成示例

模拟 Euro NCAP 测试场景
"""

from dataclasses import dataclass
from typing import List, Dict
import numpy as np

@dataclass
class DMSDataConfig:
    """Configuration for synthetic DMS (driver monitoring) data generation.

    Collection-valued fields default to None, i.e. they are left
    unspecified unless the caller provides a value.
    """
    
    # Driver attributes
    age_range: tuple = (18, 70)  # (min, max) driver age
    gender_ratio: float = 0.5  # male/female ratio; AnyverseDataGenerator compares it to a uniform draw to pick 'male'
    ethnicity_distribution: Dict[str, float] = None  # ethnicity distribution (presumably label -> weight; confirm)
    
    # Scene attributes
    lighting_conditions: List[str] = None  # lighting conditions
    time_of_day: List[str] = None  # time of day
    weather: List[str] = None  # weather
    
    # Behaviour attributes
    distraction_types: List[str] = None  # distraction types
    fatigue_levels: List[str] = None  # fatigue levels
    head_poses: List[tuple] = None  # head poses
    
    # Sensor configuration
    camera_type: str = "IR"  # RGB/IR
    resolution: tuple = (1280, 800)  # presumably (width, height) in pixels -- TODO confirm
    fps: int = 30  # frames per second
    
    # Output configuration
    num_samples: int = 10000  # total number of samples to generate
    output_format: str = "COCO"  # COCO/YOLO/Custom

class AnyverseDataGenerator:
    """Simulated Anyverse generator for Euro NCAP-style DMS training data.

    Nothing is rendered here: each sample is a metadata record with randomly
    drawn driver, scene and behaviour attributes plus a mock COCO-style
    annotation. All draws use the global ``np.random`` state, so calling
    ``np.random.seed`` beforehand makes a run reproducible. Every generated
    value is a plain Python type, so a dataset can be dumped with ``json``.
    """

    # Fallbacks used when the matching config field is left unset (None/empty).
    _DEFAULT_LIGHTING = ('daylight', 'night', 'tunnel', 'sunset')
    _DEFAULT_WEATHER = ('clear', 'rain', 'fog')
    _DEFAULT_ETHNICITY = 'asian'
    _DEFAULT_DURATION_RANGE = (1, 10)
    _NUM_FACE_KEYPOINTS = 68

    def __init__(self, config: "DMSDataConfig"):
        """Store the configuration and build the scenario table.

        Args:
            config: Generation settings (a DMSDataConfig or any object
                exposing the same attributes).
        """
        self.config = config

        # Scenario id -> description / behaviours / duration range.
        self.encap_scenarios = self._define_encap_scenarios()

    def _define_encap_scenarios(self) -> Dict:
        """Return the table of Euro NCAP test scenarios known to the generator.

        Returns:
            Mapping of scenario id to a dict with ``description``,
            ``behaviors`` (candidate behaviour labels) and ``duration_range``
            (min, max), presumably in seconds.
        """
        return {
            # Fatigue scenarios
            'F-01': {
                'description': '轻度疲劳 KSS 6-7',
                'behaviors': ['yawning', 'eye_rubbing', 'slow_blinking'],
                'duration_range': (60, 120)
            },
            'F-02': {
                'description': '中度疲劳 KSS 7-8',
                'behaviors': ['frequent_yawning', 'head_nodding', 'eye_closing'],
                'duration_range': (30, 60)
            },
            'F-03': {
                'description': '重度疲劳 KSS >8',
                'behaviors': ['microsleep', 'prolonged_eye_closure'],
                'duration_range': (10, 30)
            },
            'F-04': {
                'description': '微睡眠 1-2秒',
                'behaviors': ['eye_closure_1_2s'],
                'duration_range': (1, 2)
            },

            # Distraction scenarios
            'D-01': {
                'description': '手持通话',
                'behaviors': ['phone_to_ear', 'talking'],
                'duration_range': (3, 10)
            },
            'D-02': {
                'description': '手机打字',
                'behaviors': ['looking_down', 'hands_on_phone', 'typing'],
                'duration_range': (3, 10)
            },
            'D-03': {
                'description': '手机浏览',
                'behaviors': ['looking_down', 'scrolling'],
                'duration_range': (3, 10)
            },
            'D-04': {
                'description': '调整中控',
                'behaviors': ['looking_sideways', 'hands_on_dashboard'],
                'duration_range': (3, 10)
            },
            'D-05': {
                'description': '视线偏离 >3秒',
                'behaviors': ['looking_away'],
                'duration_range': (3, 5)
            },

            # Cognitive distraction scenarios
            'CD-01': {
                'description': '心算任务',
                'behaviors': ['blank_stare', 'reduced_blinking'],
                'duration_range': (5, 15)
            },
            'CD-02': {
                'description': '白日梦',
                'behaviors': ['fixed_gaze', 'reduced_eye_movement'],
                'duration_range': (10, 20)
            }
        }

    def generate_dataset(
        self,
        scenarios: List[str] = None
    ) -> Dict:
        """Generate a dataset covering the requested scenarios.

        ``config.num_samples`` is split across the scenarios as evenly as
        possible; when it does not divide exactly, the first scenarios get
        one extra sample so the total still equals ``num_samples``.

        Args:
            scenarios: Scenario ids to generate (None = every known scenario).
                An empty list yields an empty dataset.

        Returns:
            Dict with ``samples``, ``annotations`` and ``metadata``
            (``total_samples``, per-scenario counts, ``statistics``).

        Raises:
            KeyError: If any requested scenario id is not defined.
        """
        if scenarios is None:
            scenarios = list(self.encap_scenarios.keys())

        # Fail before generating anything rather than midway through the loop.
        unknown = [sid for sid in scenarios if sid not in self.encap_scenarios]
        if unknown:
            raise KeyError(f"unknown scenario id(s): {unknown}")

        dataset = {
            'samples': [],
            'annotations': [],
            'metadata': {
                'total_samples': 0,
                'scenarios': {},
                'statistics': {}
            }
        }

        # divmod keeps the remainder that plain floor division would drop.
        if scenarios:
            base_count, remainder = divmod(self.config.num_samples, len(scenarios))
        else:
            base_count, remainder = 0, 0

        for index, scenario_id in enumerate(scenarios):
            scenario = self.encap_scenarios[scenario_id]
            quota = base_count + (1 if index < remainder else 0)

            scenario_samples = self._generate_scenario_samples(
                scenario_id,
                scenario,
                quota
            )

            dataset['samples'].extend(scenario_samples['samples'])
            dataset['annotations'].extend(scenario_samples['annotations'])

            dataset['metadata']['scenarios'][scenario_id] = {
                'count': len(scenario_samples['samples']),
                'description': scenario['description']
            }

        dataset['metadata']['total_samples'] = len(dataset['samples'])
        dataset['metadata']['statistics'] = self._calculate_statistics(dataset)

        return dataset

    def _generate_scenario_samples(
        self,
        scenario_id: str,
        scenario: Dict,
        num_samples: int
    ) -> Dict:
        """Generate the samples and annotations for a single scenario.

        Args:
            scenario_id: Scenario id, used in sample ids and file paths.
            scenario: Scenario definition; ``behaviors`` is required and
                ``duration_range`` bounds the behaviour duration when present.
            num_samples: Number of samples to create.

        Returns:
            Dict with parallel ``samples`` and ``annotations`` lists.
        """
        samples = []
        annotations = []
        duration_range = scenario.get('duration_range', self._DEFAULT_DURATION_RANGE)

        for i in range(num_samples):
            driver = self._random_driver()
            scene = self._random_scene()
            # Keep the duration inside the scenario's own window.
            behavior = self._random_behavior(scenario['behaviors'], duration_range)

            # Mock sample record; the paths are names only, nothing is written.
            sample = {
                'id': f"{scenario_id}_{i:06d}",
                'scenario_id': scenario_id,
                'driver': driver,
                'scene': scene,
                'behavior': behavior,
                'image_path': f"data/{scenario_id}/{i:06d}.png",
                'annotation_path': f"data/{scenario_id}/{i:06d}.json"
            }

            annotation = self._generate_annotation(sample, scenario)

            samples.append(sample)
            annotations.append(annotation)

        return {'samples': samples, 'annotations': annotations}

    def _random_ethnicity(self) -> str:
        """Draw an ethnicity label from ``config.ethnicity_distribution``.

        Falls back to a fixed label when no distribution is configured.

        Raises:
            ValueError: If the configured weights do not sum to a positive value.
        """
        distribution = self.config.ethnicity_distribution
        if not distribution:
            return self._DEFAULT_ETHNICITY

        labels = list(distribution)
        weights = np.asarray([distribution[label] for label in labels], dtype=float)
        total = weights.sum()
        if total <= 0:
            raise ValueError("ethnicity_distribution weights must sum to a positive value")
        # Normalise so the weights need not already sum to 1.
        return str(np.random.choice(labels, p=weights / total))

    def _random_driver(self) -> Dict:
        """Draw random driver attributes.

        Returns:
            Dict with ``age`` (within ``config.age_range``, both ends
            included), ``gender``, ``ethnicity`` and boolean appearance flags.
        """
        min_age, max_age = self.config.age_range
        return {
            # randint excludes its upper bound, hence the +1.
            'age': int(np.random.randint(min_age, max_age + 1)),
            'gender': 'male' if np.random.random() < self.config.gender_ratio else 'female',
            'ethnicity': self._random_ethnicity(),
            'glasses': bool(np.random.random() < 0.3),
            'sunglasses': bool(np.random.random() < 0.1),
            'mask': bool(np.random.random() < 0.1),
            'facial_hair': bool(np.random.random() < 0.3)
        }

    def _random_scene(self) -> Dict:
        """Draw random scene attributes.

        Lighting and weather come from ``config.lighting_conditions`` and
        ``config.weather`` when set, otherwise from built-in defaults.
        """
        lighting_options = list(self.config.lighting_conditions or self._DEFAULT_LIGHTING)
        weather_options = list(self.config.weather or self._DEFAULT_WEATHER)
        return {
            'lighting': str(np.random.choice(lighting_options)),
            'weather': str(np.random.choice(weather_options)),
            'vehicle_type': str(np.random.choice(['sedan', 'suv', 'truck'])),
            'camera_position': 'steering_column'
        }

    def _random_behavior(self, behaviors: List[str], duration_range: tuple = (1, 10)) -> Dict:
        """Pick one behaviour and draw its intensity and duration.

        Args:
            behaviors: Candidate behaviour labels.
            duration_range: (min, max) bounds for the drawn duration.

        Returns:
            Dict with ``primary``, ``intensity`` (0.5 to 1.0) and ``duration``.
        """
        min_duration, max_duration = duration_range
        return {
            'primary': str(np.random.choice(behaviors)),
            'intensity': float(np.random.uniform(0.5, 1.0)),
            'duration': float(np.random.uniform(min_duration, max_duration))
        }

    def _generate_annotation(self, sample: Dict, scenario: Dict) -> Dict:
        """Build a mock COCO-style annotation for one sample.

        Args:
            sample: Sample record produced by ``_generate_scenario_samples``.
            scenario: Scenario definition (currently unused).

        Returns:
            Annotation dict; ``bbox`` is a fixed placeholder and
            ``keypoints`` are random.
        """
        return {
            'image_id': sample['id'],
            'category_id': self._get_category_id(sample['scenario_id']),
            'bbox': [100, 100, 200, 200],  # placeholder box
            'keypoints': self._generate_keypoints(),
            'attributes': {
                # Key name kept for compatibility; it also carries fatigue behaviours.
                'distraction_type': sample['behavior']['primary'],
                'severity': sample['behavior']['intensity']
            }
        }

    def _get_category_id(self, scenario_id: str) -> int:
        """Map a scenario id prefix to a category id.

        Returns:
            1 for fatigue ('F'), 2 for distraction ('D'), 3 for cognitive
            distraction ('CD'), 0 for any other prefix.
        """
        category_map = {
            'F': 1,
            'D': 2,
            'CD': 3
        }
        prefix = scenario_id.split('-')[0]
        return category_map.get(prefix, 0)

    def _generate_keypoints(self) -> List:
        """Generate mock 68-point facial keypoints.

        Returns:
            Flat list ``[x0, y0, v0, x1, y1, v1, ...]`` with coordinates drawn
            uniformly inside ``config.resolution`` and visibility fixed at 2.
        """
        width, height = self.config.resolution
        keypoints = []
        for _ in range(self._NUM_FACE_KEYPOINTS):
            x = float(np.random.uniform(0, width))
            y = float(np.random.uniform(0, height))
            keypoints.extend([x, y, 2])  # 2 = visible
        return keypoints

    def _calculate_statistics(self, dataset: Dict) -> Dict:
        """Summarise the dataset metadata.

        Returns:
            Dict with ``total_samples``, ``num_scenarios`` and the floor
            average ``samples_per_scenario`` (0 when there are no scenarios).
        """
        total = dataset['metadata']['total_samples']
        num_scenarios = len(dataset['metadata']['scenarios'])
        return {
            'total_samples': total,
            'num_scenarios': num_scenarios,
            'samples_per_scenario': total // num_scenarios if num_scenarios else 0
        }


# Usage example
if __name__ == "__main__":
    # Only a subset of the defined scenarios is generated here.
    selected_scenarios = ['F-01', 'F-02', 'D-01', 'D-02', 'CD-01']

    generator = AnyverseDataGenerator(
        DMSDataConfig(
            num_samples=10000,
            camera_type="IR",
            resolution=(1280, 800),
        )
    )
    dataset = generator.generate_dataset(scenarios=selected_scenarios)

    # Print the total, then one line per scenario.
    print(f"生成数据集: {dataset['metadata']['total_samples']} 样本")
    per_scenario = dataset['metadata']['scenarios']
    for scenario_id in per_scenario:
        info = per_scenario[scenario_id]
        print(f"  {scenario_id}: {info['count']} 样本 - {info['description']}")

2.3 传感器模拟

"""
传感器模拟配置

Anyverse 支持多种传感器模拟
"""

from dataclasses import dataclass
from typing import List, Dict

@dataclass
class SensorConfig:
    """Settings for the simulated sensors and their imaging effects.

    Each sensor group has an ``*_enabled`` switch plus its own parameters.
    """
    
    # RGB camera
    rgb_enabled: bool = True
    rgb_resolution: tuple = (1920, 1080)  # presumably (width, height) in pixels -- TODO confirm
    rgb_fov: float = 90  # field of view, degrees
    
    # IR camera
    ir_enabled: bool = True
    ir_wavelength: int = 940  # nm
    ir_resolution: tuple = (1280, 800)  # presumably (width, height) in pixels -- TODO confirm
    
    # Depth camera
    depth_enabled: bool = True
    depth_range: tuple = (0.3, 5.0)  # metres
    
    # Semantic segmentation
    semantic_enabled: bool = True
    
    # Lighting simulation
    hdr_enabled: bool = True
    dynamic_range: float = 16  # stops
    
    # Motion blur
    motion_blur_enabled: bool = True
    max_blur_frames: int = 5

# Sensor setup for Euro NCAP DSM: IR imaging only (RGB off), with depth and
# semantic outputs switched on.
DSM_SENSOR_CONFIG = SensorConfig(
    # Cameras
    ir_enabled=True,
    ir_resolution=(1280, 800),
    ir_wavelength=940,
    rgb_enabled=False,
    # Additional outputs
    semantic_enabled=True,
    depth_enabled=True,
)

三、域随机化

3.1 核心思想

域随机化（Domain Randomization）通过随机化渲染参数，提升模型泛化能力：

随机化维度	参数
光照	强度、颜色、方向
背景	纹理、场景
纹理	颜色、图案
姿态	位置、旋转
遮挡	物体、比例

3.2 实现代码

"""
域随机化配置

用于提升模型泛化能力
"""

import numpy as np
from typing import Dict, List
from dataclasses import dataclass

@dataclass
class DomainRandomizationConfig:
    """Parameter ranges and probabilities for domain randomization.

    Range fields are (low, high) pairs; DomainRandomizer draws uniformly
    between them. List fields default to None (unspecified).
    """
    
    # Lighting randomization
    light_intensity_range: tuple = (0.3, 2.0)
    light_color_range: tuple = (2500, 10000)  # colour temperature, K
    light_direction_range: tuple = (-180, 180)  # degrees
    
    # Background randomization
    background_textures: List[str] = None
    background_scenes: List[str] = None
    
    # Texture randomization
    texture_noise_level: float = 0.1  # half-width of the extra uniform noise added to V
    texture_color_jitter: tuple = (0.1, 0.1, 0.1)  # HSV; per-channel half-width of the uniform jitter
    
    # Pose randomization
    position_noise: tuple = (0.05, 0.05, 0.05)  # metres, per axis
    rotation_noise: tuple = (5, 5, 5)  # degrees, per axis
    
    # Occlusion randomization
    occlusion_probability: float = 0.2
    occlusion_objects: List[str] = None
    
    # Noise randomization
    gaussian_noise_std: float = 0.02  # std of the Gaussian noise added by DomainRandomizer.add_noise
    motion_blur_max: float = 0.1

class DomainRandomizer:
    """Draws randomized rendering parameters from a DomainRandomizationConfig.

    Every method samples from the global ``np.random`` state, so seeding it
    with ``np.random.seed`` makes the draws reproducible.
    """

    def __init__(self, config: "DomainRandomizationConfig"):
        """Keep a reference to the randomization settings.

        Args:
            config: Ranges and probabilities to sample from.
        """
        self.config = config

    def randomize_lighting(self) -> Dict:
        """Draw lighting parameters uniformly from the configured ranges.

        Returns:
            Dict with ``intensity``, ``color_temp`` and ``direction``.
        """
        cfg = self.config
        return {
            'intensity': np.random.uniform(*cfg.light_intensity_range),
            'color_temp': np.random.uniform(*cfg.light_color_range),
            'direction': np.random.uniform(*cfg.light_direction_range)
        }

    def randomize_texture(self, base_color: tuple) -> tuple:
        """Jitter an HSV colour whose channels are normalised to [0, 1].

        Args:
            base_color: (h, s, v) base colour.

        Returns:
            Jittered (h, s, v) as floats. Saturation and value are clamped to
            [0, 1]; hue wraps around instead.
        """
        h, s, v = base_color
        jitter_h, jitter_s, jitter_v = self.config.texture_color_jitter
        noise_level = self.config.texture_noise_level

        # Per-channel colour jitter.
        h += np.random.uniform(-jitter_h, jitter_h)
        s += np.random.uniform(-jitter_s, jitter_s)
        v += np.random.uniform(-jitter_v, jitter_v)

        # Extra brightness noise on top of the jitter.
        v += np.random.uniform(-noise_level, noise_level)

        # Hue is circular: clamping would pile jittered values up at 0 and 1
        # (both the same hue), so wrap it back into range instead.
        return (float(h % 1.0),
                float(np.clip(s, 0, 1)),
                float(np.clip(v, 0, 1)))

    def randomize_pose(self, base_position: tuple, base_rotation: tuple) -> Dict:
        """Perturb a 3-axis position and rotation with uniform noise.

        Args:
            base_position: Three position components.
            base_rotation: Three rotation components.

        Returns:
            Dict with ``position`` and ``rotation``, each a list of three values.
        """
        position = [
            base + np.random.uniform(-spread, spread)
            for base, spread in zip(base_position[:3], self.config.position_noise[:3])
        ]

        rotation = [
            base + np.random.uniform(-spread, spread)
            for base, spread in zip(base_rotation[:3], self.config.rotation_noise[:3])
        ]

        return {'position': position, 'rotation': rotation}

    def should_occlude(self) -> bool:
        """Return True with probability ``config.occlusion_probability``."""
        return bool(np.random.random() < self.config.occlusion_probability)

    def add_noise(self, image: np.ndarray) -> np.ndarray:
        """Add zero-mean Gaussian noise and clamp the result to [0, 1].

        Args:
            image: Image array; assumed to hold float intensities in [0, 1]
                (an 8-bit 0-255 image would be crushed by the clamp).

        Returns:
            New array of the same shape; the input is not modified.
        """
        noise = np.random.randn(*image.shape) * self.config.gaussian_noise_std
        return np.clip(image + noise, 0, 1)


# Domain-randomization preset for Euro NCAP training.
# NOTE(review): the constant is spelled "ENCP" rather than "ENCAP"; the name
# is left untouched so existing references keep working.
ENCP_TRAINING_CONFIG = DomainRandomizationConfig(
    # Occlusion
    occlusion_objects=['sunglasses', 'mask', 'hand', 'hair'],
    occlusion_probability=0.3,
    # Lighting
    light_color_range=(3000, 8000),
    light_intensity_range=(0.5, 1.5),
    # Image noise
    gaussian_noise_std=0.015,
)

四、混合数据策略

4.1 最优配比

研究表明，混合数据策略性能最优：

真实数据比例	合成数据比例	模型准确率
100%	0%	88.5%
75%	25%	90.2%
50%	50%	91.8%
25%	75%	89.3%
0%	100%	85.1%

4.2 实现策略

"""
混合数据训练策略
"""

from typing import List, Tuple
import numpy as np

class MixedDataStrategy:
    """Mixes real and synthetic samples at a target real-data ratio.

    Samples are drawn without replacement, so the mixed set is as large as
    the two sources allow while still matching the requested ratio.
    """

    def __init__(
        self,
        real_data_ratio: float = 0.5,
        domain_adaptation: bool = True
    ):
        """Store the mixing settings.

        Args:
            real_data_ratio: Target share of real samples, in [0, 1].
            domain_adaptation: Stored flag; not used by this class itself.
        """
        self.real_data_ratio = real_data_ratio
        self.domain_adaptation = domain_adaptation

    def create_mixed_dataset(
        self,
        real_samples: List,
        synthetic_samples: List
    ) -> Tuple[List, Dict]:
        """Build a shuffled dataset that honours ``real_data_ratio``.

        The larger source is down-sampled so the mix actually reaches the
        target ratio. For example, 10 real and 90 synthetic samples at ratio
        0.5 give 10 + 10 rather than all 10 real plus 45 synthetic. The
        input lists are never modified.

        Args:
            real_samples: Real data samples.
            synthetic_samples: Synthetic data samples.

        Returns:
            mixed_samples: The shuffled mixed dataset.
            info: ``total``, ``real_count``, ``synthetic_count`` and the
                achieved ``real_ratio`` (0.0 for an empty result).

        Raises:
            ValueError: If ``real_data_ratio`` is outside [0, 1].
        """
        ratio = self.real_data_ratio
        if not 0.0 <= ratio <= 1.0:
            raise ValueError(f"real_data_ratio must be within [0, 1], got {ratio}")

        real_needed, synthetic_needed = self._target_counts(
            len(real_samples), len(synthetic_samples), ratio
        )

        real_sampled = self._sample(real_samples, real_needed)
        synthetic_sampled = self._sample(synthetic_samples, synthetic_needed)

        # Concatenation builds a new list, so the shuffle leaves inputs alone.
        mixed_samples = real_sampled + synthetic_sampled
        np.random.shuffle(mixed_samples)

        total = len(mixed_samples)
        info = {
            'total': total,
            'real_count': len(real_sampled),
            'synthetic_count': len(synthetic_sampled),
            # Guard the empty case instead of dividing by zero.
            'real_ratio': len(real_sampled) / total if total else 0.0
        }

        return mixed_samples, info

    @staticmethod
    def _target_counts(num_real: int, num_synthetic: int, ratio: float) -> Tuple[int, int]:
        """Return (real, synthetic) counts for the largest mix matching ``ratio``."""
        if ratio <= 0.0:
            return 0, num_synthetic
        if ratio >= 1.0:
            return num_real, 0

        # Largest total for which each source can still supply its share;
        # the epsilon absorbs float error such as 9.999999 for an exact 10.
        total = int(min(num_real / ratio, num_synthetic / (1.0 - ratio)) + 1e-9)
        real_needed = min(num_real, int(round(total * ratio)))
        synthetic_needed = min(num_synthetic, total - real_needed)
        return real_needed, synthetic_needed

    def _sample(self, samples: List, n: int) -> List:
        """Return ``n`` samples drawn without replacement.

        When ``n`` is at least ``len(samples)``, a copy of the whole list is
        returned so callers never receive the input object itself.
        """
        if n >= len(samples):
            return list(samples)

        indices = np.random.choice(len(samples), n, replace=False)
        return [samples[i] for i in indices]


# Recommended Euro NCAP configuration
def get_encap_mixed_config() -> MixedDataStrategy:
    """Build the mixed-data strategy recommended for Euro NCAP.

    Returns:
        A MixedDataStrategy with a 0.5 real-data ratio and domain
        adaptation enabled.
    """
    recommended = {
        'real_data_ratio': 0.5,
        'domain_adaptation': True,
    }
    return MixedDataStrategy(**recommended)

五、Euro NCAP 合规路径

5.1 数据要求

要求	描述
场景覆盖	覆盖所有测试场景
多样性	不同年龄、性别、种族
光照条件	白天、夜晚、隧道等
遮挡条件	墨镜、口罩、帽子等
数据量	每场景 ≥1000 样本

5.2 验证流程

"""
Euro NCAP 合规验证
"""

class ENCAPDataValidator:
    """Checks a generated dataset against the Euro NCAP data requirements.

    Three things are verified: scenario coverage, the minimum sample count
    per scenario, and basic attribute diversity.
    """

    # (label used in messages, sample sub-dict, key) for each diversity check.
    _DIVERSITY_CHECKS = (
        ('年龄', 'driver', 'age'),
        ('性别', 'driver', 'gender'),
        ('种族', 'driver', 'ethnicity'),
        ('光照条件', 'scene', 'lighting'),
    )
    # "Different ages / genders / ..." needs at least two distinct values.
    _MIN_DISTINCT_VALUES = 2

    def __init__(self):
        """Set the required scenario ids and the per-scenario sample minimum."""
        self.required_scenarios = [
            'F-01', 'F-02', 'F-03', 'F-04', 'F-05',  # fatigue
            'D-01', 'D-02', 'D-03', 'D-04', 'D-05', 'D-06', 'D-07', 'D-08',  # distraction
            'CD-01', 'CD-02', 'CD-03'  # cognitive distraction
        ]

        self.min_samples_per_scenario = 1000

    def validate(self, dataset: Dict) -> Tuple[bool, List[str]]:
        """Validate a dataset and collect every problem found.

        Args:
            dataset: Dataset dict with ``metadata['scenarios']`` mapping
                scenario id to a dict containing ``count``; an optional
                ``samples`` list enables the diversity checks.

        Returns:
            is_valid: True when no issue was found.
            issues: Human-readable issue descriptions.
        """
        issues = []
        scenario_info = dataset['metadata']['scenarios']

        # 1. Scenario coverage. Sorted so the message is identical on every
        #    run (a raw set prints in arbitrary order).
        missing = sorted(set(self.required_scenarios) - set(scenario_info))
        if missing:
            listed = "{" + ", ".join(repr(scenario_id) for scenario_id in missing) + "}"
            issues.append(f"缺失场景: {listed}")

        # 2. Sample count per scenario present in the dataset.
        for scenario_id, info in scenario_info.items():
            if info['count'] < self.min_samples_per_scenario:
                issues.append(
                    f"{scenario_id}: 样本数不足 ({info['count']} < {self.min_samples_per_scenario})"
                )

        # 3. Attribute diversity.
        issues.extend(self._check_diversity(dataset))

        return not issues, issues

    def _check_diversity(self, dataset: Dict) -> List[str]:
        """Check that age, gender, ethnicity and lighting actually vary.

        Each attribute must take at least two distinct values across
        ``dataset['samples']``. Datasets without samples (metadata only) are
        not assessed and produce no diversity issues.

        Returns:
            One issue string per attribute that lacks diversity.
        """
        samples = dataset.get('samples') or []
        if not samples:
            return []

        issues = []
        for label, group, key in self._DIVERSITY_CHECKS:
            distinct = {
                sample[group][key]
                for sample in samples
                if key in (sample.get(group) or {})
            }
            if len(distinct) < self._MIN_DISTINCT_VALUES:
                issues.append(f"多样性不足: {label} 仅有 {len(distinct)} 种取值")

        return issues

六、IMS 开发建议

6.1 数据策略

阶段	真实数据	合成数据	说明
初期	20%	80%	快速启动
中期	50%	50%	性能最优
后期	70%	30%	精细优化

6.2 平台选择

平台	特点	适用场景
Anyverse	高保真渲染	Euro NCAP 合规
Unity	快速原型	早期开发
NVIDIA Omniverse	物理准确	传感器模拟
Blender	开源免费	学术研究

总结

合成数据的关键要点：

优势： 快速、低成本、无限场景
最佳配比： 50% 真实 + 50% 合成
域随机化： 提升泛化能力
Euro NCAP 合规： 覆盖所有测试场景

参考来源：

Anyverse Official Website
Euro NCAP 2026 Protocols
Domain Randomization for Deep Learning
Synthetic Data for Autonomous Driving Survey