边缘 AI 模型压缩实战:DMS 实时部署优化指南

边缘 AI 模型压缩实战:DMS 实时部署优化指南

背景

Euro NCAP 2026 要求 DMS 实时检测,车载平台算力有限。模型压缩是必须掌握的技术。


模型压缩技术概览

压缩方法对比

方法 压缩率 精度损失 实现难度
量化(INT8) 4x <1%
剪枝(结构化) 2-5x 1-3%
知识蒸馏 2-10x <2%
混合精度 2x <0.5%

1. 量化优化

量化类型

类型 精度 用途
FP32 全精度 训练、基准
FP16 半精度 GPU 推理
INT8 8位整数 NPU 加速
INT4 4位整数 极限压缩

PyTorch 量化实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import torch
import torch.nn as nn
import torchvision.models as models

class DMSModel(nn.Module):
"""DMS 模型示例"""

def __init__(self, num_classes: int = 2):
super().__init__()

# 骨干网络
mobilenet = models.mobilenet_v3_small(pretrained=True)
self.features = mobilenet.features

# 分类头
self.classifier = nn.Sequential(
nn.Linear(576, 128),
nn.ReLU(),
nn.Dropout(0.2),
nn.Linear(128, num_classes)
)

def forward(self, x):
x = self.features(x)
x = x.mean([2, 3]) # 全局平均池化
x = self.classifier(x)
return x


# ===== 动态量化 =====
def apply_dynamic_quantization(model: nn.Module) -> nn.Module:
"""
动态量化

优点:简单,无需校准数据
缺点:仅量化权重,激活值仍为 FP32
"""
quantized_model = torch.quantization.quantize_dynamic(
model,
{nn.Linear, nn.Conv2d},
dtype=torch.qint8
)
return quantized_model


# ===== 静态量化 =====
def apply_static_quantization(
model: nn.Module,
calibration_loader
) -> nn.Module:
"""
静态量化

优点:最大压缩,INT8 激活值
缺点:需要校准数据
"""
# 1. 设置量化配置
model.qconfig = torch.quantization.get_default_qconfig('qnnpack')

# 2. 融合模块(可选,提高性能)
model = torch.quantization.fuse_modules(model, [['features', 'classifier']])

# 3. 准备量化
torch.quantization.prepare(model, inplace=True)

# 4. 校准
model.eval()
with torch.no_grad():
for data, _ in calibration_loader:
model(data)

# 5. 转换
torch.quantization.convert(model, inplace=True)

return model


# ===== 量化感知训练(QAT)=====
class QATTrainer:
"""
量化感知训练

在训练中模拟量化误差,提高 INT8 精度
"""

def __init__(self, model: nn.Module):
self.model = model

# 设置 QAT 配置
self.model.qconfig = torch.quantization.get_default_qat_qconfig('qnnpack')

# 准备 QAT
torch.quantization.prepare_qat(self.model, inplace=True)

def train_step(self, data, target, optimizer):
"""训练步骤"""
self.model.train()

optimizer.zero_grad()
output = self.model(data)
loss = nn.functional.cross_entropy(output, target)
loss.backward()
optimizer.step()

return loss.item()

def get_quantized_model(self) -> nn.Module:
"""获取量化后模型"""
self.model.eval()
quantized = torch.quantization.convert(self.model)
return quantized


# 测试
if __name__ == "__main__":
# 创建模型
model = DMSModel()

# 动态量化
dynamic_quantized = apply_dynamic_quantization(model)

# 性能对比
dummy_input = torch.randn(1, 3, 224, 224)

import time

# FP32
model.eval()
start = time.time()
for _ in range(100):
model(dummy_input)
fp32_time = (time.time() - start) / 100

# INT8
dynamic_quantized.eval()
start = time.time()
for _ in range(100):
dynamic_quantized(dummy_input)
int8_time = (time.time() - start) / 100

print(f"FP32: {fp32_time*1000:.2f}ms")
print(f"INT8: {int8_time*1000:.2f}ms")
print(f"加速: {fp32_time/int8_time:.2f}x")

ONNX 量化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import onnx
from onnxruntime.quantization import quantize_dynamic, QuantType

def quantize_onnx_model(
input_model: str,
output_model: str
):
"""
ONNX 模型量化

Args:
input_model: 输入 ONNX 模型路径
output_model: 输出量化模型路径
"""
quantize_dynamic(
model_input=input_model,
model_output=output_model,
weight_type=QuantType.QInt8,
optimize_model=True
)

print(f"量化完成: {output_model}")

# 比较模型大小
import os
original_size = os.path.getsize(input_model) / 1024 / 1024
quantized_size = os.path.getsize(output_model) / 1024 / 1024

print(f"原始大小: {original_size:.2f} MB")
print(f"量化大小: {quantized_size:.2f} MB")
print(f"压缩比: {original_size/quantized_size:.2f}x")


# 示例
if __name__ == "__main__":
quantize_onnx_model(
"dms_model.onnx",
"dms_model_int8.onnx"
)

2. 剪枝优化

剪枝类型

类型 描述 优点 缺点
非结构化剪枝 随机删除权重 高压缩率 难以加速
结构化剪枝 删除整个通道/层 硬件友好 精度损失大
半结构化剪枝 N:M 稀疏模式 平衡压缩与加速 需硬件支持

结构化剪枝实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune

class ChannelPruner:
"""
通道剪枝器

剪枝整个卷积通道
"""

def __init__(
self,
model: nn.Module,
pruning_ratio: float = 0.3
):
self.model = model
self.pruning_ratio = pruning_ratio

def compute_importance(self, module: nn.Module) -> torch.Tensor:
"""
计算通道重要性

使用 L1 范数作为重要性指标
"""
if isinstance(module, nn.Conv2d):
weight = module.weight.data
# 计算每个输出通道的 L1 范数
importance = weight.abs().sum(dim=(1, 2, 3))
return importance
return None

def prune_conv_layer(
self,
conv: nn.Conv2d,
num_channels_to_prune: int
):
"""剪枝卷积层"""
importance = self.compute_importance(conv)

# 找到最不重要的通道
_, indices = torch.sort(importance)
prune_indices = indices[:num_channels_to_prune]

# 创建掩码
mask = torch.ones(conv.out_channels)
mask[prune_indices] = 0

# 应用剪枝
prune.custom_from_mask(conv, name='weight', mask=mask)

def prune_model(self):
"""剪枝整个模型"""
for name, module in self.model.named_modules():
if isinstance(module, nn.Conv2d):
num_channels = module.out_channels
num_to_prune = int(num_channels * self.pruning_ratio)
self.prune_conv_layer(module, num_to_prune)

return self.model


class FineGrainedPruner:
"""
细粒度剪枝

基于 TensorFlow Model Optimization Toolkit 风格
"""

def __init__(
self,
model: nn.Module,
target_sparsity: float = 0.5
):
self.model = model
self.target_sparsity = target_sparsity

def iterative_pruning(
self,
train_loader,
num_iterations: int = 10,
epochs_per_iteration: int = 5
):
"""
迭代剪枝

逐步增加稀疏度,配合微调
"""
optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-4)

for iteration in range(num_iterations):
# 计算当前稀疏度目标
current_sparsity = self.target_sparsity * (iteration + 1) / num_iterations

# 剪枝
self._apply_global_pruning(current_sparsity)

# 微调
self._finetune(train_loader, optimizer, epochs_per_iteration)

print(f"Iteration {iteration+1}/{num_iterations}, "
f"Sparsity: {current_sparsity:.2%}")

return self.model

def _apply_global_pruning(self, sparsity: float):
"""应用全局剪枝"""
parameters_to_prune = []

for name, module in self.model.named_modules():
if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear):
parameters_to_prune.append((module, 'weight'))

prune.global_unstructured(
parameters_to_prune,
pruning_method=prune.L1Unstructured,
amount=sparsity
)

def _finetune(self, train_loader, optimizer, epochs: int):
"""微调"""
self.model.train()

for epoch in range(epochs):
for data, target in train_loader:
optimizer.zero_grad()
output = self.model(data)
loss = nn.functional.cross_entropy(output, target)
loss.backward()
optimizer.step()


# 测试
if __name__ == "__main__":
model = DMSModel()

# 通道剪枝
pruner = ChannelPruner(model, pruning_ratio=0.3)
pruned_model = pruner.prune_model()

# 检查稀疏度
total_params = 0
zero_params = 0

for name, param in pruned_model.named_parameters():
total_params += param.numel()
zero_params += (param == 0).sum().item()

print(f"稀疏度: {zero_params/total_params:.2%}")

3. 知识蒸馏

蒸馏框架

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import torch
import torch.nn as nn
import torch.nn.functional as F

class DistillationLoss(nn.Module):
"""
知识蒸馏损失

结合硬标签损失和软标签损失
"""

def __init__(
self,
temperature: float = 4.0,
alpha: float = 0.7
):
super().__init__()
self.temperature = temperature
self.alpha = alpha

def forward(
self,
student_logits: torch.Tensor,
teacher_logits: torch.Tensor,
labels: torch.Tensor
) -> torch.Tensor:
"""
计算蒸馏损失

Args:
student_logits: 学生模型输出
teacher_logits: 教师模型输出
labels: 真实标签

Returns:
loss: 总损失
"""
# 软标签损失(KL 散度)
soft_loss = F.kl_div(
F.log_softmax(student_logits / self.temperature, dim=1),
F.softmax(teacher_logits / self.temperature, dim=1),
reduction='batchmean'
) * (self.temperature ** 2)

# 硬标签损失
hard_loss = F.cross_entropy(student_logits, labels)

# 加权组合
total_loss = self.alpha * soft_loss + (1 - self.alpha) * hard_loss

return total_loss


class DistillationTrainer:
"""
知识蒸馏训练器
"""

def __init__(
self,
teacher_model: nn.Module,
student_model: nn.Module,
temperature: float = 4.0,
alpha: float = 0.7
):
self.teacher = teacher_model
self.student = student_model

# 冻结教师模型
for param in self.teacher.parameters():
param.requires_grad = False
self.teacher.eval()

self.criterion = DistillationLoss(temperature, alpha)
self.optimizer = torch.optim.Adam(student_model.parameters(), lr=1e-4)

def train_step(
self,
data: torch.Tensor,
labels: torch.Tensor
) -> dict:
"""训练步骤"""
self.student.train()

# 教师模型推理
with torch.no_grad():
teacher_logits = self.teacher(data)

# 学生模型推理
student_logits = self.student(data)

# 计算损失
loss = self.criterion(student_logits, teacher_logits, labels)

# 反向传播
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()

return {
'loss': loss.item(),
'teacher_logits': teacher_logits,
'student_logits': student_logits
}


# 测试
if __name__ == "__main__":
# 教师模型(大模型)
teacher = models.resnet50(pretrained=True)
teacher.fc = nn.Linear(2048, 2)

# 学生模型(小模型)
student = models.mobilenet_v3_small(pretrained=True)
student.classifier[-1] = nn.Linear(1024, 2)

# 蒸馏训练
trainer = DistillationTrainer(teacher, student)

# 模拟训练
data = torch.randn(32, 3, 224, 224)
labels = torch.randint(0, 2, (32,))

for i in range(10):
result = trainer.train_step(data, labels)
print(f"Step {i}, Loss: {result['loss']:.4f}")

4. 高通 NPU 部署

QCS8255 平台特性

参数 规格
CPU 8x Kryo CPU
NPU Hexagon NPU, 26 TOPS
内存 LPDDR5, 最高 16GB
功耗 5-15W

SNPE/QNN 部署

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import subprocess
import os

class SnapdragonDeployment:
"""
高通 Snapdragon 部署工具

使用 SNPE/QNN 将模型部署到 NPU
"""

def __init__(self, sdk_path: str):
self.sdk_path = sdk_path
self.snpe_path = os.path.join(sdk_path, 'bin', 'x86_64-linux-clang')

def convert_to_dlc(
self,
onnx_model: str,
output_dlc: str
):
"""
将 ONNX 转换为 DLC 格式

DLC 是 SNPE 的模型格式
"""
cmd = [
os.path.join(self.snpe_path, 'snpe-onnx-to-dlc'),
'--input_network', onnx_model,
'--output_path', output_dlc
]

result = subprocess.run(cmd, capture_output=True, text=True)

if result.returncode == 0:
print(f"DLC 转换成功: {output_dlc}")
else:
print(f"转换失败: {result.stderr}")

def quantize_dlc(
self,
input_dlc: str,
output_dlc: str,
calibration_data: str
):
"""
量化 DLC 模型

Args:
input_dlc: 输入 DLC 模型
output_dlc: 输出量化模型
calibration_data: 校准数据列表文件
"""
cmd = [
os.path.join(self.snpe_path, 'snpe-dlc-quantize'),
'--input_dlc', input_dlc,
'--input_list', calibration_data,
'--output_dlc', output_dlc
]

result = subprocess.run(cmd, capture_output=True, text=True)

if result.returncode == 0:
print(f"量化成功: {output_dlc}")
else:
print(f"量化失败: {result.stderr}")

def benchmark(
self,
dlc_model: str,
device: str = 'cpu'
):
"""
性能测试

Args:
dlc_model: DLC 模型路径
device: 'cpu' / 'gpu' / 'dsp' / 'npu'
"""
cmd = [
os.path.join(self.snpe_path, 'snpe-benchmark'),
'--model', dlc_model,
'--runtime', device
]

result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stdout)


# QNN 部署
class QNNDeployment:
"""
QNN(Qualcomm AI Neural Network)部署

新一代部署框架
"""

def __init__(self, qnn_sdk_path: str):
self.sdk_path = qnn_sdk_path

def convert_to_qnn(
self,
onnx_model: str,
output_dir: str
):
"""转换为 QNN 格式"""
cmd = [
os.path.join(self.sdk_path, 'bin', 'qnn-onnx-converter'),
'--input_model', onnx_model,
'--output_path', os.path.join(output_dir, 'model.cpp')
]

subprocess.run(cmd)

def compile_qnn_model(
self,
model_cpp: str,
output_so: str,
target_arch: str = 'aarch64'
):
"""编译 QNN 模型"""
cmd = [
os.path.join(self.sdk_path, 'bin', 'qnn-model-lib-generator'),
'--model', model_cpp,
'--output', output_so,
'--target', target_arch
]

subprocess.run(cmd)


# 使用示例
if __name__ == "__main__":
# 假设 SNPE SDK 路径
sdk_path = "/opt/qcom/snpe"

deployer = SnapdragonDeployment(sdk_path)

# 1. 转换 ONNX 到 DLC
deployer.convert_to_dlc(
"dms_model.onnx",
"dms_model.dlc"
)

# 2. 量化
deployer.quantize_dlc(
"dms_model.dlc",
"dms_model_quantized.dlc",
"calibration_list.txt"
)

# 3. 性能测试
deployer.benchmark("dms_model_quantized.dlc", device='npu')

性能优化结果

DMS 模型优化案例

优化方法 模型大小 推理时间 精度 加速比
原始 FP32 45MB 25ms 94.2% 1.0x
FP16 23MB 15ms 94.1% 1.7x
INT8 量化 12MB 8ms 93.8% 3.1x
30% 剪枝 32MB 18ms 93.5% 1.4x
蒸馏 MobileNet 10MB 6ms 92.8% 4.2x
量化+剪枝+蒸馏 6MB 3ms 92.5% 8.3x

NPU vs CPU 性能

平台 CPU 时间 NPU 时间 加速比
QCS8255 18ms 2.5ms 7.2x
QCS8295 15ms 2.0ms 7.5x
QCS8775 12ms 1.5ms 8.0x

参考文献

  1. Qualcomm, “Snapdragon Ride SDK Documentation”, 2025
  2. Han S., et al., “Deep Compression: Compressing DNNs with Pruning, Trained Quantization and Huffman Coding”, ICLR 2016
  3. Hinton G., et al., “Distilling the Knowledge in a Neural Network”, NIPS 2014

总结: 通过量化、剪枝、知识蒸馏的组合,DMS 模型可实现 8x 压缩和 8x 加速。建议优先采用 INT8 量化(简单高效),再考虑剪枝(结构化更佳),最后通过蒸馏训练更小的学生模型。部署时使用 NPU 加速,可获得额外 7-8x 加速。


边缘 AI 模型压缩实战:DMS 实时部署优化指南
https://dapalm.com/2026/06/04/2026-06-04-边缘AI模型压缩实战DMS实时部署优化指南/
作者
Mars
发布于
2026年6月4日
许可协议