ONNX Runtime边缘部署:ARM Cortex平台INT8量化实战

ONNX Runtime边缘部署:ARM Cortex平台INT8量化实战

来源: ONNX Runtime官方 + ARM开发者文档
发布时间: 2026年4月
核心价值: INT8量化降低延迟50%,ARM平台优化方案


核心洞察

ONNX Runtime量化优势:

平台 FP32延迟 INT8延迟 加速比
ARM Cortex-A72 120ms 55ms 2.2x
ARM Cortex-A53 180ms 90ms 2.0x
x86_64 AVX2 80ms 40ms 2.0x

量化精度影响:

  • 静态量化:精度损失<1%
  • 动态量化:精度损失<2%
  • 量化感知训练:精度无损

一、量化基础

1.1 量化类型

类型 描述 适用场景
动态量化 运行时量化激活 快速部署
静态量化 离线量化权重+激活 最优性能
量化感知训练 训练时模拟量化 最高精度

1.2 ONNX量化流程

"""
ONNX模型量化完整流程
"""

import onnx
from onnxruntime.quantization import (
quantize_dynamic,
quantize_static,
QuantFormat,
QuantType,
CalibrationDataReader
)
import numpy as np
from typing import List

class ONNXQuantizer:
    """ONNX model quantizer.

    Supports:
    1. Dynamic quantization (fast deployment)
    2. Static quantization (best runtime performance)
    3. Calibration-data handling for static quantization
    """

    def __init__(self, model_path: str):
        """Load and structurally validate the model to be quantized.

        Args:
            model_path: Path to the source ONNX model.
        """
        self.model_path = model_path
        self.model = onnx.load(model_path)

        # Fail fast on malformed models before doing any quantization work.
        onnx.checker.check_model(self.model)

        print(f"加载模型: {model_path}")
        print(f"输入: {[inp.name for inp in self.model.graph.input]}")
        print(f"输出: {[out.name for out in self.model.graph.output]}")

    def dynamic_quantization(self,
                             output_path: str,
                             weight_type: QuantType = QuantType.QInt8) -> str:
        """Quantize weights offline; activations are quantized at runtime.

        Args:
            output_path: Where to write the quantized model.
            weight_type: Target weight quantization type.

        Returns:
            Path of the quantized model (== output_path).
        """
        print("\n=== 动态量化 ===")

        quantize_dynamic(
            model_input=self.model_path,
            model_output=output_path,
            weight_type=weight_type,
            # NOTE(review): optimize_model was deprecated/removed in newer
            # onnxruntime releases — drop this kwarg when upgrading.
            optimize_model=True
        )

        self._report_compression(output_path)
        return output_path

    def static_quantization(self,
                            output_path: str,
                            calibration_data: List[np.ndarray],
                            input_name: str = "input") -> str:
        """Quantize weights and activations offline using calibration data.

        Args:
            output_path: Where to write the quantized model.
            calibration_data: Representative input batches for calibration.
            input_name: Name of the model's input tensor.

        Returns:
            Path of the quantized model (== output_path).
        """
        print("\n=== 静态量化 ===")

        # Shape-inference pre-processing is recommended before static
        # quantization so every intermediate tensor has a known shape.
        preprocessed_path = output_path.replace(".onnx", "_preprocessed.onnx")

        from onnxruntime.quantization import shape_inference
        shape_inference.quant_pre_process(
            self.model_path,
            preprocessed_path
        )

        class CalibDataReader(CalibrationDataReader):
            """Feeds calibration batches one at a time to the quantizer."""

            def __init__(self, data_list, input_name):
                self.data_list = data_list
                self.input_name = input_name
                self.index = 0

            def get_next(self):
                # Returning None signals the end of the calibration data.
                if self.index >= len(self.data_list):
                    return None
                batch = {self.input_name: self.data_list[self.index]}
                self.index += 1
                return batch

            def rewind(self):
                self.index = 0

        dr = CalibDataReader(calibration_data, input_name)

        quantize_static(
            model_input=preprocessed_path,
            model_output=output_path,
            calibration_data_reader=dr,
            quant_format=QuantFormat.QDQ,  # QDQ format is portable across EPs
            per_channel=False,
            weight_type=QuantType.QInt8,
            activation_type=QuantType.QUInt8
        )

        self._report_compression(output_path)
        return output_path

    def _report_compression(self, quantized_path: str) -> None:
        """Print original vs. quantized size and the compression ratio.

        Factored out: this code was duplicated verbatim in both
        quantization paths.
        """
        original_size = self._get_file_size(self.model_path)
        quantized_size = self._get_file_size(quantized_path)

        print(f"原始模型: {original_size:.2f} MB")
        print(f"量化模型: {quantized_size:.2f} MB")
        # Guard against a zero-byte output file (division by zero).
        if quantized_size > 0:
            print(f"压缩比: {original_size / quantized_size:.2f}x")

    def _get_file_size(self, path: str) -> float:
        """Return the size of *path* in megabytes."""
        import os
        return os.path.getsize(path) / (1024 * 1024)


# Demo: quantize an existing model.
if __name__ == "__main__":
    # A previously exported model is assumed to exist at this path.
    model_path = "fatiguenet.onnx"

    quantizer = ONNXQuantizer(model_path)

    # Dynamic quantization target (call commented out for safety).
    dynamic_path = "fatiguenet_dynamic.onnx"
    # quantizer.dynamic_quantization(dynamic_path)

    # Static quantization needs representative calibration batches.
    calibration_data = [
        np.random.randn(1, 3, 224, 224).astype(np.float32)
        for _ in range(100)
    ]
    static_path = "fatiguenet_static.onnx"
    # quantizer.static_quantization(static_path, calibration_data)

二、ARM平台优化

2.1 ARM特定优化

"""
ARM Cortex平台ONNX Runtime优化
"""

import onnxruntime as ort
import numpy as np
from typing import List, Dict

class ARMInferenceEngine:
    """ARM-platform ONNX Runtime inference engine.

    Optimization strategy:
    1. ONNX Runtime CPU execution provider (NEON-capable on ARM builds)
    2. Thread-pool tuning
    3. Memory-arena / memory-pattern reuse
    """

    def __init__(self,
                 model_path: str,
                 num_threads: int = 4,
                 use_arena: bool = True):
        """Create the inference session.

        Args:
            model_path: Path to the (possibly quantized) ONNX model.
            num_threads: Intra-op thread-pool size.
            use_arena: Enable memory-pattern planning and buffer reuse.
        """
        sess_options = ort.SessionOptions()

        # Parallelism inside each op; run ops sequentially (inter_op=1),
        # which is usually best for small single-stream edge models.
        sess_options.intra_op_num_threads = num_threads
        sess_options.inter_op_num_threads = 1

        # Enable all graph-level optimizations (fusion, constant folding, ...).
        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

        if use_arena:
            sess_options.enable_mem_pattern = True
            sess_options.enable_mem_reuse = True

        self.session = ort.InferenceSession(
            model_path,
            sess_options,
            providers=['CPUExecutionProvider']
        )

        # Cache input/output metadata so infer() doesn't re-query per call.
        self.input_names = [inp.name for inp in self.session.get_inputs()]
        self.output_names = [out.name for out in self.session.get_outputs()]
        self.input_shapes = {
            inp.name: inp.shape
            for inp in self.session.get_inputs()
        }

        print(f"加载模型: {model_path}")
        print(f"线程数: {num_threads}")
        print(f"输入: {self.input_names}")
        print(f"输出: {self.output_names}")

    def infer(self, inputs: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
        """Run one forward pass.

        Args:
            inputs: Mapping from input name to tensor.

        Returns:
            Mapping from output name to tensor.
        """
        outputs = self.session.run(self.output_names, inputs)
        return dict(zip(self.output_names, outputs))

    def benchmark(self,
                  inputs: Dict[str, np.ndarray],
                  num_runs: int = 100,
                  warmup: int = 10) -> Dict[str, float]:
        """Measure inference latency.

        Args:
            inputs: Input tensors reused for every run.
            num_runs: Number of timed runs.
            warmup: Untimed warm-up runs (caches, lazy allocations).

        Returns:
            Latency statistics in milliseconds.
        """
        import time

        for _ in range(warmup):
            self.infer(inputs)

        latencies = []
        for _ in range(num_runs):
            # FIX: use the monotonic, high-resolution perf_counter;
            # time.time() (used previously) can jump with wall-clock
            # adjustments and skew latency statistics.
            start = time.perf_counter()
            self.infer(inputs)
            latencies.append((time.perf_counter() - start) * 1000)

        latencies = np.array(latencies)

        return {
            'mean_ms': np.mean(latencies),
            'std_ms': np.std(latencies),
            'min_ms': np.min(latencies),
            'max_ms': np.max(latencies),
            'p50_ms': np.percentile(latencies, 50),
            'p95_ms': np.percentile(latencies, 95),
            'p99_ms': np.percentile(latencies, 99),
        }


# Demo: load the INT8 model, run one inference, then benchmark it.
if __name__ == "__main__":
    engine = ARMInferenceEngine(
        model_path="fatiguenet_int8.onnx",
        num_threads=4
    )

    # Single random NCHW image as the input feed.
    feed = {
        "input": np.random.randn(1, 3, 224, 224).astype(np.float32)
    }

    # One inference to verify the output shape.
    result = engine.infer(feed)
    print(f"输出形状: {result['output'].shape}")

    # Latency benchmark over 100 runs.
    stats = engine.benchmark(feed, num_runs=100)

    print("\n=== 性能统计 ===")
    print(f"平均延迟: {stats['mean_ms']:.2f} ms")
    print(f"P50延迟: {stats['p50_ms']:.2f} ms")
    print(f"P95延迟: {stats['p95_ms']:.2f} ms")
    print(f"P99延迟: {stats['p99_ms']:.2f} ms")

2.2 ARM NEON优化

"""
ARM NEON SIMD优化
"""

import numpy as np

class ARMNeonOptimizer:
    """ARM NEON-oriented optimization helpers.

    Techniques illustrated:
    - vectorized computation
    - SIMD-friendly data layout (im2col)
    - memory-contiguous GEMM
    """

    @staticmethod
    def is_neon_available() -> bool:
        """Return True on ARM machines (armv7l/aarch64/arm64), where the
        NEON SIMD unit is generally present."""
        import platform
        machine = platform.machine().lower()
        return machine in ['armv7l', 'aarch64', 'arm64']

    @staticmethod
    def optimize_conv2d(input_data: np.ndarray,
                        weights: np.ndarray,
                        bias: "np.ndarray | None" = None) -> np.ndarray:
        """Valid (no padding, stride 1) 2D convolution via im2col + GEMM.

        Args:
            input_data: Input tensor of shape (N, C, H, W).
            weights: Filters of shape (F, C, kH, kW).
            bias: Optional per-filter bias of shape (F,).

        Returns:
            Output of shape (N, F, H-kH+1, W-kW+1) with the same dtype as
            the input. (FIX: the im2col buffer previously defaulted to
            float64, silently promoting float32 inputs.)
        """
        assert input_data.ndim == 4  # (N, C, H, W)
        assert weights.ndim == 4     # (F, C, kH, kW)

        N, C, H, W = input_data.shape
        F, _, kH, kW = weights.shape

        # "Valid" output size for stride-1, unpadded convolution.
        out_h = H - kH + 1
        out_w = W - kW + 1

        # Unfold patches into rows so the convolution becomes one GEMM.
        col = ARMNeonOptimizer._im2col(input_data, kH, kW)

        # GEMM: (N*out_h*out_w, C*kH*kW) @ (C*kH*kW, F)
        weights_reshaped = weights.reshape(F, -1)
        output = col @ weights_reshaped.T

        if bias is not None:
            output += bias.reshape(1, -1)

        # (N*out_h*out_w, F) -> (N, F, out_h, out_w)
        output = output.reshape(N, out_h, out_w, F).transpose(0, 3, 1, 2)

        return output

    @staticmethod
    def _im2col(input_data: np.ndarray, kH: int, kW: int) -> np.ndarray:
        """Unfold kH x kW sliding patches into rows of a 2-D matrix."""
        N, C, H, W = input_data.shape
        out_h = H - kH + 1
        out_w = W - kW + 1

        # Match the input dtype: np.zeros without dtype is float64 and
        # would promote the whole computation.
        col = np.zeros((N, C, kH, kW, out_h, out_w), dtype=input_data.dtype)

        # One strided copy per kernel offset instead of per output pixel.
        for y in range(kH):
            y_max = y + out_h
            for x in range(kW):
                x_max = x + out_w
                col[:, :, y, x, :, :] = input_data[:, :, y:y_max, x:x_max]

        # Row order: (N, out_h, out_w); column order: (C, kH, kW).
        return col.transpose(0, 4, 5, 1, 2, 3).reshape(N * out_h * out_w, -1)


# Demo: report NEON availability and run one convolution.
if __name__ == "__main__":
    print(f"NEON可用: {ARMNeonOptimizer.is_neon_available()}")

    # Random 224x224 RGB image convolved with 64 3x3 filters.
    image = np.random.randn(1, 3, 224, 224).astype(np.float32)
    kernels = np.random.randn(64, 3, 3, 3).astype(np.float32)
    offsets = np.random.randn(64).astype(np.float32)

    result = ARMNeonOptimizer.optimize_conv2d(image, kernels, offsets)
    print(f"卷积输出: {result.shape}")

三、DMS部署实战

3.1 DMS模型量化部署

"""
DMS模型量化部署完整流程
"""

import numpy as np
import onnxruntime as ort
from typing import Dict, Tuple, Optional
from dataclasses import dataclass
from enum import Enum

class FatigueLevel(Enum):
    """Driver fatigue severity, in ascending order of risk."""
    AWAKE = 0      # fully alert
    MILD = 1       # slight fatigue
    MODERATE = 2   # noticeable fatigue
    SEVERE = 3     # dangerous fatigue

@dataclass
class DMSConfig:
    """Runtime configuration for the quantized DMS pipeline."""
    model_path: str                                   # path to the ONNX model
    num_threads: int = 4                              # intra-op thread count
    input_size: Tuple[int, int, int] = (3, 224, 224)  # CHW model input shape
    quantized: bool = True                            # model is INT8-quantized

class QuantizedDMS:
    """Quantized driver-monitoring (DMS) fatigue detector.

    Features:
    1. INT8 quantized inference
    2. ARM-tuned runtime (via ARMInferenceEngine)
    3. Real-time fatigue-level classification
    """

    def __init__(self, config: DMSConfig):
        """Build the inference engine and reset runtime statistics."""
        self.config = config

        self.engine = ARMInferenceEngine(
            model_path=config.model_path,
            num_threads=config.num_threads
        )

        # Class labels, index-aligned with FatigueLevel values.
        self.labels = ['awake', 'mild', 'moderate', 'severe']

        # Cumulative counters, aggregated by get_stats().
        self.stats = {
            'total_inferences': 0,
            'total_latency_ms': 0,
        }

    def detect(self,
               face_image: np.ndarray,
               return_latency: bool = False) -> Tuple[FatigueLevel, float, Optional[float]]:
        """Classify the fatigue level of one face crop.

        Args:
            face_image: Face image of shape (H, W, 3) — assumed uint8;
                channel order must match training (TODO confirm BGR/RGB).
            return_latency: Also return the end-to-end latency.

        Returns:
            (fatigue_level, confidence, latency_ms); latency_ms is None
            unless return_latency is True.
        """
        import time
        # FIX: perf_counter is monotonic and high-resolution; time.time()
        # (used previously) can jump with wall-clock adjustments.
        start = time.perf_counter()

        preprocessed = self._preprocess(face_image)

        outputs = self.engine.infer({"input": preprocessed})

        # logits -> probabilities -> argmax class
        logits = outputs['output'][0]
        probs = self._softmax(logits)

        level_idx = np.argmax(probs)
        confidence = float(probs[level_idx])

        latency = (time.perf_counter() - start) * 1000
        self.stats['total_inferences'] += 1
        self.stats['total_latency_ms'] += latency

        if return_latency:
            return FatigueLevel(level_idx), confidence, latency

        return FatigueLevel(level_idx), confidence, None

    def _preprocess(self, image: np.ndarray) -> np.ndarray:
        """Resize, normalize with ImageNet statistics, and reshape to NCHW."""
        import cv2
        resized = cv2.resize(image, (224, 224))

        # Scale to [0, 1], then standardize per channel.
        normalized = resized.astype(np.float32) / 255.0
        mean = np.array([0.485, 0.456, 0.406])
        std = np.array([0.229, 0.224, 0.225])
        normalized = (normalized - mean) / std

        # HWC -> CHW, add batch dim; cast back to float32 because the
        # float64 mean/std promoted the intermediate array.
        chw = normalized.transpose(2, 0, 1)
        batched = np.expand_dims(chw, 0)

        return batched.astype(np.float32)

    def _softmax(self, x: np.ndarray) -> np.ndarray:
        """Numerically stable softmax (shift by the max logit)."""
        exp_x = np.exp(x - np.max(x))
        return exp_x / exp_x.sum()

    def get_stats(self) -> dict:
        """Return cumulative counters plus the running average latency."""
        if self.stats['total_inferences'] == 0:
            return self.stats

        return {
            **self.stats,
            'avg_latency_ms': self.stats['total_latency_ms'] / self.stats['total_inferences']
        }


# Demo: run the quantized DMS on a synthetic camera frame.
if __name__ == "__main__":
    # Assemble the detector from its configuration.
    config = DMSConfig(
        model_path="fatiguenet_int8.onnx",
        num_threads=4,
        quantized=True
    )
    dms = QuantizedDMS(config)

    # Random VGA frame stands in for a camera capture.
    face = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)

    # Ten detections, logging every third one.
    for i in range(10):
        level, conf, latency = dms.detect(face, return_latency=True)
        if i % 3 == 0:
            print(f"检测{i}: 疲劳={level.name}, 置信度={conf:.2f}, 延迟={latency:.1f}ms")

    stats = dms.get_stats()
    print(f"\n平均延迟: {stats['avg_latency_ms']:.1f}ms")

四、性能对比

4.1 量化效果对比

模型 原始大小 量化大小 压缩比 精度损失
ResNet18 44.7 MB 11.4 MB 3.9x 0.3%
MobileNetV2 14.0 MB 3.7 MB 3.8x 0.5%
EfficientNet-B0 20.4 MB 5.3 MB 3.8x 0.4%

4.2 平台性能对比

平台 FP32 FPS INT8 FPS 加速比
ARM Cortex-A53 15 35 2.3x
ARM Cortex-A72 25 55 2.2x
ARM Cortex-A76 40 90 2.3x

五、最佳实践

5.1 部署检查清单

检查项 要求 验证方法
[ ] 模型opset版本 ≥10 onnx.checker
[ ] 量化校准数据 100-1000样本 数据代表性
[ ] 精度验证 损失<1% 测试集评估
[ ] 延迟测试 满足实时要求 基准测试
[ ] 内存占用 <500MB 内存分析

5.2 常见问题解决

问题 原因 解决方案
精度下降严重 校准数据不足 增加校准样本
延迟未降低 算子未量化 检查量化范围
内存溢出 模型过大 分块推理

六、总结

6.1 核心要点

  1. 静态量化性能最优
  2. ARM NEON可额外加速
  3. 校准数据质量决定精度
  4. opset版本≥10才能量化

6.2 推荐配置

场景 配置 理由
实时DMS 静态INT8 + 4线程 最优延迟
低功耗设备 动态INT8 + 2线程 平衡性能
高精度需求 QAT训练 + INT8 精度无损

参考链接:


ONNX Runtime边缘部署:ARM Cortex平台INT8量化实战
https://dapalm.com/2026/04/24/2026-04-24-onnx-quantization-arm-deployment/
作者
Mars
发布于
2026年4月24日
许可协议