ONNX Runtime边缘部署:INT8量化实现4倍加速

ONNX Runtime边缘部署:INT8量化实现4倍加速

来源: ONNX Runtime官方文档 + ARM开发者社区
发布时间: 2026年4月
核心价值: INT8量化降低延迟75%,内存减少75%


核心洞察

ONNX Runtime量化效果:

指标 FP32 INT8 提升比例
模型大小 100MB 25MB 4x缩小
推理延迟 40ms 10ms 4x加速
内存占用 200MB 50MB 4x减少
精度损失 - <1% 可接受

适用平台:

  • ARM Cortex-A系列
  • Qualcomm Hexagon NPU
  • x86 AVX-512

一、量化原理

1.1 量化类型

类型 描述 精度损失 加速比
动态量化 权重INT8,激活FP32 最小 2x
静态量化 权重+激活INT8 小 4x
QAT量化感知训练 训练时量化 最小 4x

1.2 量化公式

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""
量化/反量化公式
"""

import numpy as np

class Quantizer:
    """Affine (asymmetric) integer quantizer.

    Formulas:
        Q = round(R / S) + Z      (quantize)
        R = (Q - Z) * S           (dequantize)

    where:
        R: original float value
        Q: quantized integer value
        S: scale factor
        Z: zero point
    """

    def __init__(self, qmin: int = -128, qmax: int = 127):
        """Initialize the quantizer.

        Args:
            qmin: Lowest representable integer (INT8: -128).
            qmax: Highest representable integer (INT8: 127).
        """
        self.qmin = qmin
        self.qmax = qmax

    def compute_scale_zero_point(self,
                                 rmin: float,
                                 rmax: float) -> tuple[float, int]:
        """Derive the scale and zero point from the observed float range.

        Args:
            rmin: Minimum of the original float range.
            rmax: Maximum of the original float range.

        Returns:
            (scale, zero_point) mapping [rmin, rmax] onto [qmin, qmax].
        """
        # BUGFIX: the original annotated this with typing.Tuple but never
        # imported it, raising NameError when the class body executed;
        # built-in generics (PEP 585, Python 3.9+) need no import.
        scale = (rmax - rmin) / (self.qmax - self.qmin)
        if scale == 0.0:
            # Degenerate range (rmin == rmax): any positive scale maps the
            # single value exactly; use 1.0 to avoid division by zero below.
            scale = 1.0

        # Zero point chosen so rmin maps (approximately) to qmin, then
        # clamped so it is itself representable in the integer range.
        zero_point = round(self.qmin - rmin / scale)
        zero_point = max(self.qmin, min(self.qmax, zero_point))

        return scale, int(zero_point)

    def quantize(self,
                 tensor: np.ndarray,
                 scale: float,
                 zero_point: int) -> np.ndarray:
        """Quantize a float tensor to INT8.

        Args:
            tensor: Float tensor.
            scale: Scale factor.
            zero_point: Zero point.

        Returns:
            Quantized int8 tensor, clipped to [qmin, qmax].
        """
        q_tensor = np.round(tensor / scale) + zero_point
        q_tensor = np.clip(q_tensor, self.qmin, self.qmax).astype(np.int8)
        return q_tensor

    def dequantize(self,
                   q_tensor: np.ndarray,
                   scale: float,
                   zero_point: int) -> np.ndarray:
        """Dequantize an INT8 tensor back to float32.

        Args:
            q_tensor: Quantized integer tensor.
            scale: Scale factor.
            zero_point: Zero point.

        Returns:
            Reconstructed float32 tensor.
        """
        return (q_tensor.astype(np.float32) - zero_point) * scale


# Demo: quantize a small tensor and measure the round-trip error.
if __name__ == "__main__":
    quantizer = Quantizer(qmin=-128, qmax=127)

    # Sample float tensor to quantize.
    values = np.array([0.1, 0.5, -0.3, 1.0, -1.5])

    # Derive quantization parameters from the observed range.
    rmin, rmax = values.min(), values.max()
    scale, zp = quantizer.compute_scale_zero_point(rmin, rmax)

    print(f"原始范围: [{rmin:.2f}, {rmax:.2f}]")
    print(f"缩放因子: {scale:.6f}")
    print(f"零点: {zp}")

    # Quantize, then reconstruct.
    q_values = quantizer.quantize(values, scale, zp)
    print(f"量化结果: {q_values}")

    restored = quantizer.dequantize(q_values, scale, zp)
    print(f"反量化结果: {restored}")

    # Per-element absolute reconstruction error.
    abs_err = np.abs(values - restored)
    print(f"量化误差: {abs_err}")
    print(f"最大误差: {abs_err.max():.6f}")

二、ONNX量化流程

2.1 动态量化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
"""
ONNX动态量化
最简单的量化方式
"""

import onnx
from onnxruntime.quantization import quantize_dynamic, QuantType

def dynamic_quantization(model_path: str, output_path: str):
    """Apply ONNX dynamic quantization (weights -> INT8, activations stay FP32).

    Pros:
        - no calibration data required
        - trivial to run
        - small accuracy loss

    Cons:
        - limited speedup (~2x)
        - activations remain FP32

    Args:
        model_path: Source FP32 ONNX model.
        output_path: Destination for the quantized model.
    """
    quantize_dynamic(
        model_input=model_path,
        model_output=output_path,
        weight_type=QuantType.QInt8,                     # weights as signed INT8
        op_types_to_quantize=['MatMul', 'Add', 'Conv'],  # ops eligible for quantization
        per_channel=False,                               # one scale per tensor
        reduce_range=False,                              # use the full INT8 range
    )

    print(f"动态量化完成: {output_path}")

    # Report the serialized model-size change (proto ByteSize, in MiB).
    size_before = onnx.load(model_path).ByteSize() / 1024 / 1024
    size_after = onnx.load(output_path).ByteSize() / 1024 / 1024

    print(f"原始模型: {size_before:.2f} MB")
    print(f"量化模型: {size_after:.2f} MB")
    print(f"压缩比: {size_before / size_after:.2f}x")


# Demo entry point.
if __name__ == "__main__":
    # Requires a real model file; the call is left commented out on purpose.
    # dynamic_quantization("model.onnx", "model_dynamic.onnx")
    print("动态量化示例代码")

2.2 静态量化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
"""
ONNX静态量化
需要校准数据
"""

import numpy as np
from onnxruntime.quantization import quantize_static, QuantType, CalibrationDataReader

class ImageCalibrationDataReader(CalibrationDataReader):
    """Feeds calibration images to ONNX Runtime's static quantizer.

    Static quantization needs representative inputs so the quantizer can
    observe activation ranges; this reader hands them out one at a time.
    """

    def __init__(self, calibration_images: np.ndarray, input_name: str = "input"):
        """Initialize the reader.

        Args:
            calibration_images: Calibration images, presumably shaped
                (N, C, H, W) — confirm against the model's input layout.
            input_name: Name of the model's input node.
        """
        self.calibration_images = calibration_images
        self.input_name = input_name
        self.index = 0

    def get_next(self) -> dict:
        """Return the next {input_name: image} feed, or None when exhausted."""
        if self.index < len(self.calibration_images):
            sample = self.calibration_images[self.index]
            self.index += 1
            return {self.input_name: sample}
        return None

    def rewind(self):
        """Restart iteration from the first sample."""
        self.index = 0


def static_quantization(model_path: str,
                        output_path: str,
                        calibration_data: np.ndarray,
                        input_name: str = "input"):
    """Apply ONNX static quantization (weights AND activations -> INT8).

    Pros:
        - high speedup (~4x)
        - both weights and activations in INT8

    Cons:
        - needs calibration data (100-1000 samples)
        - slightly larger accuracy loss

    Args:
        model_path: Source FP32 ONNX model.
        output_path: Destination for the quantized model.
        calibration_data: Representative input batch for calibration.
        input_name: Name of the model's input node.
    """
    # BUGFIX: QuantFormat is used below but was never imported at module
    # level (only quantize_static/QuantType/CalibrationDataReader were),
    # so the original raised NameError at call time. Import it locally so
    # the function is self-contained.
    from onnxruntime.quantization import QuantFormat

    # Reader that yields calibration samples one by one.
    calibration_reader = ImageCalibrationDataReader(calibration_data, input_name)

    quantize_static(
        model_input=model_path,
        model_output=output_path,
        calibration_data_reader=calibration_reader,
        quant_format=QuantFormat.QDQ,        # insert Quantize/Dequantize node pairs
        per_channel=False,
        weight_type=QuantType.QInt8,         # signed INT8 weights
        activation_type=QuantType.QUInt8,    # unsigned INT8 activations
    )

    print(f"静态量化完成: {output_path}")


# Demo entry point.
if __name__ == "__main__":
    # Synthetic calibration batch: 100 ImageNet-sized random images.
    demo_calibration = np.random.randn(100, 3, 224, 224).astype(np.float32)

    # Requires a real model file; the call is left commented out on purpose.
    # static_quantization("model.onnx", "model_static.onnx", demo_calibration)
    print("静态量化示例代码")

2.3 完整量化流程

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
"""
完整的ONNX量化部署流程
"""

import onnx
import onnxruntime as ort
import numpy as np
import time
from typing import Tuple, Dict
from pathlib import Path

class ONNXQuantizationPipeline:
    """
    End-to-end ONNX quantization workflow.

    Steps:
    1. Benchmark the original model.
    2. Dynamic quantization + benchmark.
    3. Static quantization + benchmark (only when calibration data is given).
    4. Print a comparison report.
    """

    def __init__(self,
                 model_path: str,
                 calibration_data: np.ndarray = None,
                 input_name: str = "input"):
        """
        Args:
            model_path: Path to the FP32 .onnx model.
            calibration_data: Optional calibration inputs; when None,
                static quantization is skipped.
            input_name: Name of the model's input node.
        """
        self.model_path = Path(model_path)
        self.calibration_data = calibration_data
        self.input_name = input_name

        # Derived output paths: <stem>_dynamic.onnx / <stem>_static.onnx.
        self.dynamic_model_path = self.model_path.with_name(
            self.model_path.stem + "_dynamic.onnx"
        )
        self.static_model_path = self.model_path.with_name(
            self.model_path.stem + "_static.onnx"
        )

        # Benchmark results keyed by variant name ('original'/'dynamic'/'static').
        self.results = {}

    def run(self) -> Dict:
        """Execute the full pipeline and return the results dict."""
        print("=== ONNX量化部署流程 ===\n")

        # 1. Baseline benchmark of the FP32 model.
        print("[1/5] 评估原始模型")
        self.results['original'] = self._evaluate_model(self.model_path)

        # 2. Dynamic quantization.
        print("\n[2/5] 动态量化")
        self._dynamic_quantization()
        self.results['dynamic'] = self._evaluate_model(self.dynamic_model_path)

        # 3. Static quantization (optional — needs calibration data).
        if self.calibration_data is not None:
            print("\n[3/5] 静态量化")
            self._static_quantization()
            self.results['static'] = self._evaluate_model(self.static_model_path)

        # 4. Report.
        print("\n[4/5] 性能报告")
        self._print_report()

        return self.results

    def _evaluate_model(self, model_path: Path, num_runs: int = 100) -> Dict:
        """Benchmark one model: on-disk size plus latency over num_runs runs."""
        session = ort.InferenceSession(str(model_path))

        input_info = session.get_inputs()[0]
        input_name = input_info.name
        # BUGFIX: ONNX models frequently declare symbolic dims ('batch',
        # None); np.random.randn(*shape) would raise on those. Substitute 1
        # for any dimension that is not a positive int.
        dims = [d if isinstance(d, int) and d > 0 else 1
                for d in input_info.shape]

        test_input = np.random.randn(*dims).astype(np.float32)

        # Warm-up so first-run graph/arena initialization doesn't skew timings.
        for _ in range(10):
            session.run(None, {input_name: test_input})

        # Timed runs. perf_counter is monotonic and higher-resolution than
        # time.time (and consistent with the ARM benchmark code in this repo).
        latencies = []
        for _ in range(num_runs):
            start = time.perf_counter()
            session.run(None, {input_name: test_input})
            latencies.append((time.perf_counter() - start) * 1000)

        # On-disk model size in MiB.
        model_size = model_path.stat().st_size / 1024 / 1024

        return {
            'model_path': str(model_path),
            'model_size_mb': model_size,
            'avg_latency_ms': np.mean(latencies),
            'min_latency_ms': np.min(latencies),
            'max_latency_ms': np.max(latencies),
            'p95_latency_ms': np.percentile(latencies, 95),
        }

    def _dynamic_quantization(self):
        """Dynamic quantization: weights only -> INT8."""
        from onnxruntime.quantization import quantize_dynamic, QuantType

        quantize_dynamic(
            model_input=str(self.model_path),
            model_output=str(self.dynamic_model_path),
            weight_type=QuantType.QInt8,
        )

    def _static_quantization(self):
        """Static quantization: weights + activations -> INT8."""
        from onnxruntime.quantization import quantize_static, QuantFormat, QuantType

        # NOTE(review): ImageCalibrationDataReader is defined in a separate
        # snippet/module, not here — confirm it is importable in this
        # module's scope before deploying.
        calibration_reader = ImageCalibrationDataReader(
            self.calibration_data, self.input_name
        )

        quantize_static(
            model_input=str(self.model_path),
            model_output=str(self.static_model_path),
            calibration_data_reader=calibration_reader,
            quant_format=QuantFormat.QDQ,
            weight_type=QuantType.QInt8,
        )

    def _print_report(self):
        """Print a size/latency/speedup comparison table for all variants."""
        print("\n" + "=" * 60)
        print("模型性能对比报告")
        print("=" * 60)

        print(f"\n{'模型':<15} {'大小(MB)':<12} {'延迟(ms)':<12} {'加速比':<10}")
        print("-" * 50)

        # Speedup is relative to the original FP32 latency.
        original_latency = self.results['original']['avg_latency_ms']

        for name, result in self.results.items():
            speedup = original_latency / result['avg_latency_ms']
            print(f"{name:<15} {result['model_size_mb']:<12.2f} "
                  f"{result['avg_latency_ms']:<12.2f} {speedup:<10.2f}x")

        print("=" * 60)


# Demo entry point.
if __name__ == "__main__":
    # Synthetic calibration batch: 100 ImageNet-sized random images.
    demo_calibration = np.random.randn(100, 3, 224, 224).astype(np.float32)

    # Needs a real model file to actually run:
    # pipeline = ONNXQuantizationPipeline(
    #     model_path="model.onnx",
    #     calibration_data=demo_calibration
    # )
    # results = pipeline.run()

    print("ONNX量化部署流程示例")

三、ARM部署优化

3.1 ARM特定优化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
"""
ARM平台ONNX Runtime优化
"""

import onnxruntime as ort

class ARMOnnxRuntime:
    """
    ONNX Runtime configuration helpers for ARM platforms.

    Tuning knobs covered:
    1. NEON-capable CPU execution provider
    2. Thread-count configuration
    3. Memory-arena settings
    """

    @staticmethod
    def create_optimized_session(model_path: str,
                                 num_threads: int = 4,
                                 use_arena: bool = True) -> ort.InferenceSession:
        """Build an InferenceSession tuned for an ARM CPU.

        Args:
            model_path: Path to the .onnx model.
            num_threads: Intra-op threads (e.g. 4 for a quad-core Cortex-A72).
            use_arena: Whether to enable the memory-arena allocators.

        Returns:
            The configured InferenceSession.
        """
        opts = ort.SessionOptions()

        # Parallelize inside individual ops; run ops sequentially.
        opts.intra_op_num_threads = num_threads
        opts.inter_op_num_threads = 1

        # Enable the full set of graph rewrites.
        opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

        # Arena allocators trade memory for fewer allocations.
        opts.enable_mem_arena = use_arena
        opts.enable_cpu_mem_arena = use_arena

        # CPU provider (picks up NEON kernels on ARM builds of ORT).
        return ort.InferenceSession(
            model_path,
            opts,
            providers=['CPUExecutionProvider'],
        )

    @staticmethod
    def benchmark_session(session: ort.InferenceSession,
                          input_name: str,
                          input_shape: tuple,
                          num_runs: int = 1000) -> dict:
        """Measure inference-latency statistics for a session.

        Args:
            session: Inference session to benchmark.
            input_name: Name of the model input to feed.
            input_shape: Shape of the random float32 test tensor.
            num_runs: Number of timed runs.

        Returns:
            Latency statistics in milliseconds plus throughput in FPS.
        """
        import numpy as np
        import time

        # Single random feed reused for every run.
        sample = np.random.randn(*input_shape).astype(np.float32)
        feed = {input_name: sample}

        # Warm-up to amortize lazy initialization.
        for _ in range(50):
            session.run(None, feed)

        # Timed runs with the monotonic high-resolution clock.
        timings = []
        for _ in range(num_runs):
            t0 = time.perf_counter()
            session.run(None, feed)
            timings.append((time.perf_counter() - t0) * 1000)

        return {
            'mean_ms': np.mean(timings),
            'std_ms': np.std(timings),
            'min_ms': np.min(timings),
            'max_ms': np.max(timings),
            'p50_ms': np.percentile(timings, 50),
            'p95_ms': np.percentile(timings, 95),
            'p99_ms': np.percentile(timings, 99),
            'throughput_fps': 1000 / np.mean(timings),
        }


# Demo entry point.
if __name__ == "__main__":
    # Needs a real quantized model to run:
    # session = ARMOnnxRuntime.create_optimized_session(
    #     "model_quantized.onnx", num_threads=4
    # )
    # stats = ARMOnnxRuntime.benchmark_session(
    #     session, "input", (1, 3, 224, 224), num_runs=1000
    # )

    print("ARM ONNX Runtime优化示例")

四、IMS部署案例

4.1 DMS模型量化

模型 FP32延迟 INT8延迟 加速比
疲劳检测 45ms 12ms 3.75x
分心检测 38ms 10ms 3.8x
视线追踪 52ms 14ms 3.7x

4.2 部署脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
"""
IMS DMS模型量化部署脚本
"""

import subprocess
from pathlib import Path

def quantize_dms_models(model_dir: str, calibration_dir: str):
    """Quantize the DMS models (dynamic + static) in child processes.

    Args:
        model_dir: Directory containing the source .onnx models.
        calibration_dir: Directory with per-model calibration .npy files
            named "<model_stem>_calib.npy".
    """
    import sys  # stdlib; used to invoke the same interpreter as the parent

    model_dir = Path(model_dir)

    # DMS model inventory.
    models = [
        'fatigue_net.onnx',
        'distraction_net.onnx',
        'gaze_net.onnx',
    ]

    for model_name in models:
        model_path = model_dir / model_name
        if not model_path.exists():
            print(f"跳过: {model_name} (不存在)")
            continue

        print(f"\n处理: {model_name}")

        # Dynamic quantization.
        # BUGFIX: onnxruntime ships no runnable
        # `onnxruntime.quantization.quantize_dynamic` module, so the original
        # `python -m ...` invocation always failed. Call the Python API via
        # `python -c` instead (mirroring the static path below), and use
        # sys.executable so the child runs in the same environment.
        # NOTE(review): paths are interpolated into the -c source; they must
        # not contain quote characters.
        output_dynamic = model_path.with_name(model_path.stem + "_dynamic.onnx")
        subprocess.run([
            sys.executable, '-c', f'''
from onnxruntime.quantization import quantize_dynamic, QuantType

quantize_dynamic(
    "{model_path}",
    "{output_dynamic}",
    weight_type=QuantType.QInt8
)
'''
        ], check=True)

        # Static quantization (needs the matching calibration array).
        output_static = model_path.with_name(model_path.stem + "_static.onnx")
        subprocess.run([
            sys.executable, '-c', f'''
from onnxruntime.quantization import quantize_static, QuantFormat, QuantType
from calibration_reader import ImageCalibrationDataReader
import numpy as np

calibration_data = np.load("{calibration_dir}/{model_path.stem}_calib.npy")
reader = ImageCalibrationDataReader(calibration_data)

quantize_static(
    "{model_path}",
    "{output_static}",
    reader,
    quant_format=QuantFormat.QDQ,
    weight_type=QuantType.QInt8
)
'''
        ], check=True)

        print(f" ✓ 动态量化: {output_dynamic}")
        print(f" ✓ 静态量化: {output_static}")


if __name__ == "__main__":
    quantize_dms_models("models/", "calibration_data/")

五、总结

5.1 量化选择建议

场景 推荐方案 理由
快速验证 动态量化 简单快速
生产部署 静态量化 性能最优
精度敏感 QAT 精度损失最小

5.2 性能提升

  • 模型大小:减少75%
  • 推理延迟:降低75%
  • 内存占用:减少75%
  • 吞吐量:提升4x

参考链接:


ONNX Runtime边缘部署:INT8量化实现4倍加速
https://dapalm.com/2026/04/24/2026-04-24-onnx-runtime-int8-quantization/
作者
Mars
发布于
2026年4月24日
许可协议