| """ 完整的ONNX量化部署流程 """
import time
from pathlib import Path
from typing import Dict, Tuple

import numpy as np
import onnx
import onnxruntime as ort
class ONNXQuantizationPipeline:
    """ONNX quantization deployment pipeline.

    Stages:
        1. Benchmark the original FP32 model.
        2. Dynamic INT8 quantization + benchmark.
        3. Static (QDQ) INT8 quantization + benchmark (only when
           calibration data was supplied).
        4. Print a comparison report.
    """

    def __init__(self, model_path: str,
                 calibration_data: np.ndarray = None,
                 input_name: str = "input"):
        """
        Args:
            model_path: path to the original ``.onnx`` model file.
            calibration_data: optional array of calibration samples
                (one sample per leading-axis entry) for static
                quantization; static quantization is skipped when None.
            input_name: name of the model input tensor fed by the
                calibration data reader.
        """
        self.model_path = Path(model_path)
        self.calibration_data = calibration_data
        self.input_name = input_name
        # Derived output paths: <stem>_dynamic.onnx / <stem>_static.onnx
        # alongside the original model.
        self.dynamic_model_path = self.model_path.with_name(
            self.model_path.stem + "_dynamic.onnx"
        )
        self.static_model_path = self.model_path.with_name(
            self.model_path.stem + "_static.onnx"
        )
        # Benchmark results keyed by 'original' / 'dynamic' / 'static'.
        self.results = {}

    def run(self) -> Dict:
        """Execute the full pipeline and return the benchmark results."""
        print("=== ONNX量化部署流程 ===\n")

        print("[1/4] 评估原始模型")
        self.results['original'] = self._evaluate_model(self.model_path)

        print("\n[2/4] 动态量化")
        self._dynamic_quantization()
        self.results['dynamic'] = self._evaluate_model(self.dynamic_model_path)

        if self.calibration_data is not None:
            print("\n[3/4] 静态量化")
            self._static_quantization()
            self.results['static'] = self._evaluate_model(self.static_model_path)

        print("\n[4/4] 性能报告")
        self._print_report()
        return self.results

    def _evaluate_model(self, model_path: Path, num_runs: int = 100) -> Dict:
        """Benchmark one model: file size plus latency statistics.

        Args:
            model_path: path of the ONNX model to benchmark.
            num_runs: number of timed inference runs.

        Returns:
            Dict with model path/size and avg/min/max/p95 latency (ms).
        """
        session = ort.InferenceSession(str(model_path))
        input_info = session.get_inputs()[0]
        # Dynamic axes are reported as None or strings ('batch', ...);
        # substitute 1 so a concrete random tensor can be built.
        input_shape = [d if isinstance(d, int) else 1
                       for d in input_info.shape]
        input_name = input_info.name
        test_input = np.random.randn(*input_shape).astype(np.float32)

        # Warm-up runs so lazy initialization doesn't skew the timings.
        for _ in range(10):
            session.run(None, {input_name: test_input})

        # perf_counter is monotonic and high-resolution — the right
        # clock for latency measurement (time.time can step backwards).
        latencies = []
        for _ in range(num_runs):
            start = time.perf_counter()
            session.run(None, {input_name: test_input})
            latencies.append((time.perf_counter() - start) * 1000)

        model_size = model_path.stat().st_size / 1024 / 1024
        return {
            'model_path': str(model_path),
            'model_size_mb': model_size,
            'avg_latency_ms': np.mean(latencies),
            'min_latency_ms': np.min(latencies),
            'max_latency_ms': np.max(latencies),
            'p95_latency_ms': np.percentile(latencies, 95),
        }

    def _dynamic_quantization(self):
        """Dynamic INT8 quantization (weights only, no calibration)."""
        from onnxruntime.quantization import quantize_dynamic, QuantType
        quantize_dynamic(
            model_input=str(self.model_path),
            model_output=str(self.dynamic_model_path),
            weight_type=QuantType.QInt8,
        )

    def _static_quantization(self):
        """Static (QDQ) INT8 quantization driven by calibration data."""
        from onnxruntime.quantization import (
            CalibrationDataReader,
            QuantFormat,
            QuantType,
            quantize_static,
        )

        # The original code referenced an undefined
        # ImageCalibrationDataReader (NameError); define the reader here.
        class _ArrayCalibrationReader(CalibrationDataReader):
            """Yields {input_name: sample} dicts, one sample per call."""

            def __init__(self, data: np.ndarray, input_name: str):
                # Each calibration sample gets a batch dimension of 1.
                self._feeds = iter(
                    {input_name: sample[np.newaxis, ...].astype(np.float32)}
                    for sample in data
                )

            def get_next(self):
                # None signals exhaustion to onnxruntime.
                return next(self._feeds, None)

        quantize_static(
            model_input=str(self.model_path),
            model_output=str(self.static_model_path),
            calibration_data_reader=_ArrayCalibrationReader(
                self.calibration_data, self.input_name
            ),
            quant_format=QuantFormat.QDQ,
            weight_type=QuantType.QInt8,
        )

    def _print_report(self):
        """Print a size/latency/speedup comparison table."""
        print("\n" + "=" * 60)
        print("模型性能对比报告")
        print("=" * 60)
        print(f"\n{'模型':<15} {'大小(MB)':<12} {'延迟(ms)':<12} {'加速比':<10}")
        print("-" * 50)
        original_latency = self.results['original']['avg_latency_ms']
        for name, result in self.results.items():
            speedup = original_latency / result['avg_latency_ms']
            print(f"{name:<15} {result['model_size_mb']:<12.2f} "
                  f"{result['avg_latency_ms']:<12.2f} {speedup:<10.2f}x")
        print("=" * 60)
if __name__ == "__main__": calibration_data = np.random.randn(100, 3, 224, 224).astype(np.float32) print("ONNX量化部署流程示例")
|