"""TensorRT deployment optimization."""
import os

import numpy as np
import pycuda.driver as cuda
import tensorrt as trt
class TRTInference:
    """TensorRT inference engine.

    Builds a TensorRT engine from an ONNX model (or loads a cached,
    serialized engine from disk), allocates host/device buffers once, and
    runs synchronous inference on the GPU via pycuda.
    """

    def __init__(self, onnx_path: str, engine_path: str = None):
        """
        Args:
            onnx_path: Path to the ONNX model.
            engine_path: Optional path for caching the serialized TensorRT
                engine. If the file exists it is loaded; otherwise a new
                engine is built and saved there.
        """
        self.logger = trt.Logger(trt.Logger.WARNING)
        if engine_path and os.path.exists(engine_path):
            self.engine = self.load_engine(engine_path)
        else:
            self.engine = self.build_engine(onnx_path)
            if engine_path:
                self.save_engine(engine_path)
        # build/load can fail and return None; fail loudly here rather than
        # with an opaque AttributeError on create_execution_context().
        if self.engine is None:
            raise RuntimeError("Failed to build or load TensorRT engine")
        self.context = self.engine.create_execution_context()
        self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers()

    def load_engine(self, engine_path: str):
        """Deserialize a previously cached engine from disk."""
        runtime = trt.Runtime(self.logger)
        with open(engine_path, 'rb') as f:
            return runtime.deserialize_cuda_engine(f.read())

    def save_engine(self, engine_path: str):
        """Serialize the current engine to disk for later reuse."""
        with open(engine_path, 'wb') as f:
            f.write(self.engine.serialize())

    def build_engine(self, onnx_path: str):
        """Build a TensorRT engine from an ONNX model.

        Enables INT8 quantization (with a MinMax calibrator) plus FP16
        fallback; TensorRT performs layer fusion and kernel auto-tuning
        as part of the build.

        Raises:
            RuntimeError: if the ONNX model fails to parse or the engine
                build fails.
        """
        builder = trt.Builder(self.logger)
        network = builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        parser = trt.OnnxParser(network, self.logger)
        with open(onnx_path, 'rb') as f:
            # Surface parser errors instead of silently building from a
            # partially-parsed network.
            if not parser.parse(f.read()):
                errors = "\n".join(
                    str(parser.get_error(i)) for i in range(parser.num_errors))
                raise RuntimeError(f"Failed to parse ONNX model:\n{errors}")
        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 30  # 1 GiB of build scratch space
        # INT8 with FP16 fallback: layers without INT8 kernels run in FP16.
        config.set_flag(trt.BuilderFlag.INT8)
        config.int8_calibrator = self.get_calibrator()
        config.set_flag(trt.BuilderFlag.FP16)
        engine = builder.build_engine(network, config)
        if engine is None:
            raise RuntimeError("TensorRT engine build failed")
        return engine

    def get_calibrator(self):
        """Return an INT8 MinMax calibrator fed by load_calibration_data()."""

        class DMSInt8Calibrator(trt.IInt8MinMaxCalibrator):
            def __init__(self, calibration_data):
                super().__init__()
                self.data = calibration_data
                self.index = 0

            def get_batch_size(self):
                return 1

            def get_batch(self, names):
                # Returning None tells TensorRT calibration data is exhausted.
                if self.index >= len(self.data):
                    return None
                batch = self.data[self.index]
                self.index += 1
                return [batch]

            def read_calibration_cache(self):
                return None

            def write_calibration_cache(self, cache):
                pass

        calibration_data = self.load_calibration_data()
        return DMSInt8Calibrator(calibration_data)

    def load_calibration_data(self):
        """Load the batches used for INT8 calibration.

        NOTE(review): this hook was referenced but never defined in the
        original code (it crashed with AttributeError). Override it, or
        replace this stub with a real calibration-data loader.
        """
        raise NotImplementedError(
            "Provide INT8 calibration data by overriding load_calibration_data()")

    def allocate_buffers(self):
        """Allocate page-locked host and device memory for every binding.

        Returns:
            (inputs, outputs, bindings, stream) where inputs/outputs are
            lists of {'host': pagelocked ndarray, 'device': DeviceAllocation}
            dicts, bindings is the list of device pointers in binding order,
            and stream is the CUDA stream used for async transfers.
        """
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        for binding in self.engine:
            size = (trt.volume(self.engine.get_binding_shape(binding))
                    * self.engine.max_batch_size)
            dtype = trt.nptype(self.engine.get_binding_dtype(binding))
            # Page-locked host memory enables true async H2D/D2H copies.
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(device_mem))
            if self.engine.binding_is_input(binding):
                inputs.append({'host': host_mem, 'device': device_mem})
            else:
                outputs.append({'host': host_mem, 'device': device_mem})
        return inputs, outputs, bindings, stream

    def infer(self, input_data: np.ndarray):
        """Run synchronous inference on one input.

        Args:
            input_data: Input array; flattened into the first input binding
                (must match the engine's input size).

        Returns:
            The first output binding's host buffer (flat ndarray).
        """
        np.copyto(self.inputs[0]['host'], input_data.ravel())
        cuda.memcpy_htod_async(
            self.inputs[0]['device'], self.inputs[0]['host'], self.stream)
        self.context.execute_async_v2(
            bindings=self.bindings, stream_handle=self.stream.handle)
        cuda.memcpy_dtoh_async(
            self.outputs[0]['host'], self.outputs[0]['device'], self.stream)
        self.stream.synchronize()
        return self.outputs[0]['host']
class QualcommNPUInference:
    """Qualcomm NPU inference (QCS8255/8295) via the ONNX Runtime QNN EP."""

    # ImageNet normalization constants. Kept as float32 arrays: Python-list
    # constants would promote the normalized tensor to float64, which the
    # ONNX Runtime session rejects (models expect float32 input).
    _MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)
    _STD = np.array([0.229, 0.224, 0.225], dtype=np.float32)

    def __init__(self, model_path: str):
        """
        Args:
            model_path: Path to the ONNX/DLC model.
        """
        import onnxruntime as ort
        sess_options = ort.SessionOptions()
        sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        # QNN execution provider targets the Hexagon NPU.
        providers = ['QNNExecutionProvider']
        self.session = ort.InferenceSession(model_path, sess_options, providers=providers)
        self.input_name = self.session.get_inputs()[0].name
        self.output_names = [o.name for o in self.session.get_outputs()]

    def infer(self, input_data: np.ndarray):
        """Run NPU inference on one image.

        Args:
            input_data: Input image, (H, W, C), values in [0, 255].

        Returns:
            The model's first output.
        """
        input_tensor = self.preprocess(input_data)
        outputs = self.session.run(
            self.output_names,
            {self.input_name: input_tensor}
        )
        return outputs[0]

    def preprocess(self, image):
        """Normalize an HWC image into an NCHW float32 tensor.

        Scales to [0, 1], applies ImageNet mean/std normalization, and
        transposes to channel-first with a leading batch dimension.

        Args:
            image: (H, W, 3) array, values in [0, 255].

        Returns:
            (1, 3, H, W) float32 ndarray.
        """
        image = image.astype(np.float32) / 255.0
        # float32 constants keep the result float32 (lists would yield float64).
        image = (image - self._MEAN) / self._STD
        image = np.transpose(image, (2, 0, 1))
        return np.expand_dims(image, 0)