1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
"""ARM NEON SIMD optimization."""
import numpy as np
class ARMNeonOptimizer:
    """Convolution helpers aimed at ARM NEON-capable targets.

    The optimization themes this tool is organized around are vectorized
    computation, SIMD parallelism and memory alignment. The methods in this
    class only issue NumPy calls; they contain no hand-written NEON code.
    """

    # Values of ``platform.machine()`` (lower-cased) treated as NEON-capable.
    _NEON_MACHINES = ('armv7l', 'aarch64', 'arm64')

    @staticmethod
    def is_neon_available() -> bool:
        """Return True when the machine string names a known ARM target.

        This is a heuristic on ``platform.machine()`` only; CPU feature
        flags are not probed.
        """
        import platform

        return platform.machine().lower() in ARMNeonOptimizer._NEON_MACHINES

    @staticmethod
    def _output_size(H: int, W: int, kH: int, kW: int) -> tuple:
        """Return ``(out_h, out_w)`` for a stride-1, unpadded convolution.

        Raises:
            ValueError: if the kernel does not fit inside the input, i.e.
                either output dimension would be smaller than 1.
        """
        out_h = H - kH + 1
        out_w = W - kW + 1
        if out_h < 1 or out_w < 1:
            raise ValueError(
                f"kernel ({kH}x{kW}) does not fit inside input ({H}x{W})"
            )
        return out_h, out_w

    @staticmethod
    def optimize_conv2d(input_data: np.ndarray, weights: np.ndarray,
                        bias: np.ndarray = None) -> np.ndarray:
        """Compute a 2D convolution via im2col followed by one matrix product.

        The convolution is stride 1 with no padding (cross-correlation: the
        kernel is not flipped).

        Args:
            input_data: array of shape ``(N, C, H, W)``.
            weights: array of shape ``(F, C, kH, kW)``.
            bias: optional array with ``F`` elements (or a single element),
                added to every output position of the matching filter.

        Returns:
            Array of shape ``(N, F, H - kH + 1, W - kW + 1)``. Floating-point
            inputs keep their precision (float32 in, float32 out); other
            input dtypes are computed in float64.

        Raises:
            ValueError: if either array is not 4-D, the channel counts
                differ, or the kernel is larger than the input.
        """
        input_data = np.asarray(input_data)
        weights = np.asarray(weights)

        # Explicit raises instead of ``assert`` so validation survives ``-O``.
        if input_data.ndim != 4:
            raise ValueError(
                f"input_data must be 4-D (N, C, H, W), got {input_data.ndim}-D"
            )
        if weights.ndim != 4:
            raise ValueError(
                f"weights must be 4-D (F, C, kH, kW), got {weights.ndim}-D"
            )

        N, C, H, W = input_data.shape
        F, weight_channels, kH, kW = weights.shape
        if weight_channels != C:
            raise ValueError(
                f"channel mismatch: input has {C}, weights expect "
                f"{weight_channels}"
            )
        out_h, out_w = ARMNeonOptimizer._output_size(H, W, kH, kW)

        # Rows of ``col`` are receptive fields; one GEMM applies all filters.
        col = ARMNeonOptimizer._im2col(input_data, kH, kW)
        output = col @ weights.reshape(F, C * kH * kW).T

        if bias is not None:
            # Out-of-place add: a wider bias dtype promotes the result
            # instead of being forced through an in-place cast.
            output = output + np.asarray(bias).reshape(1, -1)

        # (N*out_h*out_w, F) -> (N, out_h, out_w, F) -> (N, F, out_h, out_w)
        return output.reshape(N, out_h, out_w, F).transpose(0, 3, 1, 2)

    @staticmethod
    def _im2col(input_data: np.ndarray, kH: int, kW: int) -> np.ndarray:
        """Unfold every ``kH x kW`` window of a 4-D input into a matrix row.

        Args:
            input_data: array of shape ``(N, C, H, W)``.
            kH: window height.
            kW: window width.

        Returns:
            Array of shape ``(N * out_h * out_w, C * kH * kW)``. Floating
            (and complex) inputs keep their dtype; any other dtype is
            stored as float64.

        Raises:
            ValueError: if the window is larger than the input.
        """
        N, C, H, W = input_data.shape
        out_h, out_w = ARMNeonOptimizer._output_size(H, W, kH, kW)

        # Keep the caller's floating precision; a float64 buffer here would
        # silently upcast float32 data and double the memory footprint.
        if np.issubdtype(input_data.dtype, np.inexact):
            col_dtype = input_data.dtype
        else:
            col_dtype = np.float64

        # Every element is written by the loops below, so no zero-fill needed.
        col = np.empty((N, C, kH, kW, out_h, out_w), dtype=col_dtype)
        for y in range(kH):
            for x in range(kW):
                # Kernel offset (y, x) sees the input shifted by that offset.
                col[:, :, y, x, :, :] = input_data[:, :, y:y + out_h, x:x + out_w]

        # Explicit column count (instead of -1) also works for an empty batch.
        return col.transpose(0, 4, 5, 1, 2, 3).reshape(
            N * out_h * out_w, C * kH * kW
        )
if __name__ == "__main__":
    # Smoke test: report the platform check, then push one random float32
    # batch through the convolution and print the resulting shape.
    print(f"NEON可用: {ARMNeonOptimizer.is_neon_available()}")

    batch, channels, height, width = 1, 3, 224, 224
    num_filters, kernel_size = 64, 3

    # Draw order (input, weights, bias) is kept so the global NumPy RNG is
    # consumed in the same sequence as before.
    sample = np.random.randn(batch, channels, height, width).astype(np.float32)
    kernels = np.random.randn(
        num_filters, channels, kernel_size, kernel_size
    ).astype(np.float32)
    offsets = np.random.randn(num_filters).astype(np.float32)

    output = ARMNeonOptimizer.optimize_conv2d(sample, kernels, offsets)
    print(f"卷积输出: {output.shape}")
|