neon是一种SIMD(单指令多数据)指令集,其效率相当于汇编,用于arm cpu平台的优化,在音视频、图形图像处理领域性能提升较大。arm架构的CPU从armv7a开始已经支持neon(可选项),从而实现并行计算功能。

本文记录一下在android上使用neon加速的方法。

首先不必多说,先创建支持native C++的android工程

然后在gradle中添加对neon的支持:

// Native build settings that Gradle hands to CMake.
externalNativeBuild {
    cmake {
        // Extra flags for the C++ compiler: build the sources as C++14.
        cppFlags "-std=c++14"
        // Argument passed to CMake; defines ANDROID_ARM_NEON=TRUE to turn on NEON support.
        arguments "-DANDROID_ARM_NEON=TRUE"
    }
}

还要在cmake中添加对neon的支持,即加上编译选项 “-mfpu=neon”

最后在cpp中 #include <arm_neon.h>

这样,项目就可以支持neon加速了。

为了比较性能,现在用neon和纯C方法比较一下将彩色图片转成灰度的时间

//纯C函数

/**
 * Converts a 32-bit-per-pixel bitmap to grayscale in place (scalar reference
 * implementation) and logs the elapsed time.
 *
 * Every pixel is read as one packed 32-bit word laid out as 0xAARRGGBB:
 * alpha in bits 24..31, red in 16..23, green in 8..15, blue in 0..7.
 * The gray level is the fixed-point approximation
 *     Gray = (R*38 + G*75 + B*15) >> 7
 * and is written back to all three colour channels; alpha is left untouched.
 *
 * NOTE(review): Android's RGBA_8888 format stores the bytes R,G,B,A in memory,
 * which on a little-endian CPU reads as 0xAABBGGRR. If the buffer comes
 * straight from AndroidBitmap_lockPixels the red/blue weights are swapped
 * here - confirm against the caller before changing the channel order.
 *
 * @param info   Bitmap description; width, height and stride are used.
 * @param pixels Start of the pixel buffer. A null pointer makes the call a
 *               no-op (nothing is converted or logged).
 */
void method_argb2gray_c(AndroidBitmapInfo info, void *pixels) {
    if (pixels == nullptr) {
        return;
    }

    cv::TickMeter tm1;
    tm1.start();

    const uint32_t width = info.width;
    const uint32_t height = info.height;
    // Rows may be padded, so advance by the stride (bytes per row) rather than
    // by width. Fall back to a tightly packed row when the caller left stride 0.
    const size_t row_bytes = info.stride != 0
                                 ? static_cast<size_t>(info.stride)
                                 : static_cast<size_t>(width) * sizeof(uint32_t);

    unsigned char *row = static_cast<unsigned char *>(pixels);
    for (uint32_t y = 0; y < height; ++y, row += row_bytes) {
        uint32_t *pixel = reinterpret_cast<uint32_t *>(row);
        for (uint32_t x = 0; x < width; ++x, ++pixel) {
            const uint32_t argb = *pixel;
            // All arithmetic stays unsigned so that putting alpha back into
            // the top byte never shifts into a sign bit.
            const uint32_t a = argb & 0xFF000000u;
            const uint32_t r = (argb >> 16) & 0xFFu;
            const uint32_t g = (argb >> 8) & 0xFFu;
            const uint32_t b = argb & 0xFFu;

            // The weights add up to 128, so after >> 7 the result fits in 8 bits.
            const uint32_t gray = (r * 38u + g * 75u + b * 15u) >> 7;

            *pixel = a | (gray << 16) | (gray << 8) | gray;
        }
    }

    tm1.stop();
    LOGI("method_argb2gray_c time: %lf", tm1.getTimeMilli());
}

//neon函数

/**
 * Converts a 32-bit-per-pixel bitmap to grayscale in place using NEON
 * intrinsics and logs the elapsed time.
 *
 * Produces the same result as the scalar version, which reads each pixel as a
 * packed little-endian 0xAARRGGBB word, i.e. memory bytes B,G,R,A:
 *     Gray = (R*38 + G*75 + B*15) >> 7
 * Gray is written to the three colour bytes and the alpha byte is preserved.
 *
 * Changes compared with the previous NEON code:
 *  - the weights are applied to bytes 2/1/0 (R/G/B) instead of bytes 1/2/3,
 *    which mixed the alpha byte into the gray level and ignored byte 0;
 *  - alpha is kept instead of being forced to 255;
 *  - the pixels left over after the last full group of 8 in a row are handled
 *    by a scalar loop instead of being skipped;
 *  - rows are stepped by info.stride, so padded rows are addressed correctly.
 *
 * NOTE(review): Android's RGBA_8888 format stores bytes R,G,B,A; if the buffer
 * comes straight from AndroidBitmap_lockPixels the red/blue weights are
 * swapped (same caveat as the scalar version) - confirm against the caller.
 *
 * @param info   Bitmap description; width, height and stride are used.
 * @param pixels Start of the pixel buffer. A null pointer makes the call a
 *               no-op (nothing is converted or logged).
 */
void method_argb2gray_neon(AndroidBitmapInfo info, void *pixels) {
    if (pixels == nullptr) {
        return;
    }

    TickMeter tm3;
    tm3.start();

    // Per-lane weights of Gray = (R*38 + G*75 + B*15) >> 7.
    const uint8x8_t r = vdup_n_u8(38);
    const uint8x8_t g = vdup_n_u8(75);
    const uint8x8_t b = vdup_n_u8(15);

    const uint32_t width = info.width;
    const uint32_t height = info.height;
    // Number of pixels per row that can be processed 8 at a time.
    const uint32_t simd_width = width - (width % 8);
    // Rows may be padded, so advance by the stride (bytes per row) rather than
    // by width. Fall back to a tightly packed row when the caller left stride 0.
    const size_t row_bytes = info.stride != 0
                                 ? static_cast<size_t>(info.stride)
                                 : static_cast<size_t>(width) * 4;

    unsigned char *row = static_cast<unsigned char *>(pixels);
    for (uint32_t y = 0; y < height; ++y, row += row_bytes) {
        unsigned char *src = row;
        unsigned short *dst = reinterpret_cast<unsigned short *>(row);

        uint32_t x = 0;
        for (; x < simd_width; x += 8) {
            // De-interleave 8 pixels: val[k] holds memory byte k of each pixel,
            // so val[0]=B, val[1]=G, val[2]=R, val[3]=A.
            const uint8x8x4_t argb = vld4_u8(src);

            // Widening multiply-accumulate; 255 * (38 + 75 + 15) fits in 16 bits.
            uint16x8_t temp = vmull_u8(argb.val[2], r);
            temp = vmlal_u8(temp, argb.val[1], g);
            temp = vmlal_u8(temp, argb.val[0], b);
            const uint8x8_t gray = vshrn_n_u16(temp, 7);
            src += 8 * 4;

            // Rebuild each pixel as two 16-bit halves:
            //   low   = bytes 0,1 -> gray, gray
            //   hight = bytes 2,3 -> gray, original alpha
            const uint16x8_t gray16 = vmovl_u8(gray);
            const uint16x8_t low = vorrq_u16(vshlq_n_u16(gray16, 8), gray16);
            const uint16x8_t hight =
                vorrq_u16(vshlq_n_u16(vmovl_u8(argb.val[3]), 8), gray16);

            // Interleave low/hight so every pixel is stored as low, hight.
            const uint16x8x2_t res = vzipq_u16(low, hight);
            vst1q_u16(dst, res.val[0]);
            dst += 8;
            vst1q_u16(dst, res.val[1]);
            dst += 8;
        }

        // Scalar tail for the width % 8 pixels the vector loop cannot cover.
        for (; x < width; ++x, src += 4) {
            const uint32_t gray = (src[2] * 38u + src[1] * 75u + src[0] * 15u) >> 7;
            src[0] = static_cast<unsigned char>(gray);
            src[1] = static_cast<unsigned char>(gray);
            src[2] = static_cast<unsigned char>(gray);
            // src[3] (alpha) is intentionally left as is.
        }
    }

    tm3.stop();
    LOGI("method_argb2gray_neon time: %lf", tm3.getTimeMilli());
}

实测速度比较如下

Logo

为开发者提供学习成长、分享交流、生态实践、资源工具等服务,帮助开发者快速成长。

更多推荐