基於OpenCL的影象積分圖演算法改進

複雜的演算法卻未必低效,簡單的演算法往往要付出代價,這個代價可能很大。在opencl環境下程式設計,與我們在CPU上的傳統程式設計思想有一些差異,這些差異看似微不足道,但往往是細節決定成功,就是這些看似微不足道的差異,在多核的GPU上被無限放大,導致同一種演算法在GPU和CPU執行效果有著巨大的差別。
之前寫過一篇文章《基於OpenCL的影象積分圖演算法實現》介紹了opencl中積分圖演算法的基本原理(不瞭解積分圖概念的朋友可以先參考這篇文章),並基於這個基本原理提供了kernel實現程式碼.但經過這兩個月的實踐檢驗,原先這個基於字首和計算加矩陣轉置的演算法被證明在GPU上是非常低效的。
為什麼呢?從根本上來說,之前的演算法不符合平行計算所要求的分治原則,每個kernel一次迴圈處理一整行資料,相著挺簡單,真正執行的時候,並不快。
下圖是原來的演算法在CodeXL GPU performance counters的記錄結果。一次積分圖計算的總執行時間在1.6ms左右
這裡寫圖片描述

注:為了提高效率這裡的kernel程式碼基於前一篇文章的演算法上有改進,將前經和計算和矩陣轉置合併為一個名為prefix_sum_col_and_transpose的kernel,沒有改進前的演算法更慢數倍。

於是我參考了OpenCLIPP的積分圖演算法思路,重寫了自己的程式碼,新的演算法思路是這樣的:
整個演算法分為5個步驟(kernel)來完成。
第一步(integral_block)將整個影象分為4×4的小塊,分別計算區域性積分圖。
這裡寫圖片描述
第二步(intergral_scan_v),縱向掃描計算前一步每個4×4塊最後一組資料的字首和矩陣vert。
這裡寫圖片描述
第三步(intergral_combine_v),結合前面兩步的結果將縱向互不關聯的4×4塊在縱向上連線起來。
這裡寫圖片描述
第四步(intergral_scan_h),橫向掃描計算前一步每個4×4塊最後一組資料的字首和矩陣horiz。
這裡寫圖片描述
第五步(intergral_combine_h),結合前面兩步的結果將橫向互不關聯的4×4塊在橫向上連線起來,就形成了一幅完整的積分圖。
這裡寫圖片描述
這個演算法思路與之前的演算法相比,沒有了耗時的矩陣轉置過程,但分為5步,更復雜了,實際的執行效果呢?出乎我的意料:5個kernel加起來的總時間是0.63ms左右,相比原來的演算法提高了近3倍。

這裡寫圖片描述

下面是完整的kernel程式碼

 ///////////////////////////////////////////////////////////////////////////////
//! @file   : integral_gpu.cl
//! @date   : 2016/05/08
//! @author: guyadong
//! @brief  : Calculates the integral sum scan of an image
////////////////////////////////////////////////////////////////////////////////
#include "common_types.h"
#ifndef CL_DEVICE_LOCAL_MEM_SIZE
#error not defined CL_DEVICE_LOCAL_MEM_SIZE by complier with options -D
#endif
#ifndef SRC_TYPE 
#error not defined SRC_TYPE by complier with options -D
#endif
#ifndef DST_TYPE 
#error not defined DST_TYPE by complier with options -D
#endif
#ifndef INTEG_TYPE 
#error not defined INTEG_TYPE by complier with options -D
#endif
#define V_TYPE 4
#define SHIFT_NUM 2
#define LOCAL_BUFFER_SIZE (CL_DEVICE_LOCAL_MEM_SIZE/sizeof(DST_TYPE))
#define _KERNEL_NAME(s,d,t) prefix_sum_col_and_transpose_##s##_##d##_##t
#define KERNEL_NAME(s,d,t) _KERNEL_NAME(s,d,t)
#define _KERNEL_NAME_INTEGRAL_BLOCK(s,d,t) integral_block_##s##_##d##_##t
#define KERNEL_NAME_INTEGRAL_BLOCK(s,d,t) _KERNEL_NAME_INTEGRAL_BLOCK(s,d,t)
#define _KERNEL_NAME_SCAN_V(s) integral_scan_v_##s
#define KERNEL_NAME_SCAN_V(s) _KERNEL_NAME_SCAN_V(s)
#define _KERNEL_NAME_COMBINE_V(s) integral_combine_v_##s
#define KERNEL_NAME_COMBINE_V(s) _KERNEL_NAME_COMBINE_V(s)
#define _KERNEL_NAME_SCAN_H(s) integral_scan_h_##s
#define KERNEL_NAME_SCAN_H(s) _KERNEL_NAME_SCAN_H(s)
#define _KERNEL_NAME_COMBINE_H(s) integral_combine_h_##s
#define KERNEL_NAME_COMBINE_H(s) _KERNEL_NAME_COMBINE_H(s)
#define _kernel_name_scan_v KERNEL_NAME_SCAN_V(DST_TYPE)
#define _kernel_name_scan_h KERNEL_NAME_SCAN_H(DST_TYPE)
#define _kernel_name_combine_v KERNEL_NAME_COMBINE_V(DST_TYPE)
#define _kernel_name_combine_h KERNEL_NAME_COMBINE_H(DST_TYPE)
#define VECTOR_SRC VECTOR(SRC_TYPE,V_TYPE)
#define VECTOR_DST VECTOR(DST_TYPE,V_TYPE)
#define VLOAD FUN_NAME(vload,V_TYPE)
#if INTEG_TYPE == INTEG_SQUARE
#define compute_src(src) src*src
#define _kernel_name_ KERNEL_NAME(SRC_TYPE,DST_TYPE,integ_square)
#define _kernel_name_integral_block KERNEL_NAME_INTEGRAL_BLOCK(SRC_TYPE,DST_TYPE,integ_square)
#elif INTEG_TYPE == INTEG_COUNT
#define compute_src(src) ((DST_TYPE)0!=src?(DST_TYPE)(1):(DST_TYPE)(0))
#define _kernel_name_ KERNEL_NAME(SRC_TYPE,DST_TYPE,integ_count)
#define _kernel_name_integral_block KERNEL_NAME_INTEGRAL_BLOCK(SRC_TYPE,DST_TYPE,integ_count)
#elif INTEG_TYPE == INTEG_DEFAULT
#define compute_src(src) src
#define _kernel_name_ KERNEL_NAME(SRC_TYPE,DST_TYPE,integ_default)
#define _kernel_name_integral_block KERNEL_NAME_INTEGRAL_BLOCK(SRC_TYPE,DST_TYPE,integ_default)
#else
#error unknow INTEG_TYPE by complier with options -D
#endif
///////////////////////////////////////////////////////////////////////////////
//! @brief  :   Calculates the integral of an image
////////////////////////////////////////////////////////////////////////////////
#define __SWAP(a,b) swap=a,a=b,b=swap;
// 4x4矩陣轉置
inline void transpose( VECTOR_DST m[V_TYPE] ){
DST_TYPE swap;
__SWAP(m[0].s1,m[1].s0);
__SWAP(m[0].s2,m[2].s0);
__SWAP(m[0].s3,m[3].s0);
__SWAP(m[1].s2,m[2].s1);
__SWAP(m[1].s3,m[3].s1);
__SWAP(m[2].s3,m[3].s2);
}
// 計算4x4的區域性積分圖
__kernel void _kernel_name_integral_block( __global SRC_TYPE *sourceImage, __global VECTOR_DST * dest, __constant integ_param* param){ 
int pos_x=get_global_id(0)*V_TYPE,pos_y=get_global_id(1)*V_TYPE;
if(pos_x>=param->width||pos_y>=param->height)return;
int count_x=min(V_TYPE,param->width -pos_x);
int count_y=min(V_TYPE,param->height-pos_y);
VECTOR_DST sum;
VECTOR_DST matrix[V_TYPE];
// 從原矩陣載入資料,並轉為目標矩陣的資料向量型別(VECTOR_DST),
//比如原矩陣是uchar,目標矩陣是float
matrix[0]= 0<count_y ?
count_x==V_TYPE?        VCONVERT(VECTOR_DST,)(VLOAD(0,sourceImage (pos_y 0)*param->src_width_step pos_x))
:(count_x==1?(VECTOR_DST)(VCONVERT(VECTOR_DST,)(VLOAD(0,sourceImage (pos_y 0)*param->src_width_step param->width-V_TYPE)).w,0,0,0)
:(count_x==2?(VECTOR_DST)(VCONVERT(VECTOR_DST,)(VLOAD(0,sourceImage (pos_y 0)*param->src_width_step param->width-V_TYPE)).zw,0,0)
:(VECTOR_DST)(VCONVERT(VECTOR_DST,)(VLOAD(0,sourceImage (pos_y 0)*param->src_width_step param->width-V_TYPE)).yzw,0)
)                       
):0;
matrix[1]= 1<count_y ?
count_x==V_TYPE?        VCONVERT(VECTOR_DST,)(VLOAD(0,sourceImage (pos_y 1)*param->src_width_step pos_x))
:(count_x==1?(VECTOR_DST)(VCONVERT(VECTOR_DST,)(VLOAD(0,sourceImage (pos_y 1)*param->src_width_step param->width-V_TYPE)).w,0,0,0)
:(count_x==2?(VECTOR_DST)(VCONVERT(VECTOR_DST,)(VLOAD(0,sourceImage (pos_y 1)*param->src_width_step param->width-V_TYPE)).zw,0,0)
:(VECTOR_DST)(VCONVERT(VECTOR_DST,)(VLOAD(0,sourceImage (pos_y 1)*param->src_width_step param->width-V_TYPE)).yzw,0)
)                       
):0;
matrix[2]= 2<count_y ?
count_x==V_TYPE?        VCONVERT(VECTOR_DST,)(VLOAD(0,sourceImage (pos_y 2)*param->src_width_step pos_x))
:(count_x==1?(VECTOR_DST)(VCONVERT(VECTOR_DST,)(VLOAD(0,sourceImage (pos_y 2)*param->src_width_step param->width-V_TYPE)).w,0,0,0)
:(count_x==2?(VECTOR_DST)(VCONVERT(VECTOR_DST,)(VLOAD(0,sourceImage (pos_y 2)*param->src_width_step param->width-V_TYPE)).zw,0,0)
:(VECTOR_DST)(VCONVERT(VECTOR_DST,)(VLOAD(0,sourceImage (pos_y 2)*param->src_width_step param->width-V_TYPE)).yzw,0)
)                       
):0;
matrix[3]= 3<count_y ?
count_x==V_TYPE?        VCONVERT(VECTOR_DST,)(VLOAD(0,sourceImage (pos_y 3)*param->src_width_step pos_x))
:(count_x==1?(VECTOR_DST)(VCONVERT(VECTOR_DST,)(VLOAD(0,sourceImage (pos_y 3)*param->src_width_step param->width-V_TYPE)).w,0,0,0)
:(count_x==2?(VECTOR_DST)(VCONVERT(VECTOR_DST,)(VLOAD(0,sourceImage (pos_y 3)*param->src_width_step param->width-V_TYPE)).zw,0,0)
:(VECTOR_DST)(VCONVERT(VECTOR_DST,)(VLOAD(0,sourceImage (pos_y 3)*param->src_width_step param->width-V_TYPE)).yzw,0)
)                       
):0;
sum=0;
//4x4矩陣縱向字首和計算
sum =compute_src(matrix[0]),matrix[0]=sum;
sum =compute_src(matrix[1]),matrix[1]=sum;
sum =compute_src(matrix[2]),matrix[2]=sum;
sum =compute_src(matrix[3]),matrix[3]=sum;
// 轉置矩陣
transpose(matrix);
sum=0;
//4x4矩陣橫向字首和計算 
sum =matrix[0],matrix[0]=sum;
sum =matrix[1],matrix[1]=sum;
sum =matrix[2],matrix[2]=sum;
sum =matrix[3],matrix[3]=sum;
// 第二次轉置矩陣,將矩陣方向恢復正常
transpose(matrix);  
// 計算結果將資料寫到目標矩陣
if(0<count_y)dest[((pos_y 0)*param->dst_width_step pos_x)/V_TYPE]=matrix[0];
if(1<count_y)dest[((pos_y 1)*param->dst_width_step pos_x)/V_TYPE]=matrix[1];
if(2<count_y)dest[((pos_y 2)*param->dst_width_step pos_x)/V_TYPE]=matrix[2];
if(3<count_y)dest[((pos_y 3)*param->dst_width_step pos_x)/V_TYPE]=matrix[3];
}
#undef __SWAP
// 將第一個kernel計算的結果(4x4分塊的區域性積分圖)作為輸入輸入矩陣(dest)
// 計算每個4x4塊縱向結尾資料的字首和,存入vert
__kernel void _kernel_name_scan_v( __global DST_TYPE * dest, __constant integ_param* param,__global DST_TYPE *vert,int vert_step){ 
int gid_y=get_global_id(0);
if(gid_y>=param->height)return;
DST_TYPE sum=0;
int dst_width_step=param->dst_width_step;
for(int x=V_TYPE-1,end_x=param->width;x<end_x;x =V_TYPE){
sum =dest[gid_y*dst_width_step x];
vert[gid_y*vert_step (x/V_TYPE)]=sum;
}
}
// 將上第一個kernel計算的結果(4x4分塊的區域性積分圖)作為輸入輸入矩陣(dest)
// 將上第二個kernel計算的分組字首和作為輸入輸入矩陣(vert)
// 對dest每個4x4塊資料加上vert對應的上一組增量,結果輸出到dest_out
__kernel void _kernel_name_combine_v( __global VECTOR_DST * dest, __constant integ_param* param,__global DST_TYPE *vert,int vert_step,__global VECTOR_DST * dest_out){ 
int gid_x=get_global_id(0),gid_y=get_global_id(1);
if(gid_x*V_TYPE>=param->width||gid_y>=param->height)return;
int dest_index=(gid_y*param->dst_width_step)/V_TYPE gid_x;
VECTOR_DST m  = dest[dest_index];   
m  = (VECTOR_DST)(gid_x>=1 ? vert[ gid_y*vert_step   gid_x-1]:0);
dest_out [dest_index]=m;
}
// 將上一個kernel計算的結果(4x4分塊的區域性積分圖)作為輸入輸入矩陣(dest)
// 計算每個4x4塊橫向結尾資料的字首和,存入horiz
__kernel void _kernel_name_scan_h( __global VECTOR_DST * dest, __constant integ_param* param,__global VECTOR_DST *horiz,int horiz_step){ 
int gid_x=get_global_id(0);
if(gid_x*V_TYPE>=param->width)return;
VECTOR_DST sum=0;
int dst_width_step=param->dst_width_step;
for(int y=V_TYPE-1,end_y=param->height;y<end_y;y =V_TYPE){
sum =dest[y*dst_width_step/V_TYPE gid_x];
horiz[(y/V_TYPE)*horiz_step/V_TYPE gid_x]=sum;
}
}
// 將第三個kernel計算的結果作為輸入輸入矩陣(dest)
// 將第四個kernel計算的分組字首和作為輸入輸入矩陣(vert)
// 對dest每個4x4塊資料加上horiz對應的上一組增量,結果輸出到dest_out
// dest_out就是最終的積分圖
__kernel void _kernel_name_combine_h( __global VECTOR_DST * dest, __constant integ_param* param,__global VECTOR_DST *horiz,int horiz_step,__global VECTOR_DST * dest_out){ 
int gid_x=get_global_id(0),gid_y=get_global_id(1);
if(gid_x*V_TYPE>=param->width||gid_y>=param->height)return;
VECTOR_DST m;
int dest_index=(gid_y*param->dst_width_step)/V_TYPE gid_x;
m  = dest[dest_index];  
m  = gid_y>=V_TYPE?horiz[((gid_y/V_TYPE)-1)*horiz_step/V_TYPE   gid_x  ]:(DST_TYPE)0;
dest_out[dest_index]=m; 
}

common_types.h

/*
* common_types.h
*
*  Created on: 2016年4月14日
*      Author: guyadong
*/
#ifndef FACEDETECT_CL_FILES_COMMON_TYPES_H_
#define FACEDETECT_CL_FILES_COMMON_TYPES_H_
#ifdef __OPENCL_VERSION__
typedef char    cl_char;
typedef uchar   cl_uchar;
typedef short   cl_short;
typedef ushort  cl_ushort;
typedef int     cl_int;
typedef uint    cl_uint;
typedef long    cl_long;
typedef ulong   cl_ulong;
typedef double  cl_double;
typedef float   cl_float;
typedef char2       cl_char2;
typedef char4       cl_char4;
typedef char8       cl_char8;
typedef char16      cl_char16;
typedef uchar2      cl_uchar2;
typedef uchar4      cl_uchar4;
typedef uchar8      cl_uchar8;
typedef uchar16     cl_uchar16;
typedef short2      cl_short2;
typedef short4      cl_short4;
typedef short8      cl_short8;
typedef short16     cl_short16;
typedef ushort2     cl_ushort2;
typedef ushort4     cl_ushort4;
typedef ushort8     cl_ushort8;
typedef ushort16    cl_ushort16;
typedef int2        cl_int2;
typedef int4        cl_int4;
typedef int8        cl_int8;
typedef int16       cl_int16;
typedef uint2       cl_uint2;
typedef uint4       cl_uint4;
typedef uint8       cl_uint8;
typedef uint16      cl_uint16;
typedef long2       cl_long2;
typedef long4       cl_long4;
typedef long8       cl_long8;
typedef long16      cl_long16;
typedef ulong2      cl_ulong2;
typedef ulong4      cl_ulong4;
typedef ulong8      cl_ulong8;
typedef ulong16     cl_ulong16;
typedef float2      cl_float2;
typedef float4      cl_float4;
typedef float8      cl_float8;
typedef float16     cl_float16;
typedef double2     cl_double2;
typedef double4     cl_double4;
typedef double8     cl_double8;
typedef double16    cl_double16;
#ifdef NDEBUG
#define DEBUG_LOG(format, ...) 
#else
#define DEBUG_LOG(format, ...) printf((__constant char*)format, __VA_ARGS__)
#endif
#define LOG(format, ...) printf((__constant char*)format, __VA_ARGS__)
#ifndef NULL
#define NULL 0
#endif
#define _VECTOR(t,n) t##n
#define VECTOR(t,n) _VECTOR(t,n)
#define _FUN_NAME(f,n) f##n
#define FUN_NAME(f,n) _FUN_NAME(f,n)
#define _FUN_NAME2(f,n,s) f##_##n##s
#define FUN_NAME2(f,n,s) _FUN_NAME2(f,n,s)
#define VCONVERT(vtype,suffix) FUN_NAME2(convert,vtype,suffix)
#define VCONVERT_SAT(vtype) VCONVERT(vtype,_sat)
#define VAS(vtype) FUN_NAME2(as,vtype,)
#define ALIGN_UP(v,a) ((v (1<<a)-1)>>a<<a)
//denominator/numerator
#define CEIL_DIV(d,n) (((d) (n)-1)/(n))
#endif
// define alignment macro for data struct crossed between host & device
#ifdef _MSC_VER
#define _CL_CROSS_ALIGN_(n) __declspec( align(n) )
#elif __GNUC__
#define _CL_CROSS_ALIGN_(n) __attribute__((aligned(n)))
#elif __cplusplus>=201103L
#define _CL_CROSS_ALIGN_(n) alignas(n)
#elif __OPENCL_VERSION__
#define _CL_CROSS_ALIGN_(n) __attribute__((aligned(n)))
#else
#warning  Need to implement some method to align data here
#define _CL_CROSS_ALIGN_(n)
#endif /*_MSC_VER*/
// define column num of each work-item working for integral kernel,
// is also equivalent to the number of local work-items so sad get_local_size(0)
#define INTEGRAL_COLUMN_STEP 16
#define IMGSCALE_LOCAL_SIZE 64
/* get divisor for len/num */
inline size_t gf_get_divisor(size_t len,size_t num){
return (size_t)(len/num (int)(len%num>0));
}
typedef struct _integ_param {
cl_int width,height,src_width_step,dst_width_step;
}integ_param;
typedef struct _matrix_info_cl {
cl_uint     width   ;
cl_uint     height  ;
cl_uint     row_stride;
/*
#ifdef __cplusplus
_matrix_info_cl(size_t width,size_t height,size_t row_stride=0):width(cl_uint(width)), height(cl_uint(width)),row_stride( cl_uint(row_stride? row_stride:width)) {}
_matrix_info_cl() = default;
_matrix_info_cl(const _matrix_info_cl&) = default;
_matrix_info_cl(_matrix_info_cl&&) = default;
_matrix_info_cl& operator=(const _matrix_info_cl&) = default;
_matrix_info_cl& operator=(_matrix_info_cl&&) = default;
#endif
*/
}matrix_info_cl;
// define integral matrix type
//  default intergal matrix
#define INTEG_DEFAULT   0
//  intergal matrix for suquare
#define INTEG_SQUARE    1
//  integral matrix for count of no zero
#define INTEG_COUNT     2
typedef enum _integral_type{
integ_default=INTEG_DEFAULT
,integ_square=INTEG_SQUARE
,integ_count=INTEG_COUNT
}integral_type;
#endif /* FACEDETECT_CL_FILES_COMMON_TYPES_H_ */