OpenCL でベクタ演算 (SIMD) を試してみる - その1 -
主旨
OpenCL でベクタ演算 (SIMD) を試してみます.問題はOpenCL をさわってみる - kawa0810の日記 で取り扱った配列同士の加算演算のベクタ演算 (SIMD) 化を考えます.また,プログラムはオンラインコンパイルで作成します.
OpenCL でベクタ演算 (SIMD)
OpenCL ではベクタ型を使用することで SIMD ユニットを活用できる可能性があるそうです (OpenCL 入門より).OpenCL で使用できるベクタ型には以下のものがあります.
char2 tmp; int4 x;//4個の int をもつ変数 double4 y;//4個の double をもつ変数 float8 z;//8個の float をもつ変数 //etc...
ベクタ型を利用することで以下のように計算することが可能となります
//1. 従来 float x[4], y[4], z[4]; for(size_t i=0; i<4; ++i) z[i] = x[i] + y[i]; //2. ベクタ型 float4 x, y, z; z = x + y;//1行で4つの計算が可能 //3. 各要素毎に計算する場合は x(0番目),y(1番目), z(2番目),w(3番目)でも可能 z.x = x.x + y.x; z.y = x.y + y.y; z.z = x.z + y.z; z.w = x.w + y.w; //4. s をつけると数字でアクセスすることも可能 z.s0 = x.s0 + y.s0; z.s1 = x.s1 + y.s1; z.s2 = x.s2 + y.s2; z.s3 = x.s3 + y.s3; //5. 奇数データ (odd),偶数データ (even) だけアクセスすることも可能 float2 odd = x.odd;// odd = [x[1], x[3]]; float2 even = x.even;// even = [x[0], x[2]]; //6. 上位半分 (hi),下位半分(lo) だけにもアクセス可能 float2 hi = x.hi;// odd = [x[0], x[1]] float2 lo = x.lo;// lo = [x[2], x[3]]
カーネルソースコード
/*-------------------- kernel.cl --------------------*/ __kernel void vec_add(size_t n, __global float* z, __global float* x, __global float* y){ const size_t para = 4; const size_t end = (n / para) * para; for(size_t i=0; i<end; i+=para){ float4 vtmp = vload4(0, x+i) + vload4(0, y+i); vstore4(vtmp, 0, z+i); } for(size_t i=end; i<n; ++i){ z[i] = x[i] + y[i]; } }
ホストソースコード
/*-------------------- opencl_test.cpp --------------------*/ #include <iostream> #include <cstdio> #include <cstdlib> #include <cstring> #include <numeric> #include <sys/time.h> #ifdef __APPLE__ #include <OpenCL/opencl.h> #else #include <CL/cl.h> #endif #define MAX_SOURCE_SIZE (0x100000) int main(void){ const char filename[] = "./kernel.cl"; std::size_t const n = 10; cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_program program = NULL; cl_kernel kernel = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret; cl_mem x_dev, y_dev, z_dev;//for device memory float* x = (float*)malloc(sizeof(float) * n); float* y = (float*)malloc(sizeof(float) * n); float* z = (float*)malloc(sizeof(float) * n); srand( static_cast<unsigned>(time(NULL)) ); for(std::size_t i=0; i<n; ++i) x[i] = static_cast<float>( rand() ) / RAND_MAX; for(std::size_t i=0; i<n; ++i) y[i] = static_cast<float>( rand() ) / RAND_MAX; memset(z, 0x00, sizeof(float) * n); //Read a Kernel code FILE* fp = fopen(filename, "r"); if(!fp){ std::cerr << "Failed to load kernel" << std::endl; return -1; } char* source_str = (char*)malloc(MAX_SOURCE_SIZE); std::size_t source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); //Get the platforms ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); //Get the device ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); //Create context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); //Create Command Queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret); //Create Program Object program = clCreateProgramWithSource(context, 1, (const char**)&source_str, (const size_t*)&source_size, &ret); //Compile of kernel's source code ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); //Create Kernel object kernel = clCreateKernel(program, "vec_add", &ret); //Create memory object x_dev = clCreateBuffer(context, CL_MEM_READ_WRITE, n * sizeof(float), NULL, &ret); y_dev = clCreateBuffer(context, CL_MEM_READ_WRITE, n * sizeof(float), NULL, &ret); z_dev = clCreateBuffer(context, CL_MEM_READ_WRITE, n * sizeof(float), NULL, &ret); //memory copy host to device ret = clEnqueueWriteBuffer(command_queue, x_dev, CL_TRUE, 0, n * sizeof(float), x, 0, NULL, NULL); ret = clEnqueueWriteBuffer(command_queue, y_dev, CL_TRUE, 0, n * sizeof(float), y, 0, NULL, NULL); ret = clEnqueueWriteBuffer(command_queue, z_dev, CL_TRUE, 0, n * sizeof(float), z, 0, NULL, NULL); //Set args for kernel object ret = clSetKernelArg(kernel, 0, sizeof(std::size_t), &n); ret += clSetKernelArg(kernel, 1, sizeof(cl_mem), &z_dev); ret += clSetKernelArg(kernel, 2, sizeof(cl_mem), &x_dev); ret += clSetKernelArg(kernel, 3, sizeof(cl_mem), &y_dev); //Execute Kernel Program ret = clEnqueueTask(command_queue, kernel, 0, NULL, NULL); //memory copy device to host ret = clEnqueueReadBuffer(command_queue, z_dev, CL_TRUE, 0, n * sizeof(float), z, 0, NULL, NULL); for(std::size_t i=0; i<n; ++i){ std::cout << x[i] << " + " << y[i] << " = " << z[i]<< std::endl; } //memory free ret = clReleaseMemObject(x_dev); ret = clReleaseMemObject(y_dev); ret = clReleaseMemObject(z_dev); ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); free(x); free(y); free(z); free(source_str); return 0; }