>>数值归并例子1
给定一个一维数组,其中包含8192个元素,每个元素的值都是1。我们要求出所有元素的和,因此最终答案应为8192。
主程序(main.py)
import pyopencl as cl
import numpy as np


def main():
    """Sum an array of ones on an OpenCL device via a work-group reduction.

    The kernel ``reduction_scalar`` (read from ``reduction_scalar.cl``)
    reduces each work-group's slice of the input to one partial sum; the
    host then adds the partial sums and prints the total (8192.0 for the
    input built here).
    """
    # Number of elements to sum.
    array_size = 8192
    elem_size = np.dtype(np.float32).itemsize

    # Step 0: pick the compute device.
    platform = cl.get_platforms()[0]
    device = platform.get_devices()[0]

    # Step 1: create the context on that very device, so the queue below is
    # guaranteed to be built on a device that belongs to the context.
    context = cl.Context(devices=[device])

    # Step 2: create the command queue.
    queue = cl.CommandQueue(
        context, device, cl.command_queue_properties.PROFILING_ENABLE
    )

    # Work-group size: the kernel halves the active range on every step, so
    # use the largest power of two the device allows, then shrink it until
    # it evenly divides the global size.
    max_work_group_size = int(
        device.get_info(cl.device_info.MAX_WORK_GROUP_SIZE)
    )
    group_size = 1 << (min(max_work_group_size, array_size).bit_length() - 1)
    while array_size % group_size:
        group_size >>= 1
    num_groups = array_size // group_size

    # Host data: the input of ones and one output slot per work-group.
    host_input = np.ones(array_size, dtype=np.float32)
    host_partial = np.zeros(num_groups, dtype=np.float32)

    mem_flags = cl.mem_flags
    num_buf = cl.Buffer(
        context,
        mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
        hostbuf=host_input,
    )
    out_buf = cl.Buffer(
        context,
        mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR,
        hostbuf=host_partial,
    )

    try:
        # Per-group scratch space. LocalMemory takes a size in BYTES:
        # one float32 per work-item of the group.
        tmp_buf = cl.LocalMemory(group_size * elem_size)

        # Load and build the CL program; the file is closed right after
        # it has been read.
        with open("reduction_scalar.cl", "r") as f_cl:
            kernel_source = f_cl.read()
        prg = cl.Program(context, kernel_source).build()

        # Run the kernel and fetch the per-group partial sums.
        prg.reduction_scalar(
            queue, (array_size,), (group_size,), num_buf, tmp_buf, out_buf
        )
        cl.enqueue_copy(queue, host_partial, out_buf)
        queue.finish()
    finally:
        # Release device buffers only after all queued work has completed
        # (or failed).
        num_buf.release()
        out_buf.release()

    # Final reduction of the partial sums on the host.
    print(float(host_partial.sum(dtype=np.float64)))


if __name__ == '__main__':
    main()
内核程序(reduction_scalar.cl)
/*
 * Work-group sum reduction.
 *
 * data         : input values, one per work-item (indexed by global id).
 * partial_sums : local scratch buffer; must hold at least
 *                get_local_size(0) floats.
 * output       : one result per work-group (indexed by group id).
 *
 * Each work-item copies its element into local memory, then the group
 * repeatedly folds the upper part of the active range onto the lower part
 * until a single value remains in partial_sums[0].
 */
__kernel void reduction_scalar(__global float* data,
                               __local float* partial_sums,
                               __global float* output)
{
    int cid = get_local_id(0);
    int group_size = get_local_size(0);

    partial_sums[cid] = data[get_global_id(0)];
    barrier(CLK_LOCAL_MEM_FENCE);

    /*
     * `active` is the number of live partial sums. Using a ceil-half stride
     * keeps the middle element when `active` is odd, so work-group sizes
     * that are not a power of two are reduced correctly too. For
     * power-of-two sizes the additions are the same as plain halving.
     * The loop bounds depend only on group_size, so every work-item reaches
     * every barrier.
     */
    for (int active = group_size; active > 1; ) {
        int stride = (active + 1) / 2;
        if (cid < active / 2) {
            partial_sums[cid] += partial_sums[cid + stride];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        active = stride;
    }

    /* The first work-item of the group publishes the group's sum. */
    if (cid == 0) {
        output[get_group_id(0)] = partial_sums[0];
    }
}
PyOpenCL示例-05.数值归并