PyOpenCL示例-05.数值归并

>>数值归并例子1

一维数组,里面有8192个元素,每个元素值都是1,我们要求出所有元素的和,即最后的答案应为8192。做法是:把数组按工作组(work-group)分块,每个工作组在局部内存(local memory)中做树形归约,得到本组的部分和;最后由主机端把各组的部分和累加,得到总和。

主程序(main.py)

import pyopencl as cl
import numpy as np

def choose_work_group_size(array_size, max_work_group_size):
    """Return the work-group size to use for the tree reduction kernel.

    The kernel halves the active range on every step, so the group size must
    be a power of two; it must also divide ``array_size`` evenly so that every
    work-item maps to a valid element, and must not exceed the device limit.

    Args:
        array_size: Total number of elements to reduce (must be >= 1).
        max_work_group_size: Largest work-group size allowed (must be >= 1).

    Returns:
        The largest power of two that divides ``array_size`` and is
        ``<= max_work_group_size``.

    Raises:
        ValueError: If either argument is smaller than 1.
    """
    if array_size < 1 or max_work_group_size < 1:
        raise ValueError("array_size and max_work_group_size must be positive")
    group_size = 1
    while True:
        doubled = group_size * 2
        if doubled > max_work_group_size or array_size % doubled:
            return group_size
        group_size = doubled


if __name__ == '__main__':

    # Number of elements to sum.
    array_size = 8192

    # Step 0: pick the compute device (first device of the first platform).
    platform = cl.get_platforms()[0]
    devices = platform.get_devices()
    device = devices[0]

    # Step 1: create the context on exactly that device.  Using
    # create_some_context() here could select a different device than the one
    # the queue is created on, which makes the queue creation fail.
    context = cl.Context(devices=[device])

    # Step 2: create the command queue.
    queue = cl.CommandQueue(
        context, device,
        properties=cl.command_queue_properties.PROFILING_ENABLE)

    # Load and build the CL program; the context manager closes the file.
    with open("reduction_scalar.cl", "r") as f_cl:
        f_buf = f_cl.read()
    prg = cl.Program(context, f_buf).build()
    kernel = prg.reduction_scalar

    # The usable work-group size is bounded both by the device and by what
    # this particular kernel can run with on the device.
    max_work_group_size = min(
        device.get_info(cl.device_info.MAX_WORK_GROUP_SIZE),
        kernel.get_work_group_info(
            cl.kernel_work_group_info.WORK_GROUP_SIZE, device))
    group_size = choose_work_group_size(array_size, max_work_group_size)
    num_groups = array_size // group_size

    # Input: array_size ones.  Output: one partial sum per work-group.
    buf_num = np.ones(array_size, dtype=np.float32)
    buf_out = np.zeros(num_groups, dtype=np.float32)

    num_buf = cl.Buffer(
        context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
        hostbuf=buf_num)
    out_buf = cl.Buffer(
        context, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
        hostbuf=buf_out)
    # Per-group scratch space.  LocalMemory takes a size in BYTES, and each
    # group needs one float32 slot per work-item.
    tmp_buf = cl.LocalMemory(group_size * buf_num.itemsize)

    # Run: one work-item per element, group_size work-items per group.
    kernel(queue, (array_size,), (group_size,), num_buf, tmp_buf, out_buf)

    # Copy the per-group partial sums back and wait for the queue to drain
    # before the buffers are released.
    cl.enqueue_copy(queue, buf_out, out_buf)
    queue.finish()

    # Final step of the reduction on the host: add up the partial sums.
    total = buf_out.sum(dtype=np.float64)
    print(total)

    num_buf.release()
    out_buf.release()


    

内核程序(reduction_scalar.cl)

/*
 * Sum the elements of `data` handled by one work-group and store that
 * partial sum in output[group id].
 *
 *   data          input values, indexed by global work-item id
 *   partial_sums  local scratch array, indexed by local work-item id
 *   output        one result slot per work-group
 *
 * Each pass adds the upper half of the active range onto the lower half and
 * then halves the range.  NOTE: with a work-group size that is not a power of
 * two, the integer halving skips some elements.
 */
__kernel void reduction_scalar(__global float* data, __local float* partial_sums, __global float* output){
    const int lid = get_local_id(0);
    const int local_count = get_local_size(0);

    /* Stage this work-item's element in local memory. */
    partial_sums[lid] = data[get_global_id(0)];
    barrier(CLK_LOCAL_MEM_FENCE);

    int stride = local_count / 2;
    while (stride > 0) {
        if (lid < stride) {
            partial_sums[lid] = partial_sums[lid] + partial_sums[lid + stride];
        }
        /* Executed by every work-item, including the idle ones, so the
           whole group finishes this pass before the next one starts. */
        barrier(CLK_LOCAL_MEM_FENCE);
        stride = stride / 2;
    }

    /* The first work-item publishes the group's total. */
    if (lid == 0) {
        output[get_group_id(0)] = partial_sums[0];
    }
}
PyOpenCL示例-05.数值归并
滚动到顶部