求矩阵每行的和?
可以把每一行交给一个独立的线程块处理,这样行与行之间实现粗粒度的并行。对于每一行,在其对应的线程块中分配 n 个线程(n 为行宽,下面的代码中 n = 8;这种折半归约要求 n 是 2 的幂),并使用共享存储器:每个线程先从显存中读取一个数到 shared memory 中,然后用归约(reduction)算法求和。
代码如下:
#include "cuda_runtime.h"               // CUDA runtime API
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>

// Fixed problem size: the host matrix is int[kRows][kCols] and the kernel's
// shared buffer holds exactly kCols elements.
static const unsigned int kRows = 4;
static const unsigned int kCols = 8;   // must be a power of two for the halving reduction

cudaError_t addWithCuda(int mat[4][8], int *ans, dim3 d);

// Sums each row of a pitched 2D int matrix.
//
// Launch layout: one block per row (gridDim.x = number of rows) and
// blockDim.x == kCols threads per block (one thread per element of the row).
// Shared memory: kCols ints, statically allocated.
//
// mat   - device pointer to the pitched matrix
// ans   - device pointer to gridDim.x ints; ans[r] receives the sum of row r
// pitch - row stride of mat in bytes
__global__ void addKernel(int *mat, int *ans, size_t pitch)
{
    const unsigned int bid = blockIdx.x;
    const unsigned int tid = threadIdx.x;
    __shared__ int data[kCols];

    // Rows are `pitch` bytes apart, so step in bytes before indexing as int.
    const int *row = (const int *)((const char *)mat + bid * pitch);

    // Guard keeps any surplus threads from touching memory outside the row.
    if (tid < kCols)
        data[tid] = row[tid];
    __syncthreads();

    // Tree reduction: each pass folds the upper half onto the lower half.
    // The barrier sits outside the `if` so every thread in the block reaches it.
    for (unsigned int stride = kCols / 2; stride > 0; stride /= 2) {
        if (tid < stride)
            data[tid] += data[tid + stride];
        __syncthreads();
    }

    if (tid == 0)
        ans[bid] = data[0];
}

int main()
{
    dim3 d(kCols, kRows);   // d.x = row width (threads per block), d.y = row count (blocks)
    int mat[kRows][kCols] = {
        {  1,  2,  3,  4,  5,  1,  2,  3 },
        {  6,  7,  8,  9, 10,  4,  5,  6 },
        { 11, 12, 13, 14, 15,  7,  8,  9 },
        { 16, 17, 18, 19, 20, 10, 11, 12 }
    };
    int ans[kRows] = { 0 };

    // Compute the per-row sums on the GPU.
    cudaError_t cudaStatus = addWithCuda(mat, ans, d);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!\n");
        return 1;
    }

    // cudaDeviceReset (the replacement for the deprecated cudaThreadExit) must
    // be called before exiting so that profiling and tracing tools such as
    // Nsight and Visual Profiler show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    for (unsigned int i = 0; i < d.y; i++) {
        std::cout << ans[i] << " ";
    }
    std::cout << std::endl;
    return 0;
}

// Host wrapper: copies the matrix to the device, launches addKernel with one
// block per row, times the launch with CUDA events, and copies the row sums back.
//
// mat - host matrix with 4 rows of 8 ints
// ans - host buffer receiving d.y row sums
// d   - d.x is the row width (must equal kCols), d.y is the number of rows to
//       sum (1..kRows)
//
// Returns cudaSuccess on success; cudaErrorInvalidValue for bad arguments;
// otherwise the status of the first CUDA call that failed. Device buffers and
// events are released on every path.
cudaError_t addWithCuda(int mat[4][8], int *ans, dim3 d)
{
    // Everything is declared up front so that `goto Error` never jumps over
    // an initialization.
    int *dev_mat = 0;       // device copy of the matrix (pitched)
    int *dev_ans = 0;       // device buffer for the row sums
    size_t pitch = 0;       // must be a real size_t: cudaMallocPitch writes sizeof(size_t) bytes
    cudaEvent_t start = 0;
    cudaEvent_t stop = 0;
    float tm = 0.0f;
    const size_t rowBytes = d.x * sizeof(int);
    cudaError_t cudaStatus;

    // The kernel's shared buffer and the host matrix layout are fixed, so
    // reject shapes that would read or write out of bounds.
    if (mat == 0 || ans == 0 || d.x != kCols || d.y == 0 || d.y > kRows) {
        fprintf(stderr, "addWithCuda: invalid arguments!\n");
        return cudaErrorInvalidValue;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?\n");
        goto Error;
    }

    // Allocate device memory.
    cudaStatus = cudaMallocPitch((void **)&dev_mat, &pitch, rowBytes, d.y);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMallocPitch failed!\n");
        goto Error;
    }

    cudaStatus = cudaMalloc((void **)&dev_ans, d.y * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }

    // Copy the matrix to the device. The host rows are tightly packed
    // (rowBytes apart); the device rows are `pitch` bytes apart.
    // dev_ans is not uploaded: the kernel overwrites every element of it.
    cudaStatus = cudaMemcpy2D(dev_mat, pitch, mat, rowBytes, rowBytes, d.y,
                              cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy for dev_mat failed!\n");
        goto Error;
    }

    // Events used to time the kernel.
    cudaStatus = cudaEventCreate(&start);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaEventCreate failed!\n");
        goto Error;
    }
    cudaStatus = cudaEventCreate(&stop);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaEventCreate failed!\n");
        goto Error;
    }

    cudaStatus = cudaEventRecord(start, 0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaEventRecord failed!\n");
        goto Error;
    }

    // Launch the kernel: d.y blocks (one per row), d.x threads per block.
    addKernel<<<d.y, d.x>>>(dev_mat, dev_ans, pitch);

    // A kernel launch returns nothing; configuration errors surface here.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    cudaStatus = cudaEventRecord(stop, 0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaEventRecord failed!\n");
        goto Error;
    }
    cudaStatus = cudaEventSynchronize(stop);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaEventSynchronize failed!\n");
        goto Error;
    }
    cudaStatus = cudaEventElapsedTime(&tm, start, stop);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaEventElapsedTime failed!\n");
        goto Error;
    }
    printf("GPU Elapsed time:%.6f ms.\n", tm);

    // cudaDeviceSynchronize (the replacement for the deprecated
    // cudaThreadSynchronize) waits for the kernel to finish and returns any
    // error encountered while it ran.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy the row sums from the GPU buffer back to host memory.
    cudaStatus = cudaMemcpy(ans, dev_ans, d.y * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Release resources on both the success and the failure path.
    if (start) cudaEventDestroy(start);
    if (stop) cudaEventDestroy(stop);
    cudaFree(dev_mat);
    cudaFree(dev_ans);

    return cudaStatus;
}