template<int block_size>
__global__ void ReduceSumKernel(int *in, int *out, int n) {
    int offset = blockIdx.x * blockDim.x;
    int tid = offset + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    __shared__ int buffer[block_size];
    // In the previous version, each element of the shared-memory array held the value of only one element of the global-memory array.
    // To raise the proportion of work done before the reduction, the data of several global-memory elements can be accumulated into one shared-memory element before reducing.
    // If a single thread handled several adjacent elements, the global-memory accesses would not be coalesced, so adjacent threads must access adjacent data.
    // That means the elements visited by one thread are separated by a stride; here the size of the whole grid is used as that stride.
    int t = 0;
    for (int i = tid; i < n; i += stride) {
        t += in[i];
    }
    buffer[threadIdx.x] = t;
    __syncthreads();
    // Reduce in shared memory until 32 partial sums (one warp's worth) remain.
    for (int i = block_size >> 1; i >= 32; i >>= 1) {
        if (threadIdx.x < i) {
            buffer[threadIdx.x] += buffer[threadIdx.x + i];
        }
        __syncthreads();
    }
    // Reduce the last 32 partial sums with warp shuffles instead of shared memory.
    t = buffer[threadIdx.x];
    for (int i = 16; i >= 1; i >>= 1) {
        t += __shfl_down_sync(0xffffffff, t, i);
    }
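// For reference, the following self-contained sketch shows the same pattern end to end:
// grid-stride accumulation, shared-memory rounds down to one warp, warp shuffles, and a
// final combine of the block results. It assumes the per-block sums are added into a
// single zero-initialized output with atomicAdd; the kernel name ReduceSumSketch and the
// host code are illustrative only, not taken from the original.
#include <cstdio>
#include <cuda_runtime.h>

template<int block_size>
__global__ void ReduceSumSketch(const int *in, int *out, int n) {
    __shared__ int buffer[block_size];
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    int t = 0;
    for (int i = tid; i < n; i += stride) {
        t += in[i];
    }
    buffer[threadIdx.x] = t;
    __syncthreads();

    for (int i = block_size >> 1; i >= 32; i >>= 1) {
        if (threadIdx.x < i) {
            buffer[threadIdx.x] += buffer[threadIdx.x + i];
        }
        __syncthreads();
    }

    t = buffer[threadIdx.x];
    for (int i = 16; i >= 1; i >>= 1) {
        t += __shfl_down_sync(0xffffffff, t, i);
    }
    if (threadIdx.x == 0) {
        atomicAdd(out, t);  // assumed way of combining the per-block sums
    }
}

int main() {
    const int n = 1 << 20;
    constexpr int block_size = 128;
    int *in, *out;
    cudaMallocManaged(&in, n * sizeof(int));
    cudaMallocManaged(&out, sizeof(int));
    for (int i = 0; i < n; ++i) in[i] = 1;
    *out = 0;
    ReduceSumSketch<block_size><<<64, block_size>>>(in, out, n);
    cudaDeviceSynchronize();
    printf("sum = %d (expected %d)\n", *out, n);
    cudaFree(in);
    cudaFree(out);
    return 0;
}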
__global__ void VecTop2Kernel(int *in, int *out, int n) {
    int offset = blockIdx.x * blockDim.x;
    int tid = offset + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    extern __shared__ int lich[];
    // Each thread first reads a data element and writes it into shared memory.
    // On the first call of this kernel, top1 in each thread is the value it read and top2 is INT_MIN.
    int top1 = INT_MIN;
    int top2 = INT_MIN;
    // Keep a running top-2 (top1 >= top2) of the elements visited by this thread.
    for (int i = tid; i < n; i += stride) {
        if (in[i] > top2) {
            top2 = min(in[i], top1);
            top1 = max(in[i], top1);
        }
    }
    lich[2 * threadIdx.x] = top1;
    lich[2 * threadIdx.x + 1] = top2;
    __syncthreads();
    // In each round half of the threads are active; each active thread reads four values, picks the two largest of them, and writes those two back to the corresponding positions in shared memory.
    // Every iteration discards half of the candidates; i is the number of threads used in that iteration, and c, d are offset from a, b by 2 * i elements.
    for (int i = blockDim.x / 2; i >= 1; i /= 2) {
        if (threadIdx.x < i) {
            int a = lich[2 * threadIdx.x];
            int b = lich[2 * threadIdx.x + 1];
            int c = lich[2 * threadIdx.x + 2 * i];
            int d = lich[2 * threadIdx.x + 1 + 2 * i];
            // With a >= b and c >= d, the largest of the four values is max(a, c)
            // and the second largest is min(max(a, d), max(b, c)).
            top1 = max(a, c);
            top2 = min(max(a, d), max(b, c));
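// The pairwise merge above relies on a small identity: if the pairs (a, b) and (c, d)
// are each sorted so that a >= b and c >= d, then the two largest of the four values are
// max(a, c) and min(max(a, d), max(b, c)). The self-contained check below is only an
// illustration; the helper name Top2Merge and the brute-force test are not part of the
// original code.
#include <algorithm>
#include <cassert>

// Merge two sorted pairs (first >= second) into the top-2 of all four values.
__host__ __device__ inline void Top2Merge(int a, int b, int c, int d, int *top1, int *top2) {
    *top1 = a > c ? a : c;
    int x = a > d ? a : d;   // the larger remaining candidate if c wins overall
    int y = b > c ? b : c;   // the larger remaining candidate if a wins overall
    *top2 = x < y ? x : y;
}

int main() {
    // Check the identity against sorting for all small sorted pairs.
    for (int a = 0; a < 6; ++a)
        for (int b = 0; b <= a; ++b)
            for (int c = 0; c < 6; ++c)
                for (int d = 0; d <= c; ++d) {
                    int v[4] = {a, b, c, d};
                    std::sort(v, v + 4);
                    int top1, top2;
                    Top2Merge(a, b, c, d, &top1, &top2);
                    assert(top1 == v[3] && top2 == v[2]);
                }
    return 0;
}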
template<int block_size>
__global__ void VecTop2Kernel(int *in, int *out, int n) {
    int offset = block_size * blockIdx.x;
    int tid = offset + threadIdx.x;
    int stride = block_size * gridDim.x;
    __shared__ int lich[block_size * 2];
    int top1 = INT_MIN;
    int top2 = INT_MIN;
    for (int i = tid; i < n; i += stride) {
        if (in[i] > top2) {
            top2 = min(in[i], top1);
            top1 = max(in[i], top1);
        }
    }
    lich[2 * threadIdx.x] = top1;
    lich[2 * threadIdx.x + 1] = top2;
    __syncthreads();
    for (int i = blockDim.x >> 1; i >= 1; i >>= 1) {
        if (threadIdx.x < i) {
            int a = lich[2 * threadIdx.x];
            int b = lich[2 * threadIdx.x + 1];
            int c = lich[2 * threadIdx.x + 2 * i];
            int d = lich[2 * threadIdx.x + 1 + 2 * i];
            top1 = max(a, c);
            top2 = min(max(a, d), max(b, c));
template <int block_size>
__global__ void VecTop2Kernel(int *in, int *out, int n) {
    int offset = block_size * blockIdx.x;
    int tid = offset + threadIdx.x;
    int stride = block_size * gridDim.x;
    __shared__ int lich[2 * block_size];
    int top1 = INT_MIN;
    int top2 = INT_MIN;
    for (int i = tid; i < n; i += stride) {
        if (in[i] > top2) {
            top2 = min(in[i], top1);
            top1 = max(in[i], top1);
        }
    }
    lich[2 * threadIdx.x] = top1;
    lich[2 * threadIdx.x + 1] = top2;
    __syncthreads();
    // Reduce in shared memory until only the first warp's 32 candidate pairs remain.
    for (int i = block_size >> 1; i >= 32; i >>= 1) {
        if (threadIdx.x < i) {
            int a = lich[2 * threadIdx.x];
            int b = lich[2 * threadIdx.x + 1];
            int c = lich[2 * threadIdx.x + 2 * i];
            int d = lich[2 * threadIdx.x + 1 + 2 * i];
            top1 = max(a, c);
            top2 = min(max(a, d), max(b, c));
    // Finish the last warp with shuffles instead of shared memory.
    int a = lich[2 * threadIdx.x];
    int b = lich[2 * threadIdx.x + 1];
    for (int i = 16; i >= 1; i >>= 1) {
        int c = __shfl_down_sync(0xffffffff, a, i);
        int d = __shfl_down_sync(0xffffffff, b, i);
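// A minimal self-contained sketch of how this warp-level finish can be completed is shown
// below: the shuffled pair (c, d) is merged with (a, b) using the same identity, and lane 0
// of the first warp ends up with the block's two largest values. Writing them to
// out[2 * blockIdx.x] and out[2 * blockIdx.x + 1] is an assumption of this sketch, as are
// the kernel name WarpTop2Sketch and the host code; none of it is taken from the original.
#include <climits>
#include <cstdio>
#include <cuda_runtime.h>

template <int block_size>
__global__ void WarpTop2Sketch(const int *in, int *out, int n) {
    __shared__ int lich[2 * block_size];
    int tid = block_size * blockIdx.x + threadIdx.x;
    int stride = block_size * gridDim.x;

    int top1 = INT_MIN;
    int top2 = INT_MIN;
    for (int i = tid; i < n; i += stride) {
        if (in[i] > top2) {
            top2 = min(in[i], top1);
            top1 = max(in[i], top1);
        }
    }
    lich[2 * threadIdx.x] = top1;
    lich[2 * threadIdx.x + 1] = top2;
    __syncthreads();

    // Shared-memory rounds down to one warp; each round writes its winners back.
    for (int i = block_size >> 1; i >= 32; i >>= 1) {
        if (threadIdx.x < i) {
            int a = lich[2 * threadIdx.x];
            int b = lich[2 * threadIdx.x + 1];
            int c = lich[2 * threadIdx.x + 2 * i];
            int d = lich[2 * threadIdx.x + 1 + 2 * i];
            lich[2 * threadIdx.x] = max(a, c);
            lich[2 * threadIdx.x + 1] = min(max(a, d), max(b, c));
        }
        __syncthreads();
    }

    // Warp-level finish: lanes 0..31 hold the remaining 32 sorted pairs.
    int a = lich[2 * threadIdx.x];
    int b = lich[2 * threadIdx.x + 1];
    for (int i = 16; i >= 1; i >>= 1) {
        int c = __shfl_down_sync(0xffffffff, a, i);
        int d = __shfl_down_sync(0xffffffff, b, i);
        int t1 = max(a, c);
        int t2 = min(max(a, d), max(b, c));
        a = t1;
        b = t2;
    }
    if (threadIdx.x == 0) {           // lane 0 of warp 0 holds the block's top-2
        out[2 * blockIdx.x] = a;      // assumed per-block output layout
        out[2 * blockIdx.x + 1] = b;
    }
}

int main() {
    const int n = 1 << 20;
    constexpr int block_size = 128;
    const int grid_size = 64;
    int *in, *out;
    cudaMallocManaged(&in, n * sizeof(int));
    cudaMallocManaged(&out, 2 * grid_size * sizeof(int));
    for (int i = 0; i < n; ++i) in[i] = i;  // top-2 of the array: n - 1 and n - 2
    WarpTop2Sketch<block_size><<<grid_size, block_size>>>(in, out, n);
    cudaDeviceSynchronize();
    // Merge the per-block results on the host with the same running top-2 update.
    int best1 = INT_MIN, best2 = INT_MIN;
    for (int i = 0; i < 2 * grid_size; ++i) {
        if (out[i] > best2) {
            best2 = out[i] < best1 ? out[i] : best1;
            best1 = out[i] > best1 ? out[i] : best1;
        }
    }
    printf("top1 = %d, top2 = %d\n", best1, best2);
    cudaFree(in);
    cudaFree(out);
    return 0;
}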