48 static void swap(T &p1, T &p2)
57 static T divUp(T a, T b)
59 return (a + b - 1) / b;
66 static __device__ __inline__
void assign(
volatile T *dst,
volatile T *src)
71 static __device__ __inline__
void reduce(
volatile T &in1out,
const volatile T &in2)
81 static __device__ __inline__
void assign(
volatile T *dst,
volatile T *src)
86 static __device__ __inline__
void reduce(
volatile T &in1out,
const volatile T &in2)
88 in1out = in1out > in2 ? in2 : in1out;
96 static __device__ __inline__
void assign(
volatile T *dst,
volatile T *src)
101 static __device__ __inline__
void reduce(
volatile T &in1out,
const volatile T &in2)
103 in1out = in1out > in2 ? in1out : in2;
108 template<
typename Tdata,
class Tfunc, Ncv32u nThreads>
109 static __device__ Tdata subReduce(Tdata threadElem)
113 __shared__ Tdata _reduceArr[nThreads];
114 volatile Tdata *reduceArr = _reduceArr;
115 functor.assign(reduceArr + threadIdx.x, &threadElem);
118 if (nThreads >= 256 && threadIdx.x < 128)
120 functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 128]);
124 if (nThreads >= 128 && threadIdx.x < 64)
126 functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 64]);
130 if (threadIdx.x < 32)
134 functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 32]);
136 if (nThreads >= 32 && threadIdx.x < 16)
138 functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 16]);
139 functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 8]);
140 functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 4]);
141 functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 2]);
142 functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 1]);
148 functor.assign(&reduceRes, reduceArr);
__device__ __host__ __forceinline__ void swap(T &a, T &b)
static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
static __device__ __inline__ void assign(volatile T *dst, volatile T *src)
static __device__ __inline__ void assign(volatile T *dst, volatile T *src)
static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
static __device__ __inline__ void assign(volatile T *dst, volatile T *src)