Point Cloud Library (PCL) 1.12.0
block.hpp
/*
 * Software License Agreement (BSD License)
 *
 * Copyright (c) 2011, Willow Garage, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials provided
 *    with the distribution.
 *  * Neither the name of Willow Garage, Inc. nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Anatoly Baskeheev, Itseez Ltd, (myname.mysurname@mycompany.com)
 */

#ifndef PCL_DEVICE_UTILS_BLOCK_HPP_
#define PCL_DEVICE_UTILS_BLOCK_HPP_

namespace pcl
{
  namespace device
  {
    struct Block
    {
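      /** \brief Returns the x-index of the current block within the grid. */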
      static __device__ __forceinline__ unsigned int id()
      {
        return blockIdx.x;
      }

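      /** \brief Returns the total number of threads in the block (blockDim.x * blockDim.y * blockDim.z). */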
      static __device__ __forceinline__ unsigned int stride()
      {
        return blockDim.x * blockDim.y * blockDim.z;
      }

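      /** \brief Barrier: synchronizes all threads of the block (wraps __syncthreads()). */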
      static __device__ __forceinline__ void sync()
      {
        __syncthreads();
      }

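      /** \brief Returns the linear (flattened) index of the calling thread within its 3-D block. */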
      static __device__ __forceinline__ int flattenedThreadId()
      {
        return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
      }

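      /** \brief Block-wide fill: all threads cooperatively write \a value to every element of [beg, end). */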
      template<typename It, typename T>
      static __device__ __forceinline__ void fill(It beg, It end, const T& value)
      {
        int STRIDE = stride();
        It t = beg + flattenedThreadId();

        for(; t < end; t += STRIDE)
          *t = value;
      }

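      /** \brief Block-wide iota-style fill: cooperatively writes the increasing sequence value, value+1, ... into [beg, end). */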
      template<typename OutIt, typename T>
      static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
      {
        int STRIDE = stride();
        int tid = flattenedThreadId();
        value += tid;

        for(OutIt t = beg + tid; t < end; t += STRIDE, value += STRIDE)
          *t = value;
      }

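      /** \brief Block-wide copy of [beg, end) to the range starting at \a out. */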
      template<typename InIt, typename OutIt>
      static __device__ __forceinline__ void copy(InIt beg, InIt end, OutIt out)
      {
        int STRIDE = stride();
        InIt t = beg + flattenedThreadId();
        OutIt o = out + (t - beg);

        for(; t < end; t += STRIDE, o += STRIDE)
          *o = *t;
      }

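      /** \brief Block-wide unary transform: writes op(*it) for every element of [beg, end) to the range starting at \a out. */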
      template<typename InIt, typename OutIt, class UnOp>
      static __device__ __forceinline__ void transform(InIt beg, InIt end, OutIt out, UnOp op)
      {
        int STRIDE = stride();
        InIt t = beg + flattenedThreadId();
        OutIt o = out + (t - beg);

        for(; t < end; t += STRIDE, o += STRIDE)
          *o = op(*t);
      }

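      /** \brief Block-wide binary transform: writes op(*it1, *it2) over [beg1, end1) and the matching range at \a beg2 to \a out. */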
      template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
      static __device__ __forceinline__ void transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
      {
        int STRIDE = stride();
        InIt1 t1 = beg1 + flattenedThreadId();
        InIt2 t2 = beg2 + flattenedThreadId();
        OutIt o = out + (t1 - beg1);

        for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, o += STRIDE)
          *o = op(*t1, *t2);
      }

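      /** \brief Block-wide tree reduction over CTA_SIZE elements held in \a buffer (typically shared memory).
        * On return the result is in buffer[0]. CTA_SIZE must be a power of two and match the number of threads in the block.
        * \note The final steps rely on implicit warp-synchronous execution of the first warp; this legacy pattern is not
        * safe under independent thread scheduling (Volta and newer) without additional synchronization. */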
      template<int CTA_SIZE, typename T, class BinOp>
      static __device__ __forceinline__ void reduce(volatile T* buffer, BinOp op)
      {
        int tid = flattenedThreadId();
        T val = buffer[tid];

        if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
        if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
        if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
        if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

        if (tid < 32)
        {
          if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
          if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
          if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
          if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
          if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
          if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
        }
      }

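      /** \brief Block-wide tree reduction that first stores each thread's \a init into \a buffer, then reduces
        * as in the in-place overload and returns the result (buffer[0]) to all threads of the block.
        * \note Same CTA_SIZE and warp-synchronous caveats as the in-place overload. */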
      template<int CTA_SIZE, typename T, class BinOp>
      static __device__ __forceinline__ T reduce(volatile T* buffer, T init, BinOp op)
      {
        int tid = flattenedThreadId();
        T val = buffer[tid] = init;
        __syncthreads();

        if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
        if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
        if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
        if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }

        if (tid < 32)
        {
          if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
          if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
          if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid +  8]); }
          if (CTA_SIZE >=  8) { buffer[tid] = val = op(val, buffer[tid +  4]); }
          if (CTA_SIZE >=  4) { buffer[tid] = val = op(val, buffer[tid +  2]); }
          if (CTA_SIZE >=  2) { buffer[tid] = val = op(val, buffer[tid +  1]); }
        }
        __syncthreads();
        return buffer[0];
      }

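      /** \brief Reduces the first \a n elements of \a data in place using \a op; the result ends up in data[0].
        * Handles arbitrary \a n: elements beyond the block's thread count are folded in first, then the remaining
        * range is halved per step. */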
      template <typename T, class BinOp>
      static __device__ __forceinline__ void reduce_n(T* data, unsigned int n, BinOp op)
      {
        int ftid = flattenedThreadId();
        int sft = stride();

        if (sft < n)
        {
          for (unsigned int i = sft + ftid; i < n; i += sft)
            data[ftid] = op(data[ftid], data[i]);

          __syncthreads();

          n = sft;
        }

        while (n > 1)
        {
          unsigned int half = n/2;

          if (ftid < half)
            data[ftid] = op(data[ftid], data[n - ftid - 1]);

          __syncthreads();

          n = n - half;
        }
      }
    };
  }
}

#endif /* PCL_DEVICE_UTILS_BLOCK_HPP_ */
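
The struct is header-only and used from device code. Below is a minimal usage sketch, not part of the header: it assumes the header is reachable as <pcl/gpu/utils/device/block.hpp>, that the kernel is launched with a one-dimensional block of exactly CTA_SIZE threads, and that CTA_SIZE is a power of two. The kernel and functor names are illustrative only.

// Usage sketch (assumptions as stated above): block-wide sum of per-thread partial values.
#include <pcl/gpu/utils/device/block.hpp>

// Hypothetical binary functor passed to Block::reduce.
struct PlusOp
{
  __device__ __forceinline__ float operator()(float a, float b) const { return a + b; }
};

template<int CTA_SIZE>
__global__ void blockSumKernel(const float* in, float* block_sums, int n)
{
  // Shared staging buffer for the reduction; one slot per thread of the block.
  __shared__ volatile float buffer[CTA_SIZE];

  // Each thread accumulates a grid-strided partial sum of the input.
  float partial = 0.f;
  for (int i = blockIdx.x * CTA_SIZE + threadIdx.x; i < n; i += gridDim.x * CTA_SIZE)
    partial += in[i];

  // Block-wide reduction; every thread receives the block's total.
  float total = pcl::device::Block::reduce<CTA_SIZE>(buffer, partial, PlusOp());

  // One thread per block writes the per-block result.
  if (pcl::device::Block::flattenedThreadId() == 0)
    block_sums[pcl::device::Block::id()] = total;
}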