47 #endif // HAVE_VIENNACL
55 namespace implementation
61 template <enum Backend,
class Matrix>
64 typedef typename Matrix::Scalar
T;
82 static void compute(Matrix X, Matrix W, Matrix Y,
bool flip ,
83 bool overwrite, int32_t stride_x, int32_t stride_y);
89 template <>
template <
class Matrix>
92 typedef typename Matrix::Scalar
T;
110 bool overwrite, int32_t stride_x, int32_t stride_y)
118 int32_t rx = (kx-1)/2;
119 int32_t ry = (ky-1)/2;
121 for (int32_t x=0; x<width; x+=stride_x)
123 int32_t xout = x/stride_x;
125 for (int32_t y=0; y<height; y+=stride_y)
127 int32_t yout = y/stride_y;
129 T
sum = overwrite ? 0 : Y(yout,xout);
130 for (int32_t x1=x-rx; x1<=x+rx; x1++)
132 int32_t wx = flip ? x1-x+rx : rx-x1+x;
133 for (int32_t y1=y-ry; y1<=y+ry; y1++)
135 if (x1>=0 && y1>=0 && x1<width && y1<height)
138 sum += W(y1-y+ry,wx)*X(y1,x1);
140 sum += W(ry-y1+y,wx)*X(y1,x1);
149 #endif // HAVE_EIGEN3
154 template <>
template <
class Matrix>
155 struct convolve<Backend::VIENNACL, Matrix>
157 typedef typename Matrix::Scalar
T;
161 static viennacl::ocl::kernel& generate_kernel_unity_stride(
162 int32_t radius_x, int32_t radius_y,
bool flip,
bool overwrite)
164 std::string kernel_name =
165 "convolve_unity_stride_" + ocl::get_type_string<T>() +
"_" +
166 std::to_string(radius_x) +
"_" + std::to_string(radius_y);
168 if (flip) kernel_name.append(
"_flip");
169 if (overwrite) kernel_name.append(
"_overwrite");
171 if (ocl::kernel_exists(kernel_name))
172 return ocl::get_kernel(kernel_name);
174 std::string source = ocl::generate_kernel_preamble<T>(kernel_name);
176 if (flip) source.append(
"#define FLIP\n");
177 if (overwrite) source.append(
"#define OVERWRITE\n");
179 source.append(
"#define RADIUS_X " + std::to_string(radius_x) +
"\n");
180 source.append(
"#define RADIUS_Y " + std::to_string(radius_y) +
"\n");
184 #define W_WIDTH (2*RADIUS_X+1)
185 #define W_HEIGHT (2*RADIUS_Y+1)
187 #define X_LOCAL_WIDTH (WORK_GROUP_SIZE_2D+2*RADIUS_X)
188 #define X_LOCAL_HEIGHT (WORK_GROUP_SIZE_2D+2*RADIUS_Y)
190 inline DATATYPE readX(read_only __global DATATYPE* X, int x, int y,
191 int X_width, int X_height, int X_offset)
193 if (x>=0 && y>=0 && x<X_width && y<X_height)
194 return X[y + x*X_height + X_offset];
199 __kernel void KERNEL_NAME(
200 read_only __global DATATYPE* X, int X_width, int X_height, int X_offset,
201 __constant DATATYPE* W, int W_offset,
202 __global DATATYPE* Y, int Y_offset)
204 __local DATATYPE X_local[X_LOCAL_WIDTH][X_LOCAL_HEIGHT];
206 int x = get_global_id(0);
207 int y = get_global_id(1);
209 int xl = get_local_id(0);
210 int yl = get_local_id(1);
212 if (xl==WORK_GROUP_SIZE_2D-1 && yl == WORK_GROUP_SIZE_2D-1)
214 for (int rx=0; rx<=2*RADIUS_X; rx++)
215 for (int ry=0; ry<=2*RADIUS_Y; ry++)
216 X_local[xl+rx][yl+ry] = readX(X, x-RADIUS_X+rx, y-RADIUS_Y+ry, X_width, X_height, X_offset);
218 else if (xl==WORK_GROUP_SIZE_2D-1)
220 for (int rx=0; rx<=2*RADIUS_X; rx++)
221 X_local[xl+rx][yl] = readX(X, x-RADIUS_X+rx, y-RADIUS_Y, X_width, X_height, X_offset);
223 else if (yl == WORK_GROUP_SIZE_2D-1)
225 for (int ry=0; ry<=2*RADIUS_Y; ry++)
226 X_local[xl][yl+ry] = readX(X, x-RADIUS_X, y-RADIUS_Y+ry, X_width, X_height, X_offset);
229 X_local[xl][yl] = readX(X, x-RADIUS_X, y-RADIUS_Y, X_width, X_height, X_offset);
231 barrier(CLK_LOCAL_MEM_FENCE);
233 if (x>=X_width || y>=X_height)
237 for (int x1=0; x1<W_WIDTH; x1++)
240 int wx = x1*W_HEIGHT+W_offset;
242 int wx = (2*RADIUS_X-x1)*W_HEIGHT+W_offset;
245 for (int y1=0; y1<W_HEIGHT; y1++)
249 sum += W[y1+wx]*X_local[inx][iny];
251 sum += W[2*RADIUS_Y-y1+wx]*X_local[inx][iny];
256 Y[y+X_height*x + Y_offset] = sum;
258 Y[y+X_height*x + Y_offset] += sum;
264 viennacl::ocl::kernel& kernel = ocl::compile_kernel(kernel_name, source);
266 kernel.local_work_size(0, OCL_WORK_GROUP_SIZE_2D);
267 kernel.local_work_size(1, OCL_WORK_GROUP_SIZE_2D);
274 static viennacl::ocl::kernel& generate_kernel_arbitrary_stride(
275 int32_t radius_x, int32_t radius_y,
bool flip,
bool overwrite)
277 std::string kernel_name =
278 "convolve_arbitrary_stride_" + ocl::get_type_string<T>() +
"_" +
279 std::to_string(radius_x) +
"_" + std::to_string(radius_y);
281 if (flip) kernel_name.append(
"_flip");
282 if (overwrite) kernel_name.append(
"_overwrite");
284 if (ocl::kernel_exists(kernel_name))
285 return ocl::get_kernel(kernel_name);
287 std::string source = ocl::generate_kernel_preamble<T>(kernel_name);
289 if (flip) source.append(
"#define FLIP\n");
290 if (overwrite) source.append(
"#define OVERWRITE\n");
292 source.append(
"#define RADIUS_X " + std::to_string(radius_x) +
"\n");
293 source.append(
"#define RADIUS_Y " + std::to_string(radius_y) +
"\n");
297 #define W_WIDTH (2*RADIUS_X+1)
298 #define W_HEIGHT (2*RADIUS_Y+1)
300 #define X_LOCAL_WIDTH (WORK_GROUP_SIZE_2D+2*RADIUS_X)
301 #define X_LOCAL_HEIGHT (WORK_GROUP_SIZE_2D+2*RADIUS_Y)
303 __kernel void KERNEL_NAME(
304 read_only __global DATATYPE* X, int X_width, int X_height, int X_offset,
305 __constant DATATYPE* W, int W_offset,
306 __global DATATYPE* Y, int Y_offset,
307 int stride_x, int stride_y)
309 __local DATATYPE X_local[WORK_GROUP_SIZE_2D][WORK_GROUP_SIZE_2D];
311 int x = get_global_id(0)*stride_x;
312 int y = get_global_id(1)*stride_y;
314 int Y_width = X_width/stride_x;
315 int Y_height = X_height/stride_y;
317 if (get_global_id(0)>=Y_width || get_global_id(1)>=Y_height)
321 for (int x1=0; x1<W_WIDTH; x1++)
324 int wx = x1*W_HEIGHT+W_offset;
326 int wx = (2*RADIUS_X-x1)*W_HEIGHT+W_offset;
328 int inx = x1+x-RADIUS_X;
329 for (int y1=0; y1<W_HEIGHT; y1++)
331 int iny = y1+y-RADIUS_Y;
332 if (inx>=0 && iny>=0 && inx<X_width && iny<X_height)
335 sum += W[y1+wx]*X[iny+inx*X_height+X_offset];
337 sum += W[2*RADIUS_Y-y1+wx]*X[iny+inx*X_height+X_offset];
343 Y[get_global_id(1)+Y_height*get_global_id(0) + Y_offset] = sum;
345 Y[get_global_id(1)+Y_height*get_global_id(0) + Y_offset] += sum;
351 viennacl::ocl::kernel& kernel = ocl::compile_kernel(kernel_name, source);
353 kernel.local_work_size(0, OCL_WORK_GROUP_SIZE_2D);
354 kernel.local_work_size(1, OCL_WORK_GROUP_SIZE_2D);
375 static void compute(CGPUMatrix<T> X, CGPUMatrix<T> W, CGPUMatrix<T> Y,
bool flip ,
376 bool overwrite, int32_t stride_x, int32_t stride_y)
378 if (stride_x==1 && stride_y==1)
380 viennacl::ocl::kernel& kernel = generate_kernel_unity_stride<T>(
381 (W.num_cols-1)/2, (W.num_rows-1)/2, flip, overwrite);
383 kernel.global_work_size(0, ocl::align_to_multiple_2d(Y.num_cols));
384 kernel.global_work_size(1, ocl::align_to_multiple_2d(Y.num_rows));
386 viennacl::ocl::enqueue(kernel(
387 X.vcl_matrix(), cl_int(X.num_cols), cl_int(X.num_rows), cl_int(X.offset),
388 W.vcl_matrix(), cl_int(W.offset),
389 Y.vcl_matrix(), cl_int(Y.offset)));
393 viennacl::ocl::kernel& kernel = generate_kernel_arbitrary_stride<T>(
394 (W.num_cols-1)/2, (W.num_rows-1)/2, flip, overwrite);
396 kernel.global_work_size(0, ocl::align_to_multiple_2d(Y.num_cols));
397 kernel.global_work_size(1, ocl::align_to_multiple_2d(Y.num_rows));
399 viennacl::ocl::enqueue(kernel(
400 X.vcl_matrix(), cl_int(X.num_cols), cl_int(X.num_rows), cl_int(X.offset),
401 W.vcl_matrix(), cl_int(W.offset),
402 Y.vcl_matrix(), cl_int(Y.offset),
403 cl_int(stride_x), cl_int(stride_y)));
408 #endif // HAVE_VIENNACL
415 #endif // CONVOLVE_H_
Eigen::Matrix< T, Eigen::Dynamic, Eigen::Dynamic > MatrixXt
static void compute(SGMatrix< T > X, SGMatrix< T > W, SGMatrix< T > Y, bool flip, bool overwrite, int32_t stride_x, int32_t stride_y)
Generic class sum which provides a static compute method. This class is specialized for different typ...
Eigen::Matrix< T, Eigen::Dynamic, 1 > VectorXt
Matrix::Scalar sum(Matrix m, bool no_diag=false)
static void compute(Matrix X, Matrix W, Matrix Y, bool flip, bool overwrite, int32_t stride_x, int32_t stride_y)
void convolve(Matrix X, Matrix W, Matrix Y, bool flip=false, bool overwrite=true, int32_t stride_x=1, int32_t stride_y=1)
all of classes and functions are contained in the shogun namespace