Skip to content

Commit e90a281

Browse files
committed
Fix CUDA kernel launch parameters
- the grid and block arguments were reversed in the `<<<...>>>` launch configuration
1 parent ffe99b2 commit e90a281

File tree

2 files changed

+35
-39
lines changed

2 files changed

+35
-39
lines changed

modules/cudawarping/perf/perf_warping.cpp

+2 -2
Original file line number | Diff line number | Diff line change
@@ -230,9 +230,9 @@ PERF_TEST_P(Sz_Depth_Cn_Scale, ResizeOnnxLinearAntialias,
230230
Combine(CUDA_TYPICAL_MAT_SIZES,
231231
Values(CV_8U, CV_16U, CV_32F),
232232
CUDA_CHANNELS_1_3_4,
233-
Values(0.2, 0.1, 0.05)))
233+
Values(0.8, 0.5, 0.3)))
234234
{
235-
declare.time(1.0);
235+
declare.time(10.0);
236236

237237
const cv::Size size = GET_PARAM(0);
238238
const int depth = GET_PARAM(1);

modules/cudawarping/src/cuda/resize_onnx.cu

+33 -37
Original file line number | Diff line number | Diff line change
@@ -339,24 +339,22 @@ namespace cv { namespace cuda { namespace device {
339339
{
340340
int xstart = __float2int_rd(fx) - 1;
341341
int ystart = __float2int_rd(fy) - 1;
342-
int xlimit = xstart + 3;
343-
int ylimit = ystart + 3;
344342
int xoffset[4];
345-
float xcoeff[4];
346-
for (int x = xstart; x <= xlimit; ++x)
343+
W1 xcoeff[4];
344+
for (int x = 0; x < 4; ++x, ++xstart)
347345
{
348-
xoffset[x - xstart] = clamp(x, 0, col1);
349-
xcoeff[x - xstart] = cubic.at(x - fx);
346+
xoffset[x] = clamp(xstart, 0, col1);
347+
xcoeff [x] = cubic.at(xstart - fx);
350348
}
351349
W sumval = VecTraits<W>::all(0);
352-
for (int y = ystart; y <= ylimit; ++y)
350+
for (int y = 0; y < 4; ++y, ++ystart)
353351
{
354-
int yoffest = clamp(y, 0, row1);
352+
int yoffest = clamp(ystart, 0, row1);
355353
T const* S = ptr<T>(src, yoffest);
356354
W sline = VecTraits<W>::all(0);
357355
for (int x = 0; x < 4; ++x)
358356
sline += xcoeff[x] * saturate_cast<W>(S[xoffset[x]]);
359-
sumval += sline * cubic.at(y - fy);
357+
sumval += sline * cubic.at(ystart - fy);
360358
}
361359
at<T>(dst, dy, dx) = saturate_cast<T>(sumval);
362360
}
@@ -376,19 +374,17 @@ namespace cv { namespace cuda { namespace device {
376374
{
377375
int xstart = __float2int_rd(fx) - 1;
378376
int ystart = __float2int_rd(fy) - 1;
379-
int xlimit = xstart + 3;
380-
int ylimit = ystart + 3;
381377
int xoffset[4], yoffset[4];
382378
W xcoeff[4], ycoeff[4];
383-
for (int x = xstart; x <= xlimit; ++x)
379+
for (int x = 0; x < 4; ++x, ++xstart)
384380
{
385-
xoffset[x - xstart] = clamp(x, 0, col1) * cn;
386-
xcoeff[x - xstart] = cubic.at(x - fx);
381+
xoffset[x] = clamp(xstart, 0, col1) * cn;
382+
xcoeff [x] = cubic.at(xstart - fx);
387383
}
388-
for (int y = ystart; y <= ylimit; ++y)
384+
for (int y = 0; y < 4; ++y, ++ystart)
389385
{
390-
yoffset[y - ystart] = clamp(y, 0, row1);
391-
ycoeff[y - ystart] = cubic.at(y - fy);
386+
yoffset[y] = clamp(ystart, 0, row1);
387+
ycoeff [y] = cubic.at(ystart - fy);
392388
}
393389
T* D = ptr<T>(dst, dy) + dx * cn;
394390
for (int i = 0; i < cn; ++i)
@@ -509,15 +505,15 @@ namespace cv { namespace cuda { namespace device {
509505
dim3 block(32, 8);
510506
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
511507
if (cn == 1)
512-
sampleKernel<<<block, grid, 0, stream>>>(M, LinearVec<T, W, 1>(src, dst));
508+
sampleKernel<<<grid, block, 0, stream>>>(M, LinearVec<T, W, 1>(src, dst));
513509
else if (cn == 2)
514-
sampleKernel<<<block, grid, 0, stream>>>(M, LinearVec<T, W, 2>(src, dst));
510+
sampleKernel<<<grid, block, 0, stream>>>(M, LinearVec<T, W, 2>(src, dst));
515511
else if (cn == 3)
516-
sampleKernel<<<block, grid, 0, stream>>>(M, LinearVec<T, W, 3>(src, dst));
512+
sampleKernel<<<grid, block, 0, stream>>>(M, LinearVec<T, W, 3>(src, dst));
517513
else if (cn == 4)
518-
sampleKernel<<<block, grid, 0, stream>>>(M, LinearVec<T, W, 4>(src, dst));
514+
sampleKernel<<<grid, block, 0, stream>>>(M, LinearVec<T, W, 4>(src, dst));
519515
else
520-
sampleKernel<<<block, grid, 0, stream>>>(M, LinearCn<T, W>(src, dst, cn));
516+
sampleKernel<<<grid, block, 0, stream>>>(M, LinearCn<T, W>(src, dst, cn));
521517
}
522518

523519
template <typename T, typename W>
@@ -527,15 +523,15 @@ namespace cv { namespace cuda { namespace device {
527523
dim3 block(32, 8);
528524
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
529525
if (cn == 1)
530-
sampleKernel<<<block, grid, 0, stream>>>(M, LinearAntiVec<T, W, 1>(src, dst, scale, 0));
526+
sampleKernel<<<grid, block, 0, stream>>>(M, LinearAntiVec<T, W, 1>(src, dst, scale, 0));
531527
else if (cn == 2)
532-
sampleKernel<<<block, grid, 0, stream>>>(M, LinearAntiVec<T, W, 2>(src, dst, scale, 0));
528+
sampleKernel<<<grid, block, 0, stream>>>(M, LinearAntiVec<T, W, 2>(src, dst, scale, 0));
533529
else if (cn == 3)
534-
sampleKernel<<<block, grid, 0, stream>>>(M, LinearAntiVec<T, W, 3>(src, dst, scale, 0));
530+
sampleKernel<<<grid, block, 0, stream>>>(M, LinearAntiVec<T, W, 3>(src, dst, scale, 0));
535531
else if (cn == 4)
536-
sampleKernel<<<block, grid, 0, stream>>>(M, LinearAntiVec<T, W, 4>(src, dst, scale, 0));
532+
sampleKernel<<<grid, block, 0, stream>>>(M, LinearAntiVec<T, W, 4>(src, dst, scale, 0));
537533
else
538-
sampleKernel<<<block, grid, 0, stream>>>(M, LinearAntiCn<T, W>(src, dst, scale, 0, cn));
534+
sampleKernel<<<grid, block, 0, stream>>>(M, LinearAntiCn<T, W>(src, dst, scale, 0, cn));
539535
}
540536

541537
//==================== cubic ====================//
@@ -547,15 +543,15 @@ namespace cv { namespace cuda { namespace device {
547543
dim3 block(32, 8);
548544
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
549545
if (cn == 1)
550-
sampleKernel<<<block, grid, 0, stream>>>(M, CubicVec<T, W, 1>(src, dst, A));
546+
sampleKernel<<<grid, block, 0, stream>>>(M, CubicVec<T, W, 1>(src, dst, A));
551547
else if (cn == 2)
552-
sampleKernel<<<block, grid, 0, stream>>>(M, CubicVec<T, W, 2>(src, dst, A));
548+
sampleKernel<<<grid, block, 0, stream>>>(M, CubicVec<T, W, 2>(src, dst, A));
553549
else if (cn == 3)
554-
sampleKernel<<<block, grid, 0, stream>>>(M, CubicVec<T, W, 3>(src, dst, A));
550+
sampleKernel<<<grid, block, 0, stream>>>(M, CubicVec<T, W, 3>(src, dst, A));
555551
else if (cn == 4)
556-
sampleKernel<<<block, grid, 0, stream>>>(M, CubicVec<T, W, 4>(src, dst, A));
552+
sampleKernel<<<grid, block, 0, stream>>>(M, CubicVec<T, W, 4>(src, dst, A));
557553
else
558-
sampleKernel<<<block, grid, 0, stream>>>(M, CubicCn<T, W>(src, dst, A, cn));
554+
sampleKernel<<<grid, block, 0, stream>>>(M, CubicCn<T, W>(src, dst, A, cn));
559555
}
560556

561557
template <typename T, typename W>
@@ -565,15 +561,15 @@ namespace cv { namespace cuda { namespace device {
565561
dim3 block(32, 8);
566562
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
567563
if (cn == 1)
568-
sampleKernel<<<block, grid, 0, stream>>>(M, CubicAntiVec<T, W, 1>(src, dst, scale, A));
564+
sampleKernel<<<grid, block, 0, stream>>>(M, CubicAntiVec<T, W, 1>(src, dst, scale, A));
569565
else if (cn == 2)
570-
sampleKernel<<<block, grid, 0, stream>>>(M, CubicAntiVec<T, W, 2>(src, dst, scale, A));
566+
sampleKernel<<<grid, block, 0, stream>>>(M, CubicAntiVec<T, W, 2>(src, dst, scale, A));
571567
else if (cn == 3)
572-
sampleKernel<<<block, grid, 0, stream>>>(M, CubicAntiVec<T, W, 3>(src, dst, scale, A));
568+
sampleKernel<<<grid, block, 0, stream>>>(M, CubicAntiVec<T, W, 3>(src, dst, scale, A));
573569
else if (cn == 4)
574-
sampleKernel<<<block, grid, 0, stream>>>(M, CubicAntiVec<T, W, 4>(src, dst, scale, A));
570+
sampleKernel<<<grid, block, 0, stream>>>(M, CubicAntiVec<T, W, 4>(src, dst, scale, A));
575571
else
576-
sampleKernel<<<block, grid, 0, stream>>>(M, CubicAntiCn<T, W>(src, dst, scale, A, cn));
572+
sampleKernel<<<grid, block, 0, stream>>>(M, CubicAntiCn<T, W>(src, dst, scale, A, cn));
577573
}
578574

579575
template <typename T, typename W>

0 commit comments

Comments (0)