diff --git a/.gitignore b/.gitignore index bb42d2a4950..adc4cf79cdb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +# ignore dot files/directories +.* +!.gitignore + *.autosave *.pyc *.user diff --git a/modules/cudawarping/include/opencv2/cudawarping.hpp b/modules/cudawarping/include/opencv2/cudawarping.hpp index b9ca957358e..942100d15d9 100644 --- a/modules/cudawarping/include/opencv2/cudawarping.hpp +++ b/modules/cudawarping/include/opencv2/cudawarping.hpp @@ -113,6 +113,33 @@ supported for now. */ CV_EXPORTS_W void resize(InputArray src, OutputArray dst, Size dsize, double fx=0, double fy=0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null()); +/** @brief onnx resize op +https://github.com/onnx/onnx/blob/main/docs/Operators.md#Resize +https://github.com/onnx/onnx/blob/main/onnx/reference/ops/op_resize.py + +Not support `tf_crop_resize` yet. + +To get a similar result to `cv::resize`, give dsize and: + INTER_NEAREST : ASYMMETRIC + NEAREST_FLOOR + INTER_LINEAR : HALF_PIXEL + INTER_CUBIC : HALF_PIXEL + cubicCoeff(-0.75) + +@param src input image. +@param dst output image; it has the size dsize (when it is non-zero) or the size computed from src.size(), scale; the type of dst is the same as of src. +@param dsize output image size; if it equals to zero, it is computed as: +\f[\texttt{dsize = Size(int(scale.x * src.cols), int(scale.y * src.rows))}\f] +Either dsize or scale must be non-zero. +@param scale scale factor; use same definition as ONNX, if scale > 1, it's upsampling. +@param interpolation interpolation flags, see #InterpolationFlags and #ResizeONNXFlags +@param cubicCoeff cubic sampling coefficient, range \f[[-1.0, 0)\f] +@param stream Stream for the asynchronous version. + +@sa resize, resizeOnnx + */ +CV_EXPORTS_W void resizeOnnx(InputArray src, OutputArray dst, Size dsize, + Point2d scale = Point2d(), int interpolation = INTER_LINEAR, + float cubicCoeff = -0.75f, Stream& stream = Stream::Null()); + /** @brief Applies an affine transformation to an image. @param src Source image. CV_8U , CV_16U , CV_32S , or CV_32F depth and 1, 3, or 4 channels are diff --git a/modules/cudawarping/perf/perf_warping.cpp b/modules/cudawarping/perf/perf_warping.cpp index 3e7aa18f559..e5f6c8065f1 100644 --- a/modules/cudawarping/perf/perf_warping.cpp +++ b/modules/cudawarping/perf/perf_warping.cpp @@ -223,6 +223,48 @@ PERF_TEST_P(Sz_Depth_Cn_Scale, ResizeArea, } } +////////////////////////////////////////////////////////////////////// +// ResizeOnnx + +PERF_TEST_P(Sz_Depth_Cn_Scale, ResizeOnnxLinearAntialias, + Combine(CUDA_TYPICAL_MAT_SIZES, + Values(CV_8U, CV_16U, CV_32F), + CUDA_CHANNELS_1_3_4, + Values(0.8, 0.5, 0.3))) +{ + declare.time(10.0); + + const cv::Size size = GET_PARAM(0); + const int depth = GET_PARAM(1); + const int channels = GET_PARAM(2); + const int interpolation = cv::INTER_LINEAR | cv::INTER_ANTIALIAS; + const double f = GET_PARAM(3); + const Point2d scale = Point2d(f, f); + + const int type = CV_MAKE_TYPE(depth, channels); + + cv::Mat src(size, type); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_CUDA()) + { + const cv::cuda::GpuMat d_src(src); + cv::cuda::GpuMat dst; + + TEST_CYCLE() cv::cuda::resizeOnnx(d_src, dst, cv::Size(), scale, interpolation); + + CUDA_SANITY_CHECK(dst, 1); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::resizeOnnx(src, dst, cv::Size(), scale, interpolation); + + CPU_SANITY_CHECK(dst); + } +} + ////////////////////////////////////////////////////////////////////// // WarpAffine diff --git a/modules/cudawarping/src/cuda/resize_onnx.cu b/modules/cudawarping/src/cuda/resize_onnx.cu new file mode 100644 index 00000000000..0a70965e78a --- /dev/null +++ b/modules/cudawarping/src/cuda/resize_onnx.cu @@ -0,0 +1,907 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER +// #define __CUDACC__ 110700 +#include "opencv2/imgproc.hpp" +#include "opencv2/core/cuda/common.hpp" +#include "opencv2/core/cuda/vec_traits.hpp" +#include "opencv2/core/cuda/vec_math.hpp" +#include "opencv2/core/cuda/saturate_cast.hpp" + +namespace cv { namespace cuda { namespace device { + + __device__ __forceinline__ int clamp(int x, int lo, int hi) + { + return x < lo ? lo : hi < x ? hi : x; + } + + template + __device__ __forceinline__ T* ptr(PtrStepb const& src, int y) + { return reinterpret_cast(src.data + y * src.step); } + + template + __device__ __forceinline__ T& at(PtrStepb const& src, int y, int x) + { return ptr(src, y)[x]; } + + struct LinearCoeff + { + enum { ksize = 2 }; + + LinearCoeff(float) {} + + __device__ __forceinline__ float at(float x) const + { return __saturatef(1.f - ::fabsf(x)); } + }; + + struct CubicCoeff + { + enum { ksize = 4 }; + + float A, A2, A3; + + CubicCoeff(float a) : A(a), A2(a + 2), A3(a + 3) {} + + __device__ __forceinline__ float at(float x) const + { + x = ::fabsf(x); + if (x <= 1) + x = (A2 * x - A3) * x * x + 1; + else if (x <= 2) + x = A * (((x - 5) * x + 8) * x - 4); + else + x = 0; + return x; + } + }; + + //==================== sampler ====================// + + struct SamplerBase + { + PtrStepb src; + PtrStepSzb dst; + int row1, col1; + + SamplerBase(PtrStepSzb const& S, PtrStepSzb const& D) + : src(S), dst(D), row1(S.rows - 1), col1(S.cols - 1) + {} + }; + + template + struct AntiBase : public SamplerBase + { + static_assert(Coeff::ksize % 2 == 0, ""); + + float xscale, yscale; + int xstart, xend, ystart, yend; + Coeff coeff; + + AntiBase(PtrStepSzb const& S, PtrStepSzb const& D, + Point2f const& scale, float A) + : SamplerBase(S, D), coeff(A) + { + int const khalf = Coeff::ksize / 2; + xscale = std::min(scale.x, 1.f); + yscale = std::min(scale.y, 1.f); + xstart = cvFloor(-khalf / xscale) + 1; + xend = 2 - xstart; + ystart = cvFloor(-khalf / yscale) + 1; + yend = 2 - ystart; + } + }; + + ////////// nearest neighbor ////////// + + template + struct NearestVec : public SamplerBase + { + using SamplerBase::SamplerBase; + + __device__ void to(int sx, int sy, int dx, int dy) const + { + sx = clamp(sx, 0, col1); + sy = clamp(sy, 0, row1); + at(dst, dy, dx) = at(src, sy, sx); + } + }; + + struct NearestSize : public SamplerBase + { + size_t esz; + + NearestSize(PtrStepSzb const& S, PtrStepSzb const& D, size_t sz) + : SamplerBase(S, D), esz(sz) + {} + + __device__ void to(int sx, int sy, int dx, int dy) const + { + sx = clamp(sx, 0, col1); + sy = clamp(sy, 0, row1); + uchar const* S = ptr(src, sy) + sx * esz; + uchar * D = ptr(dst, dy) + dx * esz; + for (size_t i = 0; i < esz; ++i) + D[i] = S[i]; + } + }; + + ////////// anti-alias brute force ////////// + // because we can not allocate temporary memory to store coeffs and offsets + + template + struct AntiVec : public AntiBase + { + using AntiBase::AntiBase; + using T = typename TypeVec::vec_type; + using W = typename TypeVec::vec_type; + + __device__ void to(float fx, float fy, int dx, int dy) const + { + int ix = __float2int_rd(fx), iy = __float2int_rd(fy); + float rx = fx - ix, ry = fy - iy; + W1 weight = 0; + W sumval = VecTraits::all(0); + for (int h = this->ystart; h < this->yend; ++h) + { + W1 wline = 0; + W sline = VecTraits::all(0); + int sy = clamp(iy + h, 0, this->row1); + T const* S = ptr(this->src, sy); + for (int w = this->xstart; w < this->xend; ++w) + { + int sx = clamp(ix + w, 0, this->col1); + W1 t = this->coeff.at((w - rx) * this->xscale); + wline += t; + sline += t * saturate_cast(S[sx]); + } + W1 u = this->coeff.at((h - ry) * this->yscale); + weight += u * wline; + sumval += u * sline; + } + at(this->dst, dy, dx) = saturate_cast(sumval / weight); + } + }; + + template + struct AntiCn : public AntiBase + { + int cn; + + AntiCn(PtrStepSzb const& S, PtrStepSzb const& D, + Point2f const& scale, float A, int _cn) + : AntiBase(S, D, scale, A), cn(_cn) + {} + + __device__ void to(float fx, float fy, int dx, int dy) const + { + int ix = __float2int_rd(fx), iy = __float2int_rd(fy); + float rx = fx - ix, ry = fy - iy; + W weight = 0, sumval = 0; + T* D = ptr(this->dst, dy) + dx * cn; + for (int h = this->ystart; h < this->yend; ++h) + { + W wline = 0, sline = 0; + int sy = clamp(iy + h, 0, this->row1); + T const* S = ptr(this->src, sy); + for (int w = this->xstart; w < this->xend; ++w) + { + int sx = clamp(ix + w, 0, this->col1) * cn; + W t = this->coeff.at((w - rx) * this->xscale); + wline += t; + sline += t * S[sx]; + } + W u = this->coeff.at((h - ry) * this->yscale); + weight += u * wline; + sumval += u * sline; + } + D[0] = saturate_cast(sumval / weight); + for (int i = 1; i < cn; ++i) + { + sumval = 0; + for (int h = this->ystart; h < this->yend; ++h) + { + W sline = 0; + int sy = clamp(iy + h, 0, this->row1); + T const* S = ptr(this->src, sy) + i; + for (int w = this->xstart; w < this->xend; ++w) + { + int sx = clamp(ix + w, 0, this->col1) * cn; + W t = this->coeff.at((w - rx) * this->xscale); + sline += t * S[sx]; + } + W u = this->coeff.at((h - ry) * this->yscale); + sumval += u * sline; + } + D[i] = saturate_cast(sumval / weight); + } + } + }; + + template + struct AntiVecExOut : public AntiBase + { + using AntiBase::AntiBase; + using T = typename TypeVec::vec_type; + using W = typename TypeVec::vec_type; + + __device__ void to(float fx, float fy, int dx, int dy) const + { + int ix = __float2int_rd(fx), iy = __float2int_rd(fy); + float rx = fx - ix, ry = fy - iy; + W1 weight = 0; + W sumval = VecTraits::all(0); + for (int h = this->ystart; h < this->yend; ++h) + { + int sy = iy + h; + if (static_cast(sy) > static_cast(this->row1)) + continue; + W1 wline = 0; + W sline = VecTraits::all(0); + T const* S = ptr(this->src, sy); + for (int w = this->xstart; w < this->xend; ++w) + { + int sx = ix + w; + if (static_cast(sx) > static_cast(this->col1)) + continue; + W1 t = this->coeff.at((w - rx) * this->xscale); + wline += t; + sline += t * saturate_cast(S[sx]); + } + W1 u = this->coeff.at((h - ry) * this->yscale); + weight += u * wline; + sumval += u * sline; + } + at(this->dst, dy, dx) = saturate_cast(sumval / weight); + } + }; + + template + struct AntiCnExOut : public AntiBase + { + int cn; + + AntiCnExOut(PtrStepSzb const& S, PtrStepSzb const& D, + Point2f const& scale, float A, int _cn) + : AntiBase(S, D, scale, A), cn(_cn) + {} + + __device__ void to(float fx, float fy, int dx, int dy) const + { + int ix = __float2int_rd(fx), iy = __float2int_rd(fy); + float rx = fx - ix, ry = fy - iy; + W weight = 0, sumval = 0; + T* D = ptr(this->dst, dy) + dx * cn; + for (int h = this->ystart; h < this->yend; ++h) + { + int sy = iy + h; + if (static_cast(sy) > static_cast(this->row1)) + continue; + W wline = 0, sline = 0; + T const* S = ptr(this->src, sy); + for (int w = this->xstart; w < this->xend; ++w) + { + int sx = ix + w; + if (static_cast(sx) > static_cast(this->col1)) + continue; + sx = sx * cn; + W t = this->coeff.at((w - rx) * this->xscale); + wline += t; + sline += t * S[sx]; + } + W u = this->coeff.at((h - ry) * this->yscale); + weight += u * wline; + sumval += u * sline; + } + D[0] = saturate_cast(sumval / weight); + for (int i = 1; i < cn; ++i) + { + sumval = 0; + for (int h = this->ystart; h < this->yend; ++h) + { + int sy = iy + h; + if (static_cast(sy) > static_cast(this->row1)) + continue; + W sline = 0; + T const* S = ptr(this->src, sy) + i; + for (int w = this->xstart; w < this->xend; ++w) + { + int sx = ix + w; + if (static_cast(sx) > static_cast(this->col1)) + continue; + sx = sx * cn; + W t = this->coeff.at((w - rx) * this->xscale); + sline += t * S[sx]; + } + W u = this->coeff.at((h - ry) * this->yscale); + sumval += u * sline; + } + D[i] = saturate_cast(sumval / weight); + } + } + }; + + ////////// bi-linear ////////// + + template + struct LinearVec : public SamplerBase + { + using SamplerBase::SamplerBase; + using T = typename TypeVec::vec_type; + using W = typename TypeVec::vec_type; + + __device__ void to(float fx, float fy, int dx, int dy) const + { + int ix = __float2int_rd(fx), iy = __float2int_rd(fy); + float u1 = fx - ix, v1 = fy - iy; + float u0 = 1.f - u1, v0 = 1.f - v1; + int x0 = ::max(ix, 0); + int y0 = ::max(iy, 0); + int x1 = ::min(ix + 1, col1); + int y1 = ::min(iy + 1, row1); + W s0 = saturate_cast(at(src, y0, x0)); + W s1 = saturate_cast(at(src, y0, x1)); + W s2 = saturate_cast(at(src, y1, x0)); + W s3 = saturate_cast(at(src, y1, x1)); + W val = (u0 * v0) * s0 + (u1 * v0) * s1 + (u0 * v1) * s2 + (u1 * v1) * s3; + at(dst, dy, dx) = saturate_cast(val); + } + }; + + template + struct LinearCn : public SamplerBase + { + int cn; + + LinearCn(PtrStepSzb const& S, PtrStepSzb const& D, int _cn) + : SamplerBase(S, D), cn(_cn) + {} + + __device__ void to(float fx, float fy, int dx, int dy) const + { + int ix = __float2int_rd(fx), iy = __float2int_rd(fy); + float u1 = fx - ix, v1 = fy - iy; + float u0 = 1.f - u1, v0 = 1.f - v1; + int x0 = ::max(ix, 0); + int y0 = ::max(iy, 0); + int x1 = ::min(ix + 1, col1); + int y1 = ::min(iy + 1, row1); + W coeff[4] = {u0 * v0, u1 * v0, u0 * v1, u1 * v1}; + T const* S0 = ptr(src, y0) + x0 * cn; + T const* S1 = ptr(src, y0) + x1 * cn; + T const* S2 = ptr(src, y1) + x0 * cn; + T const* S3 = ptr(src, y1) + x1 * cn; + T * D = ptr(dst, dy) + dx * cn; + for (int i = 0; i < cn; ++i) + { + D[i] = saturate_cast(coeff[0] * S0[i] + + coeff[1] * S1[i] + coeff[2] * S2[i] + coeff[3] * S3[i]); + } + } + }; + + template + using LinearAntiVec = AntiVec; + + template + using LinearAntiCn = AntiCn; + + template + using LinearAntiVecExOut = AntiVecExOut; + + template + using LinearAntiCnExOut = AntiCnExOut; + + ////////// bi-cubic ////////// + + template + struct CubicVec : public SamplerBase + { + CubicCoeff cubic; + using T = typename TypeVec::vec_type; + using W = typename TypeVec::vec_type; + + CubicVec(PtrStepSzb const& S, PtrStepSzb const& D, float A) + : SamplerBase(S, D), cubic(A) + {} + + __device__ void to(float fx, float fy, int dx, int dy) const + { + int xstart = __float2int_rd(fx) - 1; + int ystart = __float2int_rd(fy) - 1; + int xoffset[4]; + W1 xcoeff[4]; + for (int x = 0; x < 4; ++x, ++xstart) + { + xoffset[x] = clamp(xstart, 0, col1); + xcoeff [x] = cubic.at(xstart - fx); + } + W sumval = VecTraits::all(0); + for (int y = 0; y < 4; ++y, ++ystart) + { + int yoffest = clamp(ystart, 0, row1); + T const* S = ptr(src, yoffest); + W sline = VecTraits::all(0); + for (int x = 0; x < 4; ++x) + sline += xcoeff[x] * saturate_cast(S[xoffset[x]]); + sumval += sline * cubic.at(ystart - fy); + } + at(dst, dy, dx) = saturate_cast(sumval); + } + }; + + template + struct CubicCn : public SamplerBase + { + CubicCoeff cubic; + int cn; + + CubicCn(PtrStepSzb const& S, PtrStepSzb const& D, float A, int _cn) + : SamplerBase(S, D), cubic(A), cn(_cn) + {} + + __device__ void to(float fx, float fy, int dx, int dy) const + { + int xstart = __float2int_rd(fx) - 1; + int ystart = __float2int_rd(fy) - 1; + int xoffset[4], yoffset[4]; + W xcoeff[4], ycoeff[4]; + for (int x = 0; x < 4; ++x, ++xstart) + { + xoffset[x] = clamp(xstart, 0, col1) * cn; + xcoeff [x] = cubic.at(xstart - fx); + } + for (int y = 0; y < 4; ++y, ++ystart) + { + yoffset[y] = clamp(ystart, 0, row1); + ycoeff [y] = cubic.at(ystart - fy); + } + T* D = ptr(dst, dy) + dx * cn; + for (int i = 0; i < cn; ++i) + { + W sumval = 0; + for (int y = 0; y < 4; ++y) + { + T const* S = ptr(src, yoffset[y]) + i; + W sline = 0; + for (int x = 0; x < 4; ++x) + sline += xcoeff[x] * S[xoffset[x]]; + sumval += sline * ycoeff[y]; + } + D[i] = saturate_cast(sumval); + } + } + }; + + template + struct CubicVecExOut : public SamplerBase + { + CubicCoeff cubic; + using T = typename TypeVec::vec_type; + using W = typename TypeVec::vec_type; + + CubicVecExOut(PtrStepSzb const& S, PtrStepSzb const& D, float A) + : SamplerBase(S, D), cubic(A) + {} + + __device__ void to(float fx, float fy, int dx, int dy) const + { + int xstart = __float2int_rd(fx) - 1; + int ystart = __float2int_rd(fy) - 1; + int xoffset[4]; + W1 xcoeff[4], xcoeffsum = 0, ycoeffsum = 0; + for (int x = 0; x < 4; ++x, ++xstart) + { + xoffset[x] = clamp(xstart, 0, col1); + xcoeff [x] = cubic.at(xstart - fx); + if (static_cast(xstart) > static_cast(col1)) + xcoeff[x] = 0; + xcoeffsum += xcoeff[x]; + } + W sumval = VecTraits::all(0); + for (int y = 0; y < 4; ++y, ++ystart) + { + if (static_cast(ystart) > static_cast(row1)) + continue; + int yoffest = ystart; + T const* S = ptr(src, yoffest); + W sline = VecTraits::all(0); + for (int x = 0; x < 4; ++x) + sline += xcoeff[x] * saturate_cast(S[xoffset[x]]); + W1 u = cubic.at(ystart - fy); + ycoeffsum += u; + sumval += sline * u; + } + at(dst, dy, dx) = saturate_cast(sumval / (ycoeffsum * xcoeffsum)); + } + }; + + template + struct CubicCnExOut : public SamplerBase + { + CubicCoeff cubic; + int cn; + + CubicCnExOut(PtrStepSzb const& S, PtrStepSzb const& D, float A, int _cn) + : SamplerBase(S, D), cubic(A), cn(_cn) + {} + + __device__ void to(float fx, float fy, int dx, int dy) const + { + int xstart = __float2int_rd(fx) - 1; + int ystart = __float2int_rd(fy) - 1; + int xoffset[4], yoffset[4]; + W xcoeff[4], ycoeff[4], xcoeffsum = 0, ycoeffsum = 0; + for (int x = 0; x < 4; ++x, ++xstart) + { + xoffset[x] = clamp(xstart, 0, col1) * cn; + xcoeff [x] = cubic.at(xstart - fx); + if (static_cast(xstart) > static_cast(col1)) + xcoeff[x] = 0; + xcoeffsum += xcoeff[x]; + } + for (int y = 0; y < 4; ++y, ++ystart) + { + yoffset[y] = clamp(ystart, 0, row1); + ycoeff [y] = cubic.at(ystart - fy); + if (static_cast(ystart) > static_cast(row1)) + ycoeff[y] = 0; + ycoeffsum += ycoeff[y]; + } + W weight = xcoeffsum * ycoeffsum; + T* D = ptr(dst, dy) + dx * cn; + for (int i = 0; i < cn; ++i) + { + W sumval = 0; + for (int y = 0; y < 4; ++y) + { + T const* S = ptr(src, yoffset[y]) + i; + W sline = 0; + for (int x = 0; x < 4; ++x) + sline += xcoeff[x] * S[xoffset[x]]; + sumval += sline * ycoeff[y]; + } + D[i] = saturate_cast(sumval / weight); + } + } + }; + + template + using CubicAntiVec = AntiVec; + + template + using CubicAntiCn = AntiCn; + + template + using CubicAntiVecExOut = AntiVecExOut; + + template + using CubicAntiCnExOut = AntiCnExOut; + + ////////// generic ////////// + + template + __global__ void sampleKernel(Matx22f const M, Sampler const sampler) + { + int dx = blockDim.x * blockIdx.x + threadIdx.x; + int dy = blockDim.y * blockIdx.y + threadIdx.y; + if (dx < sampler.dst.cols && dy < sampler.dst.rows) + { + float fx = ::fmaf(static_cast(dx), M.val[0], M.val[1]); + float fy = ::fmaf(static_cast(dy), M.val[2], M.val[3]); + sampler.to(fx, fy, dx, dy); + } + } + + //==================== nearest neighbor ====================// + + struct RoundUp + { + __device__ __forceinline__ int operator()(float x) const + { return __float2int_ru(x); } + }; + + struct RoundDown + { + __device__ __forceinline__ int operator()(float x) const + { return __float2int_rd(x); } + }; + + template + __global__ void nnBySampler( + RoundOp const R, Sampler const sampler, Matx22f const M, float const offset) + { + int dx = blockDim.x * blockIdx.x + threadIdx.x; + int dy = blockDim.y * blockIdx.y + threadIdx.y; + if (dx < sampler.dst.cols && dy < sampler.dst.rows) + { + int sx = R(::fmaf(static_cast(dx), M.val[0], M.val[1]) + offset); + int sy = R(::fmaf(static_cast(dy), M.val[2], M.val[3]) + offset); + sampler.to(sx, sy, dx, dy); + } + } + + template + void nnByRound(size_t esz, PtrStepSzb const& src, PtrStepSzb dst, + Matx22f const& M, float offset, cudaStream_t stream) + { + RoundOp R; + dim3 block(32, 8); + dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + if (esz == 1) + nnBySampler<<>>(R, NearestVec(src, dst), M, offset); + else if (esz == 2) + nnBySampler<<>>(R, NearestVec(src, dst), M, offset); + else if (esz == 3) + nnBySampler<<>>(R, NearestVec(src, dst), M, offset); + else if (esz == 4) + nnBySampler<<>>(R, NearestVec(src, dst), M, offset); + else if (esz == 6) + nnBySampler<<>>(R, NearestVec(src, dst), M, offset); + else if (esz == 8) + nnBySampler<<>>(R, NearestVec(src, dst), M, offset); + else if (esz == 12) + nnBySampler<<>>(R, NearestVec(src, dst), M, offset); + else if (esz == 16) + nnBySampler<<>>(R, NearestVec(src, dst), M, offset); + else + nnBySampler<<>>(R, NearestSize(src, dst, esz), M, offset); + } + + void resizeOnnxNN(size_t elemSize, PtrStepSzb const& src, PtrStepSzb const& dst, + Matx22f const& M, int mode, cudaStream_t stream) + { + float offset = 0.f; + if (mode == INTER_NEAREST_PREFER_FLOOR) + offset = -0.5f; + if (mode == INTER_NEAREST_PREFER_CEIL) + offset = +0.5f; + + if (mode == INTER_NEAREST_PREFER_FLOOR || mode == INTER_NEAREST_CEIL) + nnByRound(elemSize, src, dst, M, offset, stream); + else + nnByRound(elemSize, src, dst, M, offset, stream); + if (!stream) + cudaSafeCall(cudaDeviceSynchronize()); + } + + //==================== linear ====================// + + template + void linear(int cn, PtrStepSzb const& src, PtrStepSzb const& dst, + Matx22f const& M, cudaStream_t stream) + { + dim3 block(32, 8); + dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + if (cn == 1) + sampleKernel<<>>(M, LinearVec(src, dst)); + else if (cn == 2) + sampleKernel<<>>(M, LinearVec(src, dst)); + else if (cn == 3) + sampleKernel<<>>(M, LinearVec(src, dst)); + else if (cn == 4) + sampleKernel<<>>(M, LinearVec(src, dst)); + else + sampleKernel<<>>(M, LinearCn(src, dst, cn)); + } + + template + void linearAnti(int cn, PtrStepSzb const& src, PtrStepSzb const& dst, + Matx22f const& M, Point2f const& scale, cudaStream_t stream) + { + dim3 block(32, 8); + dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + if (cn == 1) + sampleKernel<<>>(M, LinearAntiVec(src, dst, scale, 0)); + else if (cn == 2) + sampleKernel<<>>(M, LinearAntiVec(src, dst, scale, 0)); + else if (cn == 3) + sampleKernel<<>>(M, LinearAntiVec(src, dst, scale, 0)); + else if (cn == 4) + sampleKernel<<>>(M, LinearAntiVec(src, dst, scale, 0)); + else + sampleKernel<<>>(M, LinearAntiCn(src, dst, scale, 0, cn)); + } + + template + void linearAntiExOut(int cn, PtrStepSzb const& src, PtrStepSzb const& dst, + Matx22f const& M, Point2f const& scale, cudaStream_t stream) + { + dim3 block(32, 8); + dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + if (cn == 1) + sampleKernel<<>>(M, LinearAntiVecExOut(src, dst, scale, 0)); + else if (cn == 2) + sampleKernel<<>>(M, LinearAntiVecExOut(src, dst, scale, 0)); + else if (cn == 3) + sampleKernel<<>>(M, LinearAntiVecExOut(src, dst, scale, 0)); + else if (cn == 4) + sampleKernel<<>>(M, LinearAntiVecExOut(src, dst, scale, 0)); + else + sampleKernel<<>>(M, LinearAntiCnExOut(src, dst, scale, 0, cn)); + } + + //==================== cubic ====================// + + template + void cubic(int cn, float A, PtrStepSzb const& src, + PtrStepSzb const& dst, Matx22f const& M, cudaStream_t stream) + { + dim3 block(32, 8); + dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + if (cn == 1) + sampleKernel<<>>(M, CubicVec(src, dst, A)); + else if (cn == 2) + sampleKernel<<>>(M, CubicVec(src, dst, A)); + else if (cn == 3) + sampleKernel<<>>(M, CubicVec(src, dst, A)); + else if (cn == 4) + sampleKernel<<>>(M, CubicVec(src, dst, A)); + else + sampleKernel<<>>(M, CubicCn(src, dst, A, cn)); + } + + template + void cubicExOut(int cn, float A, PtrStepSzb const& src, + PtrStepSzb const& dst, Matx22f const& M, cudaStream_t stream) + { + dim3 block(32, 8); + dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + if (cn == 1) + sampleKernel<<>>(M, CubicVecExOut(src, dst, A)); + else if (cn == 2) + sampleKernel<<>>(M, CubicVecExOut(src, dst, A)); + else if (cn == 3) + sampleKernel<<>>(M, CubicVecExOut(src, dst, A)); + else if (cn == 4) + sampleKernel<<>>(M, CubicVecExOut(src, dst, A)); + else + sampleKernel<<>>(M, CubicCnExOut(src, dst, A, cn)); + } + + template + void cubicAnti(int cn, float A, PtrStepSzb const& src, PtrStepSzb const& dst, + Matx22f const& M, Point2f const& scale, cudaStream_t stream) + { + dim3 block(32, 8); + dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + if (cn == 1) + sampleKernel<<>>(M, CubicAntiVec(src, dst, scale, A)); + else if (cn == 2) + sampleKernel<<>>(M, CubicAntiVec(src, dst, scale, A)); + else if (cn == 3) + sampleKernel<<>>(M, CubicAntiVec(src, dst, scale, A)); + else if (cn == 4) + sampleKernel<<>>(M, CubicAntiVec(src, dst, scale, A)); + else + sampleKernel<<>>(M, CubicAntiCn(src, dst, scale, A, cn)); + } + + template + void cubicAntiExOut(int cn, float A, PtrStepSzb const& src, PtrStepSzb const& dst, + Matx22f const& M, Point2f const& scale, cudaStream_t stream) + { + dim3 block(32, 8); + dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + if (cn == 1) + sampleKernel<<>>(M, CubicAntiVecExOut(src, dst, scale, A)); + else if (cn == 2) + sampleKernel<<>>(M, CubicAntiVecExOut(src, dst, scale, A)); + else if (cn == 3) + sampleKernel<<>>(M, CubicAntiVecExOut(src, dst, scale, A)); + else if (cn == 4) + sampleKernel<<>>(M, CubicAntiVecExOut(src, dst, scale, A)); + else + sampleKernel<<>>(M, CubicAntiCnExOut(src, dst, scale, A, cn)); + } + +template +void resizeOnnx(int cn, float A, PtrStepSzb const& src, PtrStepSzb const& dst, + Matx22f const& M, Point2f const& scale, int interpolation, cudaStream_t stream) +{ + int sampler = interpolation & INTER_SAMPLER_MASK; + int antialias = interpolation & INTER_ANTIALIAS_MASK; + int exclude_outside = interpolation & INTER_EXCLUDE_OUTSIDE_MASK; + if (exclude_outside) + { + if (sampler == INTER_LINEAR && !antialias) + linear(cn, src, dst, M, stream); + else if (sampler == INTER_LINEAR && antialias) + linearAntiExOut(cn, src, dst, M, scale, stream); + else if (sampler == INTER_CUBIC && !antialias) + cubicExOut(cn, A, src, dst, M, stream); + else if (sampler == INTER_CUBIC && antialias) + cubicAntiExOut(cn, A, src, dst, M, scale, stream); + else + CV_Error(cv::Error::StsBadArg, "unsupported interpolation"); + } + else + { + if (sampler == INTER_LINEAR && !antialias) + linear(cn, src, dst, M, stream); + else if (sampler == INTER_LINEAR && antialias) + linearAnti(cn, src, dst, M, scale, stream); + else if (sampler == INTER_CUBIC && !antialias) + cubic(cn, A, src, dst, M, stream); + else if (sampler == INTER_CUBIC && antialias) + cubicAnti(cn, A, src, dst, M, scale, stream); + else + CV_Error(cv::Error::StsBadArg, "unsupported interpolation"); + } + + if (!stream) + cudaSafeCall(cudaDeviceSynchronize()); +} + +template void resizeOnnx(int cn, float A, + PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M, + Point2f const& scale, int interpolation, cudaStream_t stream); + +template void resizeOnnx(int cn, float A, + PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M, + Point2f const& scale, int interpolation, cudaStream_t stream); + +template void resizeOnnx(int cn, float A, + PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M, + Point2f const& scale, int interpolation, cudaStream_t stream); + +template void resizeOnnx(int cn, float A, + PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M, + Point2f const& scale, int interpolation, cudaStream_t stream); + +template void resizeOnnx(int cn, float A, + PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M, + Point2f const& scale, int interpolation, cudaStream_t stream); + +template void resizeOnnx(int cn, float A, + PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M, + Point2f const& scale, int interpolation, cudaStream_t stream); + +template void resizeOnnx(int cn, float A, + PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M, + Point2f const& scale, int interpolation, cudaStream_t stream); + +/*template void resizeOnnx<__half, float>(int cn, float A, + PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M, + Point2f const& scale, int interpolation, cudaStream_t stream);*/ +}}} + +#endif /* CUDA_DISABLER */ diff --git a/modules/cudawarping/src/resize.cpp b/modules/cudawarping/src/resize.cpp index 9943a6cdc6a..7deeafddf0c 100644 --- a/modules/cudawarping/src/resize.cpp +++ b/modules/cudawarping/src/resize.cpp @@ -44,16 +44,65 @@ #if !defined HAVE_CUDA || defined(CUDA_DISABLER) -void cv::cuda::resize(InputArray, OutputArray, Size, double, double, int, Stream&) { throw_no_cuda(); } +void cv::cuda::resize(InputArray /*src*/, OutputArray /*dst*/, Size /*dsize*/, + double /*fx*/, double /*fy*/, int /*interpolation*/, Stream& /*stream*/) +{ throw_no_cuda(); } + +void cv::cuda::resizeOnnx(InputArray /*src*/, OutputArray /*dst*/, Size /*dsize*/, + Point2d /*scale*/, int /*interpolation*/, float /*cubicCoeff*/, Stream& /*stream*/) +{ throw_no_cuda(); } #else // HAVE_CUDA namespace cv { namespace cuda { namespace device { - template - void resize(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream); +template +void resize(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, + const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream); + +template +void resizeOnnx(int cn, float A, PtrStepSzb const& src, PtrStepSzb const& dst, + Matx22f const& M, Point2f const& scale, int interpolation, cudaStream_t stream); + +void resizeOnnxNN(size_t elemSize, PtrStepSzb const& src, PtrStepSzb const& dst, + Matx22f const& M, int mode, cudaStream_t stream); }}} +namespace cv +{ +static Vec2f interCoordinate(int coordinate, int dst, int src, double scale) +{ + float a, b; + if (coordinate == INTER_HALF_PIXEL + || coordinate == INTER_HALF_PIXEL_SYMMETRIC + || coordinate == INTER_HALF_PIXEL_PYTORCH) + { + a = static_cast(1.0 / scale); + b = static_cast(0.5 / scale - 0.5); + if (coordinate == INTER_HALF_PIXEL_SYMMETRIC) + b += static_cast(0.5 * (src - dst / scale)); + if (coordinate == INTER_HALF_PIXEL_PYTORCH && dst <= 1) + { + a = 0.f; + b = -0.5f; + } + } + else if (coordinate == INTER_ALIGN_CORNERS) + { + a = static_cast((src - 1.0) / (src * scale - 1.0)); + b = 0.f; + } + else if (coordinate == INTER_ASYMMETRIC) + { + a = static_cast(1.0 / scale); + b = 0.f; + } + else + CV_Error(Error::StsBadArg, format("Unknown coordinate transformation mode %d", coordinate)); + return Vec2f(a, b); +} +} + void cv::cuda::resize(InputArray _src, OutputArray _dst, Size dsize, double fx, double fy, int interpolation, Stream& stream) { GpuMat src = _src.getGpuMat(); @@ -105,4 +154,99 @@ void cv::cuda::resize(InputArray _src, OutputArray _dst, Size dsize, double fx, func(src, wholeSrc, ofs.y, ofs.x, dst, static_cast(1.0 / fy), static_cast(1.0 / fx), interpolation, StreamAccessor::getStream(stream)); } + +void cv::cuda::resizeOnnx(InputArray _src, OutputArray _dst, + Size dsize, Point2d scale, int interpolation, float cubicCoeff, Stream& stream) +{ + GpuMat src = _src.getGpuMat(); + Size ssize = _src.size(); + CV_CheckEQ(_src.dims(), 2, "only 2 dim image is support now"); + CV_CheckFalse(ssize.empty(), "src size must not be empty"); + if (dsize.empty()) + { + CV_CheckGT(scale.x, 0.0, "scale must > 0 if no dsize given"); + CV_CheckGT(scale.y, 0.0, "scale must > 0 if no dsize given"); + dsize.width = static_cast(scale.x * ssize.width); + dsize.height = static_cast(scale.y * ssize.height); + } + if (scale.x == 0 || scale.y == 0) + { + scale.x = static_cast(dsize.width) / ssize.width; + scale.y = static_cast(dsize.height) / ssize.height; + } + CV_CheckFalse(dsize.empty(), "dst size must not empty"); + CV_CheckGT(scale.x, 0.0, "require computed or given scale > 0"); + CV_CheckGT(scale.y, 0.0, "require computed or given scale > 0"); + + int sampler = interpolation & INTER_SAMPLER_MASK; + int nearest = interpolation & INTER_NEAREST_MODE_MASK; + int coordinate = interpolation & INTER_COORDINATE_MASK; + CV_Assert( + sampler == INTER_NEAREST || + sampler == INTER_LINEAR || + sampler == INTER_CUBIC); + CV_Assert( + nearest == INTER_NEAREST_PREFER_FLOOR || + nearest == INTER_NEAREST_PREFER_CEIL || + nearest == INTER_NEAREST_FLOOR || + nearest == INTER_NEAREST_CEIL); + CV_Assert( + coordinate == INTER_HALF_PIXEL || + coordinate == INTER_HALF_PIXEL_PYTORCH || + coordinate == INTER_HALF_PIXEL_SYMMETRIC || + coordinate == INTER_ALIGN_CORNERS || + coordinate == INTER_ASYMMETRIC); + + _dst.create(dsize, _src.type()); + GpuMat dst = _dst.getGpuMat(); + if (dsize == ssize) + { + src.copyTo(dst, stream); + return; + } + if (scale.x >= 1.0 && scale.y >= 1.0) + interpolation &= ~INTER_ANTIALIAS_MASK; + + Point2f scalef = static_cast(scale); + Matx22f M; + Vec2f xcoef = interCoordinate(coordinate, dsize.width, ssize.width, scale.x); + Vec2f ycoef = interCoordinate(coordinate, dsize.height, ssize.height, scale.y); + M(0, 0) = xcoef[0]; + M(0, 1) = xcoef[1]; + M(1, 0) = ycoef[0]; + M(1, 1) = ycoef[1]; + + if (sampler == INTER_NEAREST) + { + device::resizeOnnxNN(src.elemSize(), + src, dst, M, nearest, StreamAccessor::getStream(stream)); + return; + } + + int depth = src.depth(), cn = src.channels(); + CV_CheckDepth(depth, depth <= CV_64F, + "only support float in cuda kernel when not use nearest sampler"); + + using Func = void(*)(int cn, float A, + PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M, + Point2f const& scale, int interpolation, cudaStream_t stream); + static Func const funcs[CV_DEPTH_MAX] = + { + device::resizeOnnx, + device::resizeOnnx, + device::resizeOnnx, + device::resizeOnnx, + device::resizeOnnx, + device::resizeOnnx, + device::resizeOnnx, + /*device::resizeOnnx<__half, float>*/ nullptr, + }; + + Func const func = funcs[depth]; + if (!func) + CV_Error(Error::StsUnsupportedFormat, "Unsupported depth"); + func(cn, cubicCoeff, src, dst, M, scalef, interpolation, + StreamAccessor::getStream(stream)); +} + #endif // HAVE_CUDA diff --git a/modules/cudawarping/test/test_resize.cpp b/modules/cudawarping/test/test_resize.cpp index 768ad09f982..f90a79c310f 100644 --- a/modules/cudawarping/test/test_resize.cpp +++ b/modules/cudawarping/test/test_resize.cpp @@ -260,6 +260,127 @@ INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeTextures, testing::Combine( ALL_DEVICES, testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)))); +PARAM_TEST_CASE(ResizeOnnx, cv::cuda::DeviceInfo, MatType, double, double, int, UseRoi) +{ + cv::cuda::DeviceInfo devInfo; + int depth, interpolation; + double fx, fy; + bool useRoi; + + Rect src_loc, dst_loc; + Mat src, dst, src_roi, dst_roi; + GpuMat gsrc, gdst, gsrc_roi, gdst_roi; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + depth = GET_PARAM(1); + fx = GET_PARAM(2); + fy = GET_PARAM(3); + interpolation = GET_PARAM(4); + useRoi = GET_PARAM(5); + cv::cuda::setDevice(devInfo.deviceID()); + } + + void random_submat(int type, + Size& size, Rect& roi, Mat& mat, Mat& sub, GpuMat& gmat, GpuMat& gsub) + { + int border = useRoi ? 65 : 0; + roi.x = randomInt(0, border); + roi.y = randomInt(0, border); + roi.width = size.width; + roi.height = size.height; + size.width += roi.x + randomInt(0, border); + size.height += roi.y + randomInt(0, border); + mat = randomMat(size, type, -127, 127); + gmat.upload(mat); + sub = mat(roi); + gsub = gmat(roi); + } + + void random_roi(int type) + { + Size srcSize, dstSize; + int minSize = min(fx, fy) < 1.0 ? 16 : 1; + while (dstSize.empty()) + { + srcSize = randomSize(minSize, 129); + dstSize.width = cvRound(srcSize.width * fx); + dstSize.height = cvRound(srcSize.height * fy); + } + + random_submat(type, srcSize, src_loc, src, src_roi, gsrc, gsrc_roi); + random_submat(type, dstSize, dst_loc, dst, dst_roi, gdst, gdst_roi); + } +}; + +CUDA_TEST_P(ResizeOnnx, Accuracy) +{ + Mat host, host_roi; + double eps = depth <= CV_32S ? 1 : 5e-2; + + for (int cn = 1; cn <= 6; ++cn) + { + int type = CV_MAKETYPE(depth, cn); + float A = static_cast(randomDouble(-1.0, -0.1)); + random_roi(type); + + cv::resizeOnnx(src_roi, dst_roi, + dst_roi.size(), Point2d(fx, fy), interpolation, A); + cv::cuda::resizeOnnx(gsrc_roi, gdst_roi, + dst_roi.size(), Point2d(fx, fy), interpolation, A); + + gdst.download(host); + host_roi = host(dst_loc); + string info = cv::format( + "fail on type %sC%d src %dx%d dst %dx%d src_roi %dx%d dst_roi %dx%d", + depthToString(depth), cn, src.cols, src.rows, dst.cols, dst.rows, + src_roi.cols, src_roi.rows, dst_roi.cols, dst_roi.rows); + EXPECT_MAT_NEAR(dst_roi, host_roi, eps) << info; + } +} + +INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeOnnx, Combine( + ALL_DEVICES, + Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), + Values(0.4, 0.27, 1.6), + Values(0.5, 0.71, 2.7), + Values((int)(INTER_LINEAR), (int)(INTER_CUBIC)), + WHOLE_SUBMAT)); + +INSTANTIATE_TEST_CASE_P(CUDA_Warping_Antialias, ResizeOnnx, Combine( + ALL_DEVICES, + Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), + Values(0.4, 0.27, 1.6), + Values(0.5, 0.71, 2.7), + Values( + (int)(INTER_ANTIALIAS | INTER_LINEAR), + (int)(INTER_ANTIALIAS | INTER_CUBIC)), + WHOLE_SUBMAT)); + +INSTANTIATE_TEST_CASE_P(CUDA_Warping_Nearest, ResizeOnnx, Combine( + ALL_DEVICES, + Values(CV_8S, CV_16S, CV_32F, CV_64F), + Values(0.4, 0.27, 1.6), + Values(0.5, 0.71, 2.7), + Values( + (int)(INTER_NEAREST | INTER_NEAREST_PREFER_FLOOR), + (int)(INTER_NEAREST | INTER_NEAREST_PREFER_CEIL), + (int)(INTER_NEAREST | INTER_NEAREST_CEIL), + (int)(INTER_NEAREST | INTER_NEAREST_FLOOR)), + WHOLE_SUBMAT)); + +INSTANTIATE_TEST_CASE_P(CUDA_Warping_ExcludeOutside, ResizeOnnx, Combine( + ALL_DEVICES, + Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F), + Values(0.4, 0.27, 1.6), + Values(0.5, 0.71, 2.7), + Values( + (int)( INTER_CUBIC | INTER_EXCLUDE_OUTSIDE), + (int)(INTER_ANTIALIAS | INTER_CUBIC | INTER_EXCLUDE_OUTSIDE), + (int)(INTER_ANTIALIAS | INTER_LINEAR | INTER_EXCLUDE_OUTSIDE)), + WHOLE_SUBMAT)); }} // namespace + #endif // HAVE_CUDA