3rdparty: NDSRVP - Part 1.5: New Interfaces #25786

Junyan721113 · 2024-06-19T11:19:57Z

Summary

Previous context

From PR #25167:

Looks like warpAffine/warpPerspective are a good place to add a new smaller HAL function which would do the same processing as opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41, opt_AVX2::warpAffineBlockline and other similar functions, e.g. here:

opencv/modules/imgproc/src/imgwarp.cpp

Lines 2304 to 2334 in 78ed6de

x1 = 0;

#if CV_TRY_SSE4_1

if( useSSE4_1 )

opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta + x, bdelta + x, xy, X0, Y0, bw);

else

#endif

{

#if CV_SIMD128

{

v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0);

int span = VTraits<v_uint16x8>::vlanes();

for( ; x1 <= bw - span; x1 += span )

{

v_int16x8 v_dst[2];

#define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset))),\

v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset + 4))))

v_dst[0] = CV_CONVERT_MAP(adelta, x+x1, v_X0);

v_dst[1] = CV_CONVERT_MAP(bdelta, x+x1, v_Y0);

#undef CV_CONVERT_MAP

v_store_interleave(xy + (x1 << 1), v_dst[0], v_dst[1]);

}

}

#endif

for( ; x1 < bw; x1++ )

{

int X = (X0 + adelta[x+x1]) >> AB_BITS;

int Y = (Y0 + bdelta[x+x1]) >> AB_BITS;

xy[x1*2] = saturate_cast<short>(X);

xy[x1*2+1] = saturate_cast<short>(Y);

}

}

I agree. If smaller HAL functions were added, the code could be better modified in the spirit of HAL design.

Perhaps it would be better to move this include to cpp files in order to limit public HAL interface to hal/interface.h. Is it possible?

Refactoring and some cleanup (warpAffine/warpPerspective/headers) can be postponed to the next PR.

Part 1.5: New Interfaces (Ready for PR)

Core
ImgProc
Add smaller WarpAffine & WarpPerspective HAL interface
Shrink cv::ndsrvp::warpAffine & cv::ndsrvp::warpPerspective into ...Blockline & ...BlocklineNN
Implement cv::ndsrvp::remap via new cv_hal_remap32f interface

What's noticing is that the remap function called by warpAffine and warpPerspective does not use HAL interface cv_hal_remap32f.

Performance tests

Remap

Geometric mean (ms)

Name of Test	opencv perf imgproc Remap 8U	opencv perf imgproc Remap 8U	opencv perf imgproc Remap 8U vs opencv perf imgproc Remap 8U (x-factor)
Remap::OCL_RemapFixture::(1280x720, 8UC1, INTER_NEAREST)	59.808	58.873	1.02
Remap::OCL_RemapFixture::(1280x720, 8UC3, INTER_NEAREST)	65.620	62.347	1.05
Remap::OCL_RemapFixture::(1280x720, 8UC4, INTER_NEAREST)	67.534	58.076	1.16
Remap::OCL_RemapFixture::(1920x1080, 8UC1, INTER_NEAREST)	138.648	130.950	1.06
Remap::OCL_RemapFixture::(1920x1080, 8UC3, INTER_NEAREST)	153.022	137.940	1.11
Remap::OCL_RemapFixture::(1920x1080, 8UC4, INTER_NEAREST)	155.732	130.616	1.19

Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

I agree to contribute to the project under Apache 2 License.
To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
The PR is proposed to the proper branch
There is a reference to the original bug report and related work
There is accuracy test, performance test and test data in opencv_extra repository, if applicable
Patch to opencv_extra has the same branch name.
The feature is well documented and sample code can be built with the project CMake

Junyan721113 · 2024-07-08T10:20:19Z

@mshabunin

3rdparty/ndsrvp/src/integral.cpp

mshabunin · 2024-07-09T11:26:58Z

modules/imgproc/src/imgwarp.cpp

@@ -2307,6 +2307,7 @@ class WarpAffineInvoker :
                            opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta + x, bdelta + x, xy, X0, Y0, bw);
                        else
                        #endif
+                        if( cv_hal_warpAffineBlocklineNN(adelta + x, bdelta + x, xy, X0, Y0, bw) != CV_HAL_ERROR_OK )


Please extract the whole block to some cv::hal:: function. Define it here: https://github.com/opencv/opencv/blob/4.x/modules/imgproc/include/opencv2/imgproc/hal/hal.hpp Implementation can reside somewhere in this file (imgwarp.cpp).

This new function should first try external HAL function (using CALL_HAL macro), then try AVX, SSE, LASX, universal intrinsics, then fallback implementation.

Then cv::warpAffine should call this new cv::hal:: function for CPU processing.

There are some functions implemented this way, e.g. cv::hal::normHamming (

opencv/modules/core/src/norm.cpp

Lines 53 to 100 in 8d935e2

int normHamming(const uchar* a, int n, int cellSize)

{

int output;

CALL_HAL_RET(normHamming8u, cv_hal_normHamming8u, output, a, n, cellSize);

if( cellSize == 1 )

return normHamming(a, n);

const uchar* tab = 0;

if( cellSize == 2 )

tab = popCountTable2;

else if( cellSize == 4 )

tab = popCountTable4;

else

return -1;

int i = 0;

int result = 0;

#if (CV_SIMD || CV_SIMD_SCALABLE)

v_uint64 t = vx_setzero_u64();

if ( cellSize == 2)

{

v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x55));

for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())

{

v_uint16 a0 = v_reinterpret_as_u16(vx_load(a + i));

t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(a0, v_shr<1>(a0)), mask))));

}

}

else // cellSize == 4

{

v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x11));

for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())

{

v_uint16 a0 = v_reinterpret_as_u16(vx_load(a + i));

v_uint16 a1 = v_or(a0, v_shr<2>(a0));

t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(a1, v_shr<1>(a1)), mask))));

}

}

result += (int)v_reduce_sum(t);

vx_cleanup();

#elif CV_ENABLE_UNROLLED

for( ; i <= n - 4; i += 4 )

result += tab[a[i]] + tab[a[i+1]] + tab[a[i+2]] + tab[a[i+3]];

#endif

for( ; i < n; i++ )

result += tab[a[i]];

return result;

}

), but it only have HAL, universal intrinsics and fallback parts.

4 new functions have been extracted. Further accuracy checks might be needed for other related platforms(AVX2, LASX, etc.).

asmorkalov · 2024-07-24T12:59:16Z

@Junyan721113 friendly reminder.

asmorkalov · 2024-07-26T06:22:34Z

cc @vpisarev

3rdparty/ndsrvp/include/imgproc.hpp

3rdparty/ndsrvp/include/cvutils.hpp

3rdparty/ndsrvp/src/remap.cpp

modules/imgproc/include/opencv2/imgproc/hal/hal.hpp

asmorkalov · 2024-07-31T13:43:20Z

@fengyuentau Please attention on the changes.

fengyuentau · 2024-07-31T14:08:06Z

Okay, it seems the changes are not modifying the core method regarding warpAffine and warpPerspective. The new kernel is written fully with universal intrinsics (some parts are using neon intrinsics for the best performance). Could this be merged soon? Otherwise it can lead to merge conflicts.

Junyan721113 · 2024-08-03T05:45:24Z

@asmorkalov is there any other change needed to be made?

asmorkalov requested a review from mshabunin June 19, 2024 11:38

asmorkalov added optimization platform: riscv labels Jun 19, 2024

asmorkalov added this to the 4.11.0 milestone Jun 19, 2024

mshabunin reviewed Jul 9, 2024

View reviewed changes

3rdparty/ndsrvp/src/integral.cpp Outdated Show resolved Hide resolved

mshabunin reviewed Jul 9, 2024

View reviewed changes

Junyan721113 force-pushed the rvp_3rdparty branch 2 times, most recently from 22d10fe to f3729de Compare July 29, 2024 08:08

mshabunin reviewed Jul 29, 2024

View reviewed changes

3rdparty/ndsrvp/include/imgproc.hpp Outdated Show resolved Hide resolved

mshabunin reviewed Jul 29, 2024

View reviewed changes

3rdparty/ndsrvp/include/cvutils.hpp Outdated Show resolved Hide resolved

Junyan721113 force-pushed the rvp_3rdparty branch 3 times, most recently from e4d8dd2 to 7a0336d Compare July 30, 2024 13:31

mshabunin approved these changes Jul 30, 2024

View reviewed changes

asmorkalov reviewed Jul 31, 2024

View reviewed changes

3rdparty/ndsrvp/src/remap.cpp Outdated Show resolved Hide resolved

modules/imgproc/include/opencv2/imgproc/hal/hal.hpp Show resolved Hide resolved

This comment was marked as outdated.

Sign in to view

feat: Part 1.5 - New Interfaces

35463e0

Junyan721113 force-pushed the rvp_3rdparty branch from 7a0336d to 35463e0 Compare August 2, 2024 05:49

Junyan721113 requested a review from asmorkalov August 2, 2024 05:57

asmorkalov approved these changes Aug 5, 2024

View reviewed changes

asmorkalov merged commit ecbff5a into opencv:4.x Aug 5, 2024
29 of 30 checks passed

asmorkalov mentioned this pull request Aug 6, 2024

(5.x) Merge 4.x #25998

Merged

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

3rdparty: NDSRVP - Part 1.5: New Interfaces #25786

3rdparty: NDSRVP - Part 1.5: New Interfaces #25786

Uh oh!

Junyan721113 commented Jun 19, 2024 •

edited

Loading

Uh oh!

Junyan721113 commented Jul 8, 2024

Uh oh!

Uh oh!

mshabunin Jul 9, 2024

Uh oh!

Junyan721113 Jul 27, 2024 •

edited

Loading

Uh oh!

asmorkalov commented Jul 24, 2024

Uh oh!

asmorkalov commented Jul 26, 2024

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

asmorkalov commented Jul 31, 2024

Uh oh!

This comment was marked as outdated.

fengyuentau commented Jul 31, 2024

Uh oh!

Junyan721113 commented Aug 3, 2024

Uh oh!

Uh oh!

Uh oh!

	x1 = 0;
	#if CV_TRY_SSE4_1
	if( useSSE4_1 )
	opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta + x, bdelta + x, xy, X0, Y0, bw);
	else
	#endif
	{
	#if CV_SIMD128
	{
	v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0);
	int span = VTraits<v_uint16x8>::vlanes();
	for( ; x1 <= bw - span; x1 += span )
	{
	v_int16x8 v_dst[2];
	#define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset))),\
	v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset + 4))))
	v_dst[0] = CV_CONVERT_MAP(adelta, x+x1, v_X0);
	v_dst[1] = CV_CONVERT_MAP(bdelta, x+x1, v_Y0);
	#undef CV_CONVERT_MAP
	v_store_interleave(xy + (x1 << 1), v_dst[0], v_dst[1]);
	}
	}
	#endif
	for( ; x1 < bw; x1++ )
	{
	int X = (X0 + adelta[x+x1]) >> AB_BITS;
	int Y = (Y0 + bdelta[x+x1]) >> AB_BITS;
	xy[x1*2] = saturate_cast<short>(X);
	xy[x1*2+1] = saturate_cast<short>(Y);
	}
	}

	int normHamming(const uchar* a, int n, int cellSize)
	{
	int output;
	CALL_HAL_RET(normHamming8u, cv_hal_normHamming8u, output, a, n, cellSize);

	if( cellSize == 1 )
	return normHamming(a, n);
	const uchar* tab = 0;
	if( cellSize == 2 )
	tab = popCountTable2;
	else if( cellSize == 4 )
	tab = popCountTable4;
	else
	return -1;
	int i = 0;
	int result = 0;
	#if (CV_SIMD \|\| CV_SIMD_SCALABLE)
	v_uint64 t = vx_setzero_u64();
	if ( cellSize == 2)
	{
	v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x55));
	for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
	{
	v_uint16 a0 = v_reinterpret_as_u16(vx_load(a + i));
	t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(a0, v_shr<1>(a0)), mask))));
	}
	}
	else // cellSize == 4
	{
	v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x11));
	for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
	{
	v_uint16 a0 = v_reinterpret_as_u16(vx_load(a + i));
	v_uint16 a1 = v_or(a0, v_shr<2>(a0));
	t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(a1, v_shr<1>(a1)), mask))));

	}
	}
	result += (int)v_reduce_sum(t);
	vx_cleanup();
	#elif CV_ENABLE_UNROLLED
	for( ; i <= n - 4; i += 4 )
	result += tab[a[i]] + tab[a[i+1]] + tab[a[i+2]] + tab[a[i+3]];
	#endif
	for( ; i < n; i++ )
	result += tab[a[i]];
	return result;
	}

Uh oh!

3rdparty: NDSRVP - Part 1.5: New Interfaces #25786

3rdparty: NDSRVP - Part 1.5: New Interfaces #25786

Uh oh!

Conversation

Junyan721113 commented Jun 19, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Summary

Previous context

Part 1.5: New Interfaces (Ready for PR)

Performance tests

Pull Request Readiness Checklist

Uh oh!

Junyan721113 commented Jul 8, 2024

Uh oh!

Uh oh!

mshabunin Jul 9, 2024

Choose a reason for hiding this comment

Uh oh!

Junyan721113 Jul 27, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

asmorkalov commented Jul 24, 2024

Uh oh!

asmorkalov commented Jul 26, 2024

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

asmorkalov commented Jul 31, 2024

Uh oh!

This comment was marked as outdated.

fengyuentau commented Jul 31, 2024

Uh oh!

Junyan721113 commented Aug 3, 2024

Uh oh!

Uh oh!

Uh oh!

Junyan721113 commented Jun 19, 2024 •

edited

Loading

Junyan721113 Jul 27, 2024 •

edited

Loading