#ifndef __KOSTREAMED_MATH_H
#define __KOSTREAMED_MATH_H

#if XSIMD_VERSION_MAJOR < 10
// ... (compatibility helpers for xsimd 9 elided in this excerpt)
#endif
template<typename _impl, typename result_type>
struct OptiRound {
    ALWAYS_INLINE static result_type roundScalar(const float value)
    {
#ifdef __SSE__
        return _mm_cvtss_si32(_mm_set_ss(value));
#elif XSIMD_WITH_NEON64
        return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vdupq_n_f32(value))), 0);
#elif XSIMD_WITH_NEON
        // Scalar round-to-nearest-even built from 32-bit NEON primitives
        // (same result as the SSE branch above).
        const auto nearbyint_as_int = [](const float v) {
            const auto a = vdupq_n_f32(v);
            const auto signmask = vdupq_n_u32(0x80000000);
            // +/- 0.5, carrying the sign of v
            const auto half = vbslq_f32(signmask, a, vdupq_n_f32(0.5f));
            // conventional rounding: truncate(a +/- 0.5)
            const auto r_normal = vcvtq_s32_f32(vaddq_f32(a, half));
            // truncation towards zero: [a]
            const auto r_trunc = vcvtq_s32_f32(a);
            // 1 if r_trunc is positive, 0 otherwise
            const auto plusone = vreinterpretq_s32_u32(
                vshrq_n_u32(vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31));
            // nearest even integer: ([a] + {0,1}) & ~1
            const auto r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1));
            // fractional part: a - [a]
            const auto delta = vsubq_f32(a, vcvtq_f32_s32(r_trunc));
            // on an exact tie (delta == +/- 0.5), take the even neighbour
            const auto is_delta_half = vceqq_f32(delta, half);
            return vbslq_s32(is_delta_half, r_even, r_normal);
        };
        return vgetq_lane_s32(nearbyint_as_int(value), 0);
#else
        return std::lroundf(value);
#endif
    }
};
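// Illustrative usage (not part of the original header): assuming the default
// round-to-nearest-even FPU state, the SIMD branches above agree on ties:
//
//   OptiRound<xsimd::default_arch, int>::roundScalar(1.4f); // -> 1
//   OptiRound<xsimd::default_arch, int>::roundScalar(1.5f); // -> 2
//   OptiRound<xsimd::default_arch, int>::roundScalar(2.5f); // -> 2 (ties to even)
//
// The std::lroundf() fallback instead rounds halves away from zero (2.5 -> 3).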
#if !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE)
template<typename _impl>
struct OptiDiv {
    using float_v = xsimd::batch<float, _impl>;

    static ALWAYS_INLINE float divScalar(const float &divident, const float &divisor)
    {
#ifdef __SSE__
        float result;

        __m128 x = _mm_set_ss(divisor);
        __m128 y = _mm_set_ss(divident);
        x = _mm_rcp_ss(x); // approximate reciprocal
        x = _mm_mul_ss(x, y);
        _mm_store_ss(&result, x);
        return result;
#elif defined __ARM_NEON
        auto x = vdupq_n_f32(divisor);
        auto y = vdupq_n_f32(divident);
        x = vrecpeq_f32(x); // approximate reciprocal estimate
        x = vmulq_f32(x, y);
        return vgetq_lane_f32(x, 0);
#else
        return (1.f / divisor) * divident;
#endif
    }

    static ALWAYS_INLINE float_v divVector(const float_v &divident, const float_v &divisor)
    {
        return divident * xsimd::reciprocal(divisor);
    }
};
#endif
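// Note (illustrative): _mm_rcp_ss and vrecpeq_f32 produce low-precision
// reciprocal estimates (roughly 12 bits on SSE, 8 bits on NEON), and
// xsimd::reciprocal is their vector counterpart, so these divisions can be
// off in the low bits; e.g. divScalar(255.0f, 1.0f) may return 254.98f
// rather than 255.0f, which typically still rounds back to the correct
// 8-bit channel value.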
template<typename _impl>
struct KoStreamedMath {
    using int_v = xsimd::batch<int, _impl>;
    using uint_v = xsimd::batch<unsigned int, _impl>;
    using float_v = xsimd::batch<float, _impl>;

    static_assert(int_v::size == uint_v::size,
                  "the selected architecture does not guarantee vector size equality!");
    static_assert(uint_v::size == float_v::size,
                  "the selected architecture does not guarantee vector size equality!");
    /**
     * Composes src into dst without using vector instructions.
     */
    template<bool useMask, bool useFlow, class Compositor, int pixelSize>
    static void genericComposite_novector(const KoCompositeOp::ParameterInfo &params)
    {
        const qint32 linearInc = pixelSize;
        qint32 srcLinearInc = params.srcRowStride ? pixelSize : 0;

        quint8 *dstRowStart = params.dstRowStart;
        const quint8 *srcRowStart = params.srcRowStart;
        const quint8 *maskRowStart = params.maskRowStart;

        typename Compositor::ParamsWrapper paramsWrapper(params);

        for (qint32 r = params.rows; r > 0; --r) {
            const quint8 *mask = maskRowStart;
            const quint8 *src = srcRowStart;
            quint8 *dst = dstRowStart;

            int blockRest = params.cols;

            for (int i = 0; i < blockRest; i++) {
                Compositor::template compositeOnePixelScalar<useMask, _impl>(src, dst, mask, params.opacity, paramsWrapper);
                src += srcLinearInc;
                dst += linearInc;
                if (useMask) mask++;
            }

            srcRowStart += params.srcRowStride;
            dstRowStart += params.dstRowStride;
            if (useMask) maskRowStart += params.maskRowStride;
        }
    }
    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite32_novector(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite_novector<useMask, useFlow, Compositor, 4>(params);
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite128_novector(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite_novector<useMask, useFlow, Compositor, 16>(params);
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite64_novector(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite_novector<useMask, useFlow, Compositor, 8>(params);
    }
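    // Naming note: the 32/64/128 suffixes are bits per pixel, i.e. pixelSize
    // of 4, 8 or 16 bytes (four channels of quint8, quint16 or float).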
    /**
     * Get a vector containing the first float_v::size values of the mask.
     * Each mask element is an 8-bit gray value.
     */
    static float_v fetch_mask_8(const quint8 *data)
    {
        return xsimd::batch_cast<float>(xsimd::load_and_extend<int_v>(data));
    }
    /**
     * Get the alpha channel of float_v::size pixels in a 32-bit (8888)
     * layout. Alpha is stored in the most significant byte of each pixel.
     */
    template<bool aligned>
    static float_v fetch_alpha_32(const void *data)
    {
        using U = typename std::conditional<aligned, xsimd::aligned_mode, xsimd::unaligned_mode>::type;
        const auto data_i = uint_v::load(static_cast<const typename uint_v::value_type *>(data), U{});
        return xsimd::to_float(xsimd::bitwise_cast_compat<int>(data_i >> 24));
    }
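    // Example (illustrative): for a packed pixel value 0x80FF40C0,
    // data_i >> 24 leaves 0x80, so that lane becomes 128.0f.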
    /**
     * Get the three color channels of float_v::size pixels in a 32-bit
     * (8888) layout, as floats in the [0, 255] range.
     */
    template<bool aligned>
    static void fetch_colors_32(const void *data, float_v &c1, float_v &c2, float_v &c3)
    {
        using U = typename std::conditional<aligned, xsimd::aligned_mode, xsimd::unaligned_mode>::type;

        const auto data_i = uint_v::load(static_cast<const typename uint_v::value_type *>(data), U{});

        const uint_v mask(0xFF);
        c1 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((data_i >> 16) & mask));
        c2 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((data_i >> 8) & mask));
        c3 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((data_i) & mask));
    }
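    // Example (illustrative): for a packed pixel 0xAABBCCDD, the shifts and
    // masks above yield c1 == 0xBB, c2 == 0xCC, c3 == 0xDD (as floats),
    // while fetch_alpha_32() would return 0xAA.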
    /**
     * Pack rounded alpha and color values back into float_v::size 32-bit
     * pixels. The destination must be vector-aligned.
     */
    static void write_channels_32(void *data, const float_v alpha, const float_v c1, const float_v c2, const float_v c3)
    {
        const int_v mask(0xFF);

        const auto v1 = (xsimd::nearbyint_as_int(alpha)) << 24;
        const auto v2 = (xsimd::nearbyint_as_int(c1) & mask) << 16;
        const auto v3 = (xsimd::nearbyint_as_int(c2) & mask) << 8;
        const auto v4 = (xsimd::nearbyint_as_int(c3) & mask);
        xsimd::store_aligned(static_cast<typename int_v::value_type *>(data), (v1 | v2) | (v3 | v4));
    }

    /**
     * Same as write_channels_32(), but the destination may be unaligned.
     */
    static void write_channels_32_unaligned(void *data, const float_v alpha, const float_v c1, const float_v c2, const float_v c3)
    {
        const int_v mask(0xFF);

        const auto v1 = (xsimd::nearbyint_as_int(alpha)) << 24;
        const auto v2 = (xsimd::nearbyint_as_int(c1) & mask) << 16;
        const auto v3 = (xsimd::nearbyint_as_int(c2) & mask) << 8;
        const auto v4 = (xsimd::nearbyint_as_int(c3) & mask);
        xsimd::store_unaligned(static_cast<typename int_v::value_type *>(data), (v1 | v2) | (v3 | v4));
    }
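    // Example (illustrative): alpha == 128.4f, c1 == 255.0f, c2 == 0.6f,
    // c3 == 10.0f round to 128, 255, 1 and 10, packing to 0x80FF010A.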
    /**
     * Composes src pixels into dst pixels, using aligned and unaligned
     * vector loops where possible. Uses \p Compositor to compose the pixels.
     */
    template<bool useMask, bool useFlow, class Compositor, int pixelSize>
    static void genericComposite(const KoCompositeOp::ParameterInfo &params)
    {
        const int vectorSize = static_cast<int>(float_v::size);
        const qint32 vectorInc = pixelSize * vectorSize;
        const qint32 linearInc = pixelSize;
        qint32 srcVectorInc = vectorInc;
        qint32 srcLinearInc = pixelSize;
        quint8 *dstRowStart = params.dstRowStart;
        const quint8 *maskRowStart = params.maskRowStart;
        const quint8 *srcRowStart = params.srcRowStart;
        typename Compositor::ParamsWrapper paramsWrapper(params);

        // A zero srcRowStride marks a constant source pixel; replicate it
        // into an aligned buffer so the vector loops can load from it.
        if (!params.srcRowStride) {
            if (pixelSize == 4) {
                auto *buf = reinterpret_cast<uint_v *>(xsimd::vector_aligned_malloc<typename uint_v::value_type>(vectorSize));
                *buf = uint_v(*(reinterpret_cast<const quint32 *>(srcRowStart)));
                srcRowStart = reinterpret_cast<quint8 *>(buf);
                srcLinearInc = 0;
                srcVectorInc = 0;
            } else {
                auto *buf = xsimd::vector_aligned_malloc<quint8>(vectorInc);
                quint8 *ptr = buf;

                for (size_t i = 0; i < vectorSize; i++) {
                    memcpy(ptr, params.srcRowStart, pixelSize);
                    ptr += pixelSize;
                }

                srcRowStart = buf;
                srcLinearInc = 0;
                srcVectorInc = 0;
            }
        }
#if BLOCKDEBUG
        int totalBlockAlign = 0;
        int totalBlockAlignedVector = 0;
        int totalBlockUnalignedVector = 0;
        int totalBlockRest = 0;
#endif
        for (qint32 r = params.rows; r > 0; --r) {
            const quint8 *mask = maskRowStart;

            const quint8 *src = srcRowStart;
            quint8 *dst = dstRowStart;

            const int pixelsAlignmentMask = vectorSize * sizeof(float) - 1;
            auto srcPtrValue = reinterpret_cast<uintptr_t>(src);
            auto dstPtrValue = reinterpret_cast<uintptr_t>(dst);
            uintptr_t srcAlignment = srcPtrValue & pixelsAlignmentMask;
            uintptr_t dstAlignment = dstPtrValue & pixelsAlignmentMask;
            int blockAlign = params.cols;
            int blockAlignedVector = 0;
            int blockUnalignedVector = 0;
            int blockRest = 0;

            // Vector iterations go to the aligned loop when src and dst share
            // the same alignment phase (or the source is a replicated pixel).
            int *vectorBlock =
                srcAlignment == dstAlignment || !srcVectorInc ? &blockAlignedVector : &blockUnalignedVector;

            if (!dstAlignment) {
                blockAlign = 0;
                *vectorBlock = params.cols / vectorSize;
                blockRest = params.cols % vectorSize;
            } else if (params.cols > 2 * vectorSize) {
                blockAlign = (vectorInc - dstAlignment) / pixelSize;
                const int restCols = params.cols - blockAlign;
                if (restCols > 0) {
                    *vectorBlock = restCols / vectorSize;
                    blockRest = restCols % vectorSize;
                } else {
                    blockAlign = params.cols;
                    *vectorBlock = 0;
                    blockRest = 0;
                }
            }
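            // Example (illustrative): with vectorSize == 4 and pixelSize == 4
            // (vectorInc == 16), a dst row at alignment offset 8 gets
            // blockAlign == (16 - 8) / 4 == 2 leading scalar pixels; after
            // those, dst is 16-byte aligned for the vector loops.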
#if BLOCKDEBUG
            totalBlockAlign += blockAlign;
            totalBlockAlignedVector += blockAlignedVector;
            totalBlockUnalignedVector += blockUnalignedVector;
            totalBlockRest += blockRest;
#endif
            for (int i = 0; i < blockAlign; i++) {
                Compositor::template compositeOnePixelScalar<useMask, _impl>(src, dst, mask, params.opacity, paramsWrapper);
                src += srcLinearInc;
                dst += linearInc;
                if (useMask) mask++;
            }

            for (int i = 0; i < blockAlignedVector; i++) {
                Compositor::template compositeVector<useMask, true, _impl>(src, dst, mask, params.opacity, paramsWrapper);
                src += srcVectorInc;
                dst += vectorInc;
                if (useMask) mask += vectorSize;
            }

            for (int i = 0; i < blockUnalignedVector; i++) {
                Compositor::template compositeVector<useMask, false, _impl>(src, dst, mask, params.opacity, paramsWrapper);
                src += srcVectorInc;
                dst += vectorInc;
                if (useMask) mask += vectorSize;
            }

            for (int i = 0; i < blockRest; i++) {
                Compositor::template compositeOnePixelScalar<useMask, _impl>(src, dst, mask, params.opacity, paramsWrapper);
                src += srcLinearInc;
                dst += linearInc;
                if (useMask) mask++;
            }

            srcRowStart += params.srcRowStride;
            dstRowStart += params.dstRowStride;
            if (useMask) maskRowStart += params.maskRowStride;
        }
#if BLOCKDEBUG
        qDebug() << "rows:" << params.rows
                 << "\tpad(S):" << totalBlockAlign
                 << "\tbav(V):" << totalBlockAlignedVector
                 << "\tbuv(V):" << totalBlockUnalignedVector
                 << "\tres(S)" << totalBlockRest;
#endif

        if (!params.srcRowStride) {
            xsimd::vector_aligned_free(srcRowStart);
        }
    }
    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite32(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite<useMask, useFlow, Compositor, 4>(params);
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite128(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite<useMask, useFlow, Compositor, 16>(params);
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite64(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite<useMask, useFlow, Compositor, 8>(params);
    }
};
template<typename channels_type, class _impl>
struct PixelStateRecoverHelper {
    using float_v = xsimd::batch<float, _impl>;
    using float_m = typename float_v::batch_bool_type;

    // ... (constructor and recoverPixels() are no-ops for integer channels)
};

// For float channels the original pixel values are kept so they can be
// restored for the lanes selected by the mask.
template<class _impl>
struct PixelStateRecoverHelper<float, _impl> {
    using float_v = xsimd::batch<float, _impl>;
    using float_m = typename float_v::batch_bool_type;

    ALWAYS_INLINE
    PixelStateRecoverHelper(const float_v &c1, const float_v &c2, const float_v &c3)
        : m_orig_c1(c1), m_orig_c2(c2), m_orig_c3(c3)
    {
    }

    ALWAYS_INLINE
    void recoverPixels(const float_m &mask, float_v &c1, float_v &c2, float_v &c3) const
    {
        if (xsimd::any(mask)) {
            c1 = xsimd::select(mask, m_orig_c1, c1);
            c2 = xsimd::select(mask, m_orig_c2, c2);
            c3 = xsimd::select(mask, m_orig_c3, c3);
        }
    }

private:
    const float_v m_orig_c1;
    const float_v m_orig_c2;
    const float_v m_orig_c3;
};
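// Usage sketch (illustrative, not from the original header): a compositor can
// snapshot float destination channels before a lossy operation and later
// restore the lanes that must stay untouched:
//
//   PixelStateRecoverHelper<float, _impl> saved(c1, c2, c3);
//   // ... modify c1, c2, c3 ...
//   saved.recoverPixels(lanesToRestore, c1, c2, c3);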
template<typename channels_type, class _impl>
struct PixelWrapper
{
};

template<typename _impl>
struct PixelWrapper<quint16, _impl> {
    using int_v = xsimd::batch<int, _impl>;
    using uint_v = xsimd::batch<unsigned int, _impl>;
    using float_v = xsimd::batch<float, _impl>;

    static_assert(int_v::size == uint_v::size,
                  "the selected architecture does not guarantee vector size equality!");
    static_assert(uint_v::size == float_v::size,
                  "the selected architecture does not guarantee vector size equality!");

    // ... (lerpMixedUintFloat() and roundFloatToUint() elided in this excerpt)

    static ALWAYS_INLINE void normalizeAlpha(float &alpha)
    {
        const float uint16Rec1 = 1.0f / 65535.0f;
        alpha *= uint16Rec1;
    }

    static ALWAYS_INLINE void denormalizeAlpha(float &alpha)
    {
        const float uint16Max = 65535.0f;
        alpha *= uint16Max;
    }

    PixelWrapper()
        : mask(quint32(0xFFFF))
        , uint16Max(65535.0f)
        , uint16Rec1(1.0f / 65535.0f)
    {
    }
    ALWAYS_INLINE void read(const void *src, float_v &dst_c1, float_v &dst_c2, float_v &dst_c3, float_v &dst_alpha)
    {
#if XSIMD_VERSION_MAJOR < 10
        // ... (xsimd 9 path)
#else
        const auto *srcPtr = static_cast<const typename uint_v::value_type *>(src);

        // Each 32-bit word holds two 16-bit channels: even words carry
        // (c1, c2), odd words carry (c3, alpha).
        const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 2;
        const auto idx2 = idx1 + 1;

        const auto pixelsC1C2 = uint_v::gather(srcPtr, idx1);
        const auto pixelsC3Alpha = uint_v::gather(srcPtr, idx2);
#endif

        dst_c1 = xsimd::to_float(xsimd::bitwise_cast_compat<int>(pixelsC1C2 & mask));
        dst_c2 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((pixelsC1C2 >> 16) & mask));
        dst_c3 = xsimd::to_float(xsimd::bitwise_cast_compat<int>(pixelsC3Alpha & mask));
        dst_alpha = xsimd::to_float(xsimd::bitwise_cast_compat<int>((pixelsC3Alpha >> 16) & mask));

        dst_alpha *= uint16Rec1;
    }
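    // Index layout (illustrative): with float_v::size == 4, idx1 == {0, 2, 4, 6}
    // and idx2 == {1, 3, 5, 7}; the gathers above (and scatters in write()
    // below) thus address the (c1,c2) and (c3,alpha) halves of four
    // interleaved 64-bit RGBA16 pixels.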
    ALWAYS_INLINE void write(void *dst, const float_v &c1, const float_v &c2, const float_v &c3, const float_v &a)
    {
        const auto alpha = a * uint16Max;

        const auto v1 = xsimd::bitwise_cast_compat<unsigned int>(xsimd::nearbyint_as_int(c1));
        const auto v2 = xsimd::bitwise_cast_compat<unsigned int>(xsimd::nearbyint_as_int(c2));
        const auto v3 = xsimd::bitwise_cast_compat<unsigned int>(xsimd::nearbyint_as_int(c3));
        const auto v4 = xsimd::bitwise_cast_compat<unsigned int>(xsimd::nearbyint_as_int(alpha));

        // Re-interleave the channels into (c1,c2) and (c3,alpha) word pairs.
        const auto c1c2 = ((v2 & mask) << 16) | (v1 & mask);
        const auto c3ca = ((v4 & mask) << 16) | (v3 & mask);

#if XSIMD_VERSION_MAJOR < 10
        // ... (xsimd 9 path)
#else
        auto dstPtr = reinterpret_cast<typename int_v::value_type *>(dst);

        const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 2;
        const auto idx2 = idx1 + 1;

        c1c2.scatter(dstPtr, idx1);
        c3ca.scatter(dstPtr, idx2);
#endif
    }
    ALWAYS_INLINE void clearPixels(quint8 *dataDst)
    {
        memset(dataDst, 0, float_v::size * sizeof(quint16) * 4);
    }

    ALWAYS_INLINE void copyPixels(const quint8 *dataSrc, quint8 *dataDst)
    {
        memcpy(dataDst, dataSrc, float_v::size * sizeof(quint16) * 4);
    }

    const uint_v mask;
    const float_v uint16Max;
    const float_v uint16Rec1;
};
template<typename _impl>
struct PixelWrapper<quint8, _impl> {
    using int_v = xsimd::batch<int, _impl>;
    using uint_v = xsimd::batch<unsigned int, _impl>;
    using float_v = xsimd::batch<float, _impl>;

    static_assert(int_v::size == uint_v::size,
                  "the selected architecture does not guarantee vector size equality!");
    static_assert(uint_v::size == float_v::size,
                  "the selected architecture does not guarantee vector size equality!");

    // ... (lerpMixedUintFloat() and roundFloatToUint() elided in this excerpt)

    static ALWAYS_INLINE void normalizeAlpha(float &alpha)
    {
        const float uint8Rec1 = 1.0f / 255.0f;
        alpha *= uint8Rec1;
    }

    static ALWAYS_INLINE void denormalizeAlpha(float &alpha)
    {
        const float uint8Max = 255.0f;
        alpha *= uint8Max;
    }

    PixelWrapper()
        : mask(quint32(0xFF))
        , uint8Max(255.0f)
        , uint8Rec1(1.0f / 255.0f)
    {
    }

    ALWAYS_INLINE void read(const void *src, float_v &dst_c1, float_v &dst_c2, float_v &dst_c3, float_v &dst_alpha)
    {
        dst_alpha = KoStreamedMath<_impl>::template fetch_alpha_32<false>(src);
        KoStreamedMath<_impl>::template fetch_colors_32<false>(src, dst_c1, dst_c2, dst_c3);
        dst_alpha *= uint8Rec1;
    }

    ALWAYS_INLINE void write(quint8 *dataDst, const float_v &c1, const float_v &c2, const float_v &c3, const float_v &a)
    {
        const auto alpha = a * uint8Max;
        KoStreamedMath<_impl>::write_channels_32_unaligned(dataDst, alpha, c1, c2, c3);
    }

    ALWAYS_INLINE void clearPixels(quint8 *dataDst)
    {
        memset(dataDst, 0, float_v::size * sizeof(quint8) * 4);
    }

    ALWAYS_INLINE void copyPixels(const quint8 *dataSrc, quint8 *dataDst)
    {
        memcpy(dataDst, dataSrc, float_v::size * sizeof(quint8) * 4);
    }

    const uint_v mask;
    const float_v uint8Max;
    const float_v uint8Rec1;
};
template<typename _impl>
struct PixelWrapper<float, _impl> {
    using int_v = xsimd::batch<int, _impl>;
    using uint_v = xsimd::batch<unsigned int, _impl>;
    using float_v = xsimd::batch<float, _impl>;

    static_assert(int_v::size == uint_v::size,
                  "the selected architecture does not guarantee vector size equality!");
    static_assert(uint_v::size == float_v::size,
                  "the selected architecture does not guarantee vector size equality!");

    // ... (float lerp/round helpers and the no-op alpha normalization are
    // elided in this excerpt)
    ALWAYS_INLINE void read(const void *src, float_v &dst_c1, float_v &dst_c2, float_v &dst_c3, float_v &dst_alpha)
    {
#if XSIMD_VERSION_MAJOR < 10
        // ... (xsimd 9 path)
#else
        const auto srcPtr = reinterpret_cast<const typename float_v::value_type *>(src);

        // Pixels are stored interleaved as {c1, c2, c3, alpha}; gathers with
        // a stride of 4 de-interleave them into planar batches.
        const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 4;
        const auto idx2 = idx1 + 1;
        const auto idx3 = idx1 + 2;
        const auto idx4 = idx1 + 3;

        dst_c1 = float_v::gather(srcPtr, idx1);
        dst_c2 = float_v::gather(srcPtr, idx2);
        dst_c3 = float_v::gather(srcPtr, idx3);
        dst_alpha = float_v::gather(srcPtr, idx4);
#endif
    }
    ALWAYS_INLINE void write(void *dst, const float_v &src_c1, const float_v &src_c2, const float_v &src_c3, const float_v &src_alpha)
    {
#if XSIMD_VERSION_MAJOR < 10
        // ... (xsimd 9 path)
#else
        auto dstPtr = reinterpret_cast<typename float_v::value_type *>(dst);

        const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 4;
        const auto idx2 = idx1 + 1;
        const auto idx3 = idx1 + 2;
        const auto idx4 = idx1 + 3;

        src_c1.scatter(dstPtr, idx1);
        src_c2.scatter(dstPtr, idx2);
        src_c3.scatter(dstPtr, idx3);
        src_alpha.scatter(dstPtr, idx4);
#endif
    }
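    // Index layout (illustrative): with float_v::size == 4, idx1 == {0, 4, 8, 12},
    // so lane i addresses pixel i's first channel; idx2..idx4 cover the other
    // three channels of the same four interleaved float pixels.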
    ALWAYS_INLINE void clearPixels(quint8 *dataDst)
    {
        memset(dataDst, 0, float_v::size * sizeof(float) * 4);
    }

    ALWAYS_INLINE void copyPixels(const quint8 *dataSrc, quint8 *dataDst)
    {
        memcpy(dataDst, dataSrc, float_v::size * sizeof(float) * 4);
    }
};
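// Usage sketch (illustrative, not from the original header): PixelWrapper
// lets compositing code stay agnostic of the channel format:
//
//   PixelWrapper<quint16, _impl> pixelWrapper;
//   float_v c1, c2, c3, alpha;
//   pixelWrapper.read(src, c1, c2, c3, alpha); // alpha normalized to [0, 1]
//   // ... blend the channels ...
//   pixelWrapper.write(dst, c1, c2, c3, alpha);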
namespace KoStreamedMathFunctions
{
template<int pixelSize>
ALWAYS_INLINE void clearPixel(quint8 *dst)
{
    std::memset(dst, 0, pixelSize);
}

template<int pixelSize>
ALWAYS_INLINE void copyPixel(const quint8 *src, quint8 *dst)
{
    std::memcpy(dst, src, pixelSize);
}
} // namespace KoStreamedMathFunctions

#endif // __KOSTREAMED_MATH_H