#ifndef __KOSTREAMED_MATH_H
#define __KOSTREAMED_MATH_H

#if defined(XSIMD_NO_SUPPORTED_ARCHITECTURE)
#error "Trying to use SIMD with an unknown architecture!"
#endif

#include <cmath>
#include <cstdint>
#include <cstring>
#include <type_traits>

#include <xsimd_extensions/xsimd.hpp>

#include <QDebug>

#include <KoAlwaysInline.h>
#include <KoCompositeOp.h>

#if XSIMD_VERSION_MAJOR < 10
// ... (extra includes needed only for older xsimd releases)
#endif
template<typename _impl, typename result_type>
struct OptiRound {
    static ALWAYS_INLINE result_type roundScalar(const float value)
    {
#ifdef __SSE__
        // SSE conversions use the round-to-even rule, so reuse the hardware path.
        return _mm_cvtss_si32(_mm_set_ss(value));
#elif XSIMD_WITH_NEON64
        return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vdupq_n_f32(value))), 0);
#elif XSIMD_WITH_NEON
        // Emulate round-half-to-even on 32-bit NEON, which lacks a direct
        // "convert with ties to even" instruction.
        const auto nearbyint_as_int = [](const float v) {
            const auto a = vdupq_n_f32(v);
            const auto signmask = vdupq_n_u32(0x80000000);
            // +/- 0.5, carrying the sign of a
            const auto half = vbslq_f32(signmask, a, vdupq_n_f32(0.5f));
            // round half away from zero: [a + copysign(0.5, a)]
            const auto r_normal = vcvtq_s32_f32(vaddq_f32(a, half));
            // truncate toward zero: [a]
            const auto r_trunc = vcvtq_s32_f32(a);
            // 1 if r_trunc > 0, else 0
            const auto plusone = vreinterpretq_s32_u32(
                vshrq_n_u32(vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31));
            // even candidate used for ties: ([a] + {0, 1}) & ~1
            const auto r_even =
                vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1));
            // delta = a - [a]
            const auto delta = vsubq_f32(a, vcvtq_f32_s32(r_trunc));
            // exact ties (delta == +/- 0.5) pick the even candidate
            const auto is_delta_half = vceqq_f32(delta, half);
            return vbslq_s32(is_delta_half, r_even, r_normal);
        };
        return vgetq_lane_s32(nearbyint_as_int(value), 0);
#else
        return std::lroundf(value);
#endif
    }
};
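/*
 * Worked example for the NEON branch above: for value = 2.5 the lambda gets
 * half = +0.5, r_normal = (int)(3.0) = 3, r_trunc = 2, plusone = 1 and
 * r_even = (2 + 1) & ~1 = 2; since delta == half, the tie resolves to the
 * even value 2, matching the round-to-even behaviour of _mm_cvtss_si32 and
 * vcvtnq_s32_f32 used on the other branches.
 */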
template<typename _impl>
struct OptiDiv {
    using float_v = xsimd::batch<float, _impl>;

    static ALWAYS_INLINE float divScalar(const float &divident, const float &divisor)
    {
#ifdef __SSE__
        float result = 0.0f;
        __m128 x = _mm_set_ss(divisor);
        __m128 y = _mm_set_ss(divident);
        x = _mm_rcp_ss(x);
        x = _mm_mul_ss(x, y);
        _mm_store_ss(&result, x);
        return result;
#elif defined __ARM_NEON
        auto x = vdupq_n_f32(divisor);
        auto y = vdupq_n_f32(divident);
        x = vrecpeq_f32(x);
        x = vmulq_f32(x, y);
        return vgetq_lane_f32(x, 0);
#else
        return (1.f / divisor) * divident;
#endif
    }

    static ALWAYS_INLINE float_v divVector(const float_v &divident, const float_v &divisor)
    {
        return divident * xsimd::reciprocal(divisor);
    }
};
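// Note on the division helpers above: the SSE and NEON branches (and
// xsimd::reciprocal used by divVector) rely on a hardware reciprocal
// *estimate* rather than a true division, trading some precision for speed;
// the exact accuracy depends on the target architecture. The scalar
// fallback computes the same expression with an ordinary division.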
template<typename _impl>
struct KoStreamedMath {
    using int_v = xsimd::batch<int, _impl>;
    using uint_v = xsimd::batch<unsigned int, _impl>;
    using float_v = xsimd::batch<float, _impl>;

    static_assert(int_v::size == uint_v::size, "the selected architecture does not guarantee vector size equality!");
    static_assert(uint_v::size == float_v::size, "the selected architecture does not guarantee vector size equality!");
    /**
     * Composes src into dst without using vector instructions.
     */
    template<bool useMask, bool useFlow, class Compositor, int pixelSize>
    static void genericComposite_novector(const KoCompositeOp::ParameterInfo &params)
    {
        const qint32 linearInc = pixelSize;
        qint32 srcLinearInc = params.srcRowStride ? pixelSize : 0;

        quint8 *dstRowStart = params.dstRowStart;
        const quint8 *maskRowStart = params.maskRowStart;
        const quint8 *srcRowStart = params.srcRowStart;
        typename Compositor::ParamsWrapper paramsWrapper(params);

        for (qint32 r = params.rows; r > 0; --r) {
            const quint8 *mask = maskRowStart;
            const quint8 *src = srcRowStart;
            quint8 *dst = dstRowStart;

            int blockRest = params.cols;

            for (int i = 0; i < blockRest; i++) {
                Compositor::template compositeOnePixelScalar<useMask, _impl>(src, dst, mask, params.opacity, paramsWrapper);
                src += srcLinearInc;
                dst += linearInc;
                if (useMask) {
                    mask++;
                }
            }

            srcRowStart += params.srcRowStride;
            dstRowStart += params.dstRowStride;
            if (useMask) {
                maskRowStart += params.maskRowStride;
            }
        }
    }
    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite32_novector(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite_novector<useMask, useFlow, Compositor, 4>(params);
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite128_novector(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite_novector<useMask, useFlow, Compositor, 16>(params);
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite64_novector(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite_novector<useMask, useFlow, Compositor, 8>(params);
    }
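    // The 32/64/128 suffixes are the pixel width in bits: 4 bytes per pixel
    // (4 x 8-bit channels), 8 bytes (e.g. 4 x 16-bit channels) and 16 bytes
    // (4 x 32-bit float channels), matching the pixelSize argument forwarded
    // above.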
    // Load float_v::size 8-bit mask values and widen them to float lanes.
    static float_v fetch_mask_8(const quint8 *data)
    {
        return xsimd::batch_cast<float>(xsimd::load_and_extend<int_v>(data));
    }

    /**
     * Fetch the alpha channel (the most significant byte) of float_v::size
     * consecutive 32-bit pixels. \p aligned selects between aligned and
     * unaligned vector loads.
     */
    template<bool aligned>
    static float_v fetch_alpha_32(const void *data)
    {
        using U = typename std::conditional<aligned, xsimd::aligned_mode, xsimd::unaligned_mode>::type;
        const auto data_i = uint_v::load(static_cast<const typename uint_v::value_type *>(data), U{});
        return xsimd::to_float(xsimd::bitwise_cast_compat<int>(data_i >> 24));
    }
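    // Example: a packed pixel value of 0x80FF1020 yields alpha 0x80 = 128
    // after the ">> 24" above, while fetch_colors_32() below extracts
    // c1 = 0xFF, c2 = 0x10 and c3 = 0x20 from the same value.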
    /**
     * Fetch the colour channels of float_v::size consecutive 32-bit pixels;
     * the colour data lives in the three least significant bytes of each
     * pixel. \p aligned selects between aligned and unaligned vector loads.
     */
    template<bool aligned>
    static void fetch_colors_32(const void *data, float_v &c1, float_v &c2, float_v &c3)
    {
        using U = typename std::conditional<aligned, xsimd::aligned_mode, xsimd::unaligned_mode>::type;

        const auto data_i = uint_v::load(static_cast<const typename uint_v::value_type *>(data), U{});

        const uint_v mask(0xFF);

        c1 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((data_i >> 16) & mask));
        c2 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((data_i >> 8) & mask));
        c3 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((data_i) & mask));
    }
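    // Note on the "aligned" template flag used by the two fetchers above:
    // loading aligned data with an unaligned instruction only costs a little
    // performance, while loading unaligned data with an aligned instruction
    // can fault, so callers pick the variant that matches the alignment they
    // have established.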
    /**
     * Round the channel values to integers and pack them back into
     * float_v::size 32-bit pixels: alpha in the most significant byte,
     * then c1, c2 and c3. The destination must be vector-aligned.
     */
    static void write_channels_32(void *data, const float_v alpha, const float_v c1, const float_v c2, const float_v c3)
    {
        const int_v mask(0xFF);

        const auto v1 = (xsimd::nearbyint_as_int(alpha)) << 24;
        const auto v2 = (xsimd::nearbyint_as_int(c1) & mask) << 16;
        const auto v3 = (xsimd::nearbyint_as_int(c2) & mask) << 8;
        const auto v4 = (xsimd::nearbyint_as_int(c3) & mask);
        xsimd::store_aligned(static_cast<typename int_v::value_type *>(data), (v1 | v2) | (v3 | v4));
    }
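    // write_channels_32_unaligned() below is identical except for the final
    // store; it is meant for destinations whose vector alignment has not
    // been established.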
    static void write_channels_32_unaligned(void *data, const float_v alpha, const float_v c1, const float_v c2, const float_v c3)
    {
        const int_v mask(0xFF);

        const auto v1 = (xsimd::nearbyint_as_int(alpha)) << 24;
        const auto v2 = (xsimd::nearbyint_as_int(c1) & mask) << 16;
        const auto v3 = (xsimd::nearbyint_as_int(c2) & mask) << 8;
        const auto v4 = (xsimd::nearbyint_as_int(c3) & mask);
        xsimd::store_unaligned(static_cast<typename int_v::value_type *>(data), (v1 | v2) | (v3 | v4));
    }
    /**
     * Composes src pixels into dst pixels. Uses the \p Compositor strategy
     * parameter for doing the actual math of the composition.
     */
    template<bool useMask, bool useFlow, class Compositor, int pixelSize>
    static void genericComposite(const KoCompositeOp::ParameterInfo &params)
    {
        const int vectorSize = static_cast<int>(float_v::size);
        const qint32 vectorInc = pixelSize * vectorSize;
        const qint32 linearInc = pixelSize;
        qint32 srcVectorInc = vectorInc;
        qint32 srcLinearInc = pixelSize;

        quint8 *dstRowStart = params.dstRowStart;
        const quint8 *maskRowStart = params.maskRowStart;
        const quint8 *srcRowStart = params.srcRowStart;
        typename Compositor::ParamsWrapper paramsWrapper(params);

        // A constant source colour (srcRowStride == 0) is replicated into a
        // small vector-aligned buffer so that the main loop can keep using
        // plain vector loads.
        if (!params.srcRowStride) {
            if (pixelSize == 4) {
                auto *buf = reinterpret_cast<uint_v *>(xsimd::vector_aligned_malloc<typename uint_v::value_type>(vectorSize));
                *buf = uint_v(*(reinterpret_cast<const quint32 *>(srcRowStart)));
                srcRowStart = reinterpret_cast<quint8 *>(buf);
                srcLinearInc = 0;
                srcVectorInc = 0;
            } else {
                auto *buf = xsimd::vector_aligned_malloc<quint8>(vectorInc);
                quint8 *ptr = buf;

                for (size_t i = 0; i < vectorSize; i++) {
                    memcpy(ptr, params.srcRowStart, pixelSize);
                    ptr += pixelSize;
                }

                srcRowStart = buf;
                srcLinearInc = 0;
                srcVectorInc = 0;
            }
        }
        int totalBlockAlign = 0;
        int totalBlockAlignedVector = 0;
        int totalBlockUnalignedVector = 0;
        int totalBlockRest = 0;
        for (qint32 r = params.rows; r > 0; --r) {
            // the mask row is allowed to be unaligned
            const quint8 *mask = maskRowStart;

            const quint8 *src = srcRowStart;
            quint8 *dst = dstRowStart;

            const int pixelsAlignmentMask = vectorSize * sizeof(float) - 1;
            auto srcPtrValue = reinterpret_cast<uintptr_t>(src);
            auto dstPtrValue = reinterpret_cast<uintptr_t>(dst);
            uintptr_t srcAlignment = srcPtrValue & pixelsAlignmentMask;
            uintptr_t dstAlignment = dstPtrValue & pixelsAlignmentMask;

            // Split the row into a scalar head (until dst becomes aligned),
            // a vectorized middle part and a scalar tail.
            int blockAlign = params.cols;
            int blockAlignedVector = 0;
            int blockUnalignedVector = 0;
            int blockRest = 0;

            int *vectorBlock =
                srcAlignment == dstAlignment || !srcVectorInc ? &blockAlignedVector : &blockUnalignedVector;

            if (!dstAlignment) {
                blockAlign = 0;
                *vectorBlock = params.cols / vectorSize;
                blockRest = params.cols % vectorSize;
            } else if (params.cols > 2 * vectorSize) {
                blockAlign = (vectorInc - dstAlignment) / pixelSize;
                const int restCols = params.cols - blockAlign;
                if (restCols > 0) {
                    *vectorBlock = restCols / vectorSize;
                    blockRest = restCols % vectorSize;
                } else {
                    blockAlign = params.cols;
                    *vectorBlock = 0;
                    blockRest = 0;
                }
            }

            totalBlockAlign += blockAlign;
            totalBlockAlignedVector += blockAlignedVector;
            totalBlockUnalignedVector += blockUnalignedVector;
            totalBlockRest += blockRest;
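            // Example of the split above, assuming vectorSize == 4 and
            // pixelSize == 4 (so vectorInc == 16): if dst starts 8 bytes past
            // a 16-byte boundary, blockAlign = (16 - 8) / 4 = 2 scalar pixels
            // are composited first, the remaining pixels are handled
            // vectorSize at a time, and up to vectorSize - 1 pixels are left
            // for the scalar tail loop (blockRest).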
            // Scalar head: composite single pixels until dst is vector-aligned.
            for (int i = 0; i < blockAlign; i++) {
                Compositor::template compositeOnePixelScalar<useMask, _impl>(src, dst, mask, params.opacity, paramsWrapper);
                // ... (advance src/dst/mask by one pixel)
            }

            // Vector blocks where both src and dst can be accessed aligned.
            for (int i = 0; i < blockAlignedVector; i++) {
                Compositor::template compositeVector<useMask, true, _impl>(src, dst, mask, params.opacity, paramsWrapper);
                // ... (advance src/dst/mask by vectorSize pixels)
            }

            // Vector blocks with a misaligned source.
            for (int i = 0; i < blockUnalignedVector; i++) {
                Compositor::template compositeVector<useMask, false, _impl>(src, dst, mask, params.opacity, paramsWrapper);
                // ... (advance src/dst/mask by vectorSize pixels)
            }

            // Scalar tail: the remaining cols % vectorSize pixels.
            for (int i = 0; i < blockRest; i++) {
                Compositor::template compositeOnePixelScalar<useMask, _impl>(src, dst, mask, params.opacity, paramsWrapper);
                // ... (advance src/dst/mask by one pixel)
            }
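            // The boolean template argument passed to compositeVector above
            // distinguishes the two vector blocks: true when the source can
            // also be accessed with aligned loads (it shares dst's alignment,
            // or it is the replicated constant-colour buffer), false when the
            // source stays unaligned.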
            // ... (advance srcRowStart/dstRowStart/maskRowStart by their row strides)
        }

        // Debug aid: how the rows were split between the scalar and vector paths.
        qDebug() << "rows:" << params.rows << "\tpad(S):" << totalBlockAlign << "\tbav(V):" << totalBlockAlignedVector
                 << "\tbuv(V):" << totalBlockUnalignedVector << "\tres(S)" << totalBlockRest;

        if (!params.srcRowStride) {
            xsimd::vector_aligned_free(srcRowStart);
        }
    }
    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite32(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite<useMask, useFlow, Compositor, 4>(params);
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite128(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite<useMask, useFlow, Compositor, 16>(params);
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite64(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite<useMask, useFlow, Compositor, 8>(params);
    }
};
template<typename channels_type, class _impl>
struct PixelStateRecoverHelper {
    using float_v = xsimd::batch<float, _impl>;
    using float_m = typename float_v::batch_bool_type;
    // ... (no-op for integer channel types)
};

// Floating-point specialization: remembers the original colour channels and
// restores them for the lanes selected by the mask.
template<class _impl>
struct PixelStateRecoverHelper<float, _impl> {
    using float_v = xsimd::batch<float, _impl>;
    using float_m = typename float_v::batch_bool_type;

    ALWAYS_INLINE PixelStateRecoverHelper(const float_v &c1, const float_v &c2, const float_v &c3)
        : m_orig_c1(c1), m_orig_c2(c2), m_orig_c3(c3)
    {
    }

    ALWAYS_INLINE void recoverPixels(const float_m &mask, float_v &c1, float_v &c2, float_v &c3) const
    {
        if (xsimd::any(mask)) {
            c1 = xsimd::select(mask, m_orig_c1, c1);
            c2 = xsimd::select(mask, m_orig_c2, c2);
            c3 = xsimd::select(mask, m_orig_c3, c3);
        }
    }

private:
    const float_v m_orig_c1, m_orig_c2, m_orig_c3;
};
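// Usage note (an assumption based on the interface above, not a statement of
// the callers' exact behaviour): a compositor constructs the helper with the
// channel values it may need to preserve and calls recoverPixels() with a
// lane mask afterwards, for example to keep the original colour of pixels
// whose resulting alpha is zero; for integer channel types the helper costs
// nothing. The actual call sites live outside this header.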
template<typename channels_type, class _impl>
struct PixelWrapper;

// Specialization for 16-bit integer channels (quint16).
template<class _impl>
struct PixelWrapper<quint16, _impl> {
    using int_v = xsimd::batch<int, _impl>;
    using uint_v = xsimd::batch<unsigned int, _impl>;
    using float_v = xsimd::batch<float, _impl>;

    static_assert(int_v::size == uint_v::size, "the selected architecture does not guarantee vector size equality!");
    static_assert(uint_v::size == float_v::size, "the selected architecture does not guarantee vector size equality!");

    static ALWAYS_INLINE void normalizeAlpha(float &alpha)
    {
        const float uint16Rec1 = 1.0f / 65535.0f;
        alpha *= uint16Rec1;
    }

    static ALWAYS_INLINE void denormalizeAlpha(float &alpha)
    {
        const float uint16Max = 65535.0f;
        alpha *= uint16Max;
    }

    PixelWrapper()
        : mask(quint32(0xFFFF))
        , uint16Max(65535.0f)
        , uint16Rec1(1.0f / 65535.0f)
    {
    }
    ALWAYS_INLINE void read(const void *src, float_v &dst_c1, float_v &dst_c2, float_v &dst_c3, float_v &dst_alpha)
    {
#if XSIMD_VERSION_MAJOR < 10
        // ... (version-specific handling for older xsimd releases elided)
#endif
        const auto *srcPtr = static_cast<const typename uint_v::value_type *>(src);
        const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 2;
        const auto idx2 = idx1 + 1;

        const auto pixelsC1C2 = uint_v::gather(srcPtr, idx1);
        const auto pixelsC3Alpha = uint_v::gather(srcPtr, idx2);

        dst_c1 = xsimd::to_float(xsimd::bitwise_cast_compat<int>(pixelsC1C2 & mask));
        dst_c2 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((pixelsC1C2 >> 16) & mask));
        dst_c3 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((pixelsC3Alpha & mask)));
        dst_alpha = xsimd::to_float(xsimd::bitwise_cast_compat<int>((pixelsC3Alpha >> 16) & mask));

        dst_alpha *= uint16Rec1;
    }
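    // Layout handled by read() above: each 32-bit word of the source holds
    // two adjacent 16-bit channels, so a pixel occupies two words. Gathering
    // the even words (idx1 = {0, 2, 4, ...}) yields the (c1, c2) pairs and
    // the odd words the (c3, alpha) pairs; the low half of each word is the
    // first channel and the high half the second.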
    ALWAYS_INLINE void write(void *dst, const float_v &c1, const float_v &c2, const float_v &c3, const float_v &a)
    {
        const auto alpha = a * uint16Max;

        const auto v1 = xsimd::bitwise_cast_compat<unsigned int>(xsimd::nearbyint_as_int(c1));
        const auto v2 = xsimd::bitwise_cast_compat<unsigned int>(xsimd::nearbyint_as_int(c2));
        const auto v3 = xsimd::bitwise_cast_compat<unsigned int>(xsimd::nearbyint_as_int(c3));
        const auto v4 = xsimd::bitwise_cast_compat<unsigned int>(xsimd::nearbyint_as_int(alpha));

        const auto c1c2 = ((v2 & mask) << 16) | (v1 & mask);
        const auto c3ca = ((v4 & mask) << 16) | (v3 & mask);

#if XSIMD_VERSION_MAJOR < 10
        // ... (version-specific handling for older xsimd releases elided)
#endif
        auto dstPtr = reinterpret_cast<typename int_v::value_type *>(dst);

        const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 2;
        const auto idx2 = idx1 + 1;

        c1c2.scatter(dstPtr, idx1);
        c3ca.scatter(dstPtr, idx2);
    }

    ALWAYS_INLINE void clearPixels(quint8 *dataDst)
    {
        memset(dataDst, 0, float_v::size * sizeof(quint16) * 4);
    }

    ALWAYS_INLINE void copyPixels(const quint8 *dataSrc, quint8 *dataDst)
    {
        memcpy(dataDst, dataSrc, float_v::size * sizeof(quint16) * 4);
    }

    const uint_v mask;
    const float_v uint16Max;
    const float_v uint16Rec1;
};
// Specialization for 8-bit integer channels (quint8).
template<typename _impl>
struct PixelWrapper<quint8, _impl> {
    using int_v = xsimd::batch<int, _impl>;
    using uint_v = xsimd::batch<unsigned int, _impl>;
    using float_v = xsimd::batch<float, _impl>;

    static_assert(int_v::size == uint_v::size, "the selected architecture does not guarantee vector size equality!");
    static_assert(uint_v::size == float_v::size, "the selected architecture does not guarantee vector size equality!");

    static ALWAYS_INLINE void normalizeAlpha(float &alpha)
    {
        const float uint8Rec1 = 1.0f / 255.0f;
        alpha *= uint8Rec1;
    }

    static ALWAYS_INLINE void denormalizeAlpha(float &alpha)
    {
        const float uint8Max = 255.0f;
        alpha *= uint8Max;
    }

    PixelWrapper()
        : mask(quint32(0xFF))
        , uint8Max(255.0f)
        , uint8Rec1(1.0f / 255.0f)
    {
    }

    ALWAYS_INLINE void read(const void *src, float_v &dst_c1, float_v &dst_c2, float_v &dst_c3, float_v &dst_alpha)
    {
        // ... (load the packed 8-bit channels into float lanes)
        dst_alpha *= uint8Rec1;
    }

    ALWAYS_INLINE void write(quint8 *dataDst, const float_v &c1, const float_v &c2, const float_v &c3, const float_v &a)
    {
        const auto alpha = a * uint8Max;
        // ... (round, pack and store the four channels)
    }

    ALWAYS_INLINE void clearPixels(quint8 *dataDst)
    {
        memset(dataDst, 0, float_v::size * sizeof(quint8) * 4);
    }

    ALWAYS_INLINE void copyPixels(const quint8 *dataSrc, quint8 *dataDst)
    {
        memcpy(dataDst, dataSrc, float_v::size * sizeof(quint8) * 4);
    }

    const uint_v mask;
    const float_v uint8Max;
    const float_v uint8Rec1;
};
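// The quint8 wrapper operates on the same packed 4 x 8-bit pixel layout that
// fetch_alpha_32(), fetch_colors_32() and write_channels_32() handle above:
// alpha in the most significant byte, colour channels in the lower three.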
// Specialization for 32-bit floating-point channels.
template<typename _impl>
struct PixelWrapper<float, _impl> {
    using int_v = xsimd::batch<int, _impl>;
    using uint_v = xsimd::batch<unsigned int, _impl>;
    using float_v = xsimd::batch<float, _impl>;

    static_assert(int_v::size == uint_v::size, "the selected architecture does not guarantee vector size equality!");
    static_assert(uint_v::size == float_v::size, "the selected architecture does not guarantee vector size equality!");

    // ... (scalar helper functions elided)
    ALWAYS_INLINE void read(const void *src, float_v &dst_c1, float_v &dst_c2, float_v &dst_c3, float_v &dst_alpha)
    {
#if XSIMD_VERSION_MAJOR < 10
        // ... (version-specific handling for older xsimd releases elided)
#endif
        const auto srcPtr = reinterpret_cast<const typename float_v::value_type *>(src);
        const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 4;
        const auto idx2 = idx1 + 1;
        const auto idx3 = idx1 + 2;
        const auto idx4 = idx1 + 3;

        dst_c1 = float_v::gather(srcPtr, idx1);
        dst_c2 = float_v::gather(srcPtr, idx2);
        dst_c3 = float_v::gather(srcPtr, idx3);
        dst_alpha = float_v::gather(srcPtr, idx4);
    }

    ALWAYS_INLINE void write(void *dst, const float_v &src_c1, const float_v &src_c2, const float_v &src_c3, const float_v &src_alpha)
    {
#if XSIMD_VERSION_MAJOR < 10
        // ... (version-specific handling for older xsimd releases elided)
#endif
        auto dstPtr = reinterpret_cast<typename float_v::value_type *>(dst);

        const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 4;
        const auto idx2 = idx1 + 1;
        const auto idx3 = idx1 + 2;
        const auto idx4 = idx1 + 3;

        src_c1.scatter(dstPtr, idx1);
        src_c2.scatter(dstPtr, idx2);
        src_c3.scatter(dstPtr, idx3);
        src_alpha.scatter(dstPtr, idx4);
    }
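    // For float pixels the channels are interleaved as c1, c2, c3, alpha, so
    // a stride-4 gather (idx1 = {0, 4, 8, ...}) turns the array-of-structures
    // layout into one batch per channel, and the scatter in write() reverses
    // the transformation.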
    ALWAYS_INLINE void clearPixels(quint8 *dataDst)
    {
        memset(dataDst, 0, float_v::size * sizeof(float) * 4);
    }

    ALWAYS_INLINE void copyPixels(const quint8 *dataSrc, quint8 *dataDst)
    {
        memcpy(dataDst, dataSrc, float_v::size * sizeof(float) * 4);
    }
};
template<int pixelSize>
ALWAYS_INLINE void clearPixel(quint8 *dst)
{
    std::memset(dst, 0, pixelSize);
}

template<int pixelSize>
ALWAYS_INLINE void copyPixel(const quint8 *src, quint8 *dst)
{
    std::memcpy(dst, src, pixelSize);
}

#endif /* __KOSTREAMED_MATH_H */