Krita Source Code Documentation
KoStreamedMath.h
/*
 * SPDX-FileCopyrightText: 2012 Dmitry Kazakov <dimula73@gmail.com>
 * SPDX-FileCopyrightText: 2020 Mathias Wein <lynx.mw+kde@gmail.com>
 * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy@amyspark.me>
 *
 * SPDX-License-Identifier: LGPL-2.1-or-later
 */

#ifndef __KOSTREAMED_MATH_H
#define __KOSTREAMED_MATH_H

#if defined(XSIMD_NO_SUPPORTED_ARCHITECTURE)
#error "Trying to use SIMD with an unknown architecture!"
#endif

#include <cmath> // std::lroundf and NAN in the scalar fallbacks
#include <cstdint>
#include <cstring>
#include <iostream>
#include <type_traits>
#include <xsimd_extensions/xsimd.hpp>

#if XSIMD_VERSION_MAJOR < 10
#include <KoRgbaInterleavers.h>
#endif

#include <KoAlwaysInline.h>
#include <KoCompositeOp.h>
#include <KoColorSpaceMaths.h>

#define BLOCKDEBUG 0

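/**
 * Rounds a float to an integer using the platform's native conversion:
 * round-to-even via SSE/AVX or NEON when available, std::lroundf otherwise.
 */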
template<typename _impl, typename result_type>
struct OptiRound {
    ALWAYS_INLINE static result_type roundScalar(const float value)
    {
#ifdef __SSE__
        // SSE/AVX instructions use round-to-even rounding rule so we
        // should reuse it when possible
        return _mm_cvtss_si32(_mm_set_ss(value));
#elif XSIMD_WITH_NEON64
        return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vdupq_n_f32(value))),
                              0);
#elif XSIMD_WITH_NEON
        /* origin:
         * https://github.com/DLTcollab/sse2neon/blob/cad518a93b326f0f644b7972d488d04eaa2b0475/sse2neon.h#L4028-L4047
         */
        // Contributors to this work are:
        // John W. Ratcliff <jratcliffscarab@gmail.com>
        // Brandon Rowlett <browlett@nvidia.com>
        // Ken Fast <kfast@gdeb.com>
        // Eric van Beurden <evanbeurden@nvidia.com>
        // Alexander Potylitsin <apotylitsin@nvidia.com>
        // Hasindu Gamaarachchi <hasindu2008@gmail.com>
        // Jim Huang <jserv@biilabs.io>
        // Mark Cheng <marktwtn@biilabs.io>
        // Malcolm James MacLeod <malcolm@gulden.com>
        // Devin Hussey (easyaspi314) <husseydevin@gmail.com>
        // Sebastian Pop <spop@amazon.com>
        // Developer Ecosystem Engineering
        // <DeveloperEcosystemEngineering@apple.com> Danila Kutenin
        // <danilak@google.com> François Turban (JishinMaster)
        // <francois.turban@gmail.com> Pei-Hsuan Hung <afcidk@gmail.com>
        // Yang-Hao Yuan <yanghau@biilabs.io>
        // Syoyo Fujita <syoyo@lighttransport.com>
        // Brecht Van Lommel <brecht@blender.org>

        /*
         * sse2neon is freely redistributable under the MIT License.
         *
         * Permission is hereby granted, free of charge, to any person obtaining
         * a copy of this software and associated documentation files (the
         * "Software"), to deal in the Software without restriction, including
         * without limitation the rights to use, copy, modify, merge, publish,
         * distribute, sublicense, and/or sell copies of the Software, and to
         * permit persons to whom the Software is furnished to do so, subject to
         * the following conditions:
         *
         * The above copyright notice and this permission notice shall be
         * included in all copies or substantial portions of the Software.
         *
         * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
         * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
         * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
         * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
         * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
         * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
         * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
         * SOFTWARE.
         */
        const auto nearbyint_as_int = [](const float v) {
            const auto a = vdupq_n_f32(v);
            const auto signmask = vdupq_n_u32(0x80000000);
            const auto half =
                vbslq_f32(signmask, a, vdupq_n_f32(0.5f)); /* +/- 0.5 */
            const auto r_normal = vcvtq_s32_f32(
                vaddq_f32(a, half)); /* round to integer: [a + 0.5]*/
            const auto r_trunc =
                vcvtq_s32_f32(a); /* truncate to integer: [a] */
            const auto plusone = vreinterpretq_s32_u32(
                vshrq_n_u32(vreinterpretq_u32_s32(vnegq_s32(r_trunc)),
                            31)); /* 1 or 0 */
            const auto r_even =
                vbicq_s32(vaddq_s32(r_trunc, plusone),
                          vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
            const auto delta = vsubq_f32(
                a,
                vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
            const auto is_delta_half =
                vceqq_f32(delta, half); /* delta == +/- 0.5 */
            return vbslq_s32(is_delta_half, r_even, r_normal);
        };
        return vgetq_lane_s32(nearbyint_as_int(value), 0);
#else
        return std::lroundf(value);
#endif
    }
};

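/**
 * Fast approximate division: multiplies by a hardware reciprocal estimate
 * (SSE rcp / NEON vrecpe) where available instead of dividing, trading a
 * little precision for speed.
 */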
template<typename _impl>
struct OptiDiv {
    using float_v = xsimd::batch<float, _impl>;

    ALWAYS_INLINE static float divScalar(const float &divident, const float &divisor)
    {
#ifdef __SSE__
        float result = NAN;

        __m128 x = _mm_set_ss(divisor);
        __m128 y = _mm_set_ss(divident);
        x = _mm_rcp_ss(x);
        x = _mm_mul_ss(x, y);

        _mm_store_ss(&result, x);
        return result;
#elif defined __ARM_NEON
        auto x = vdupq_n_f32(divisor);
        auto y = vdupq_n_f32(divident);
        x = vrecpeq_f32(x);
        x = vmulq_f32(x, y);

        return vgetq_lane_f32(x, 0);
#else
        return (1.f / divisor) * divident;
#endif
    }

    ALWAYS_INLINE static float_v divVector(const float_v &divident, const float_v &divisor)
    {
        return divident * xsimd::reciprocal(divisor);
    }
};

template<typename _impl>
struct KoStreamedMath {
    using int_v = xsimd::batch<int, _impl>;
    using uint_v = xsimd::batch<unsigned int, _impl>;
    using float_v = xsimd::batch<float, _impl>;

    static_assert(int_v::size == uint_v::size, "the selected architecture does not guarantee vector size equality!");
    static_assert(uint_v::size == float_v::size, "the selected architecture does not guarantee vector size equality!");

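    /**
     * Scalar (non-vectorized) version of genericComposite(): composes the
     * whole block one pixel at a time via the Compositor's scalar path.
     */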
    template<bool useMask, bool useFlow, class Compositor, int pixelSize>
    static void genericComposite_novector(const KoCompositeOp::ParameterInfo &params)
    {
        const qint32 linearInc = pixelSize;
        qint32 srcLinearInc = params.srcRowStride ? pixelSize : 0;

        quint8 *dstRowStart = params.dstRowStart;
        const quint8 *maskRowStart = params.maskRowStart;
        const quint8 *srcRowStart = params.srcRowStart;
        typename Compositor::ParamsWrapper paramsWrapper(params);

        for (qint32 r = params.rows; r > 0; --r) {
            const quint8 *mask = maskRowStart;
            const quint8 *src = srcRowStart;
            quint8 *dst = dstRowStart;

            int blockRest = params.cols;

            for (int i = 0; i < blockRest; i++) {
                Compositor::template compositeOnePixelScalar<useMask, _impl>(src,
                                                                             dst,
                                                                             mask,
                                                                             params.opacity,
                                                                             paramsWrapper);
                src += srcLinearInc;
                dst += linearInc;

                if (useMask) {
                    mask++;
                }
            }

            srcRowStart += params.srcRowStride;
            dstRowStart += params.dstRowStride;

            if (useMask) {
                maskRowStart += params.maskRowStride;
            }
        }
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite32_novector(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite_novector<useMask, useFlow, Compositor, 4>(params);
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite128_novector(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite_novector<useMask, useFlow, Compositor, 16>(params);
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite64_novector(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite_novector<useMask, useFlow, Compositor, 8>(params);
    }

    static inline quint8 round_float_to_u8(float x)
    {
        return OptiRound<_impl, quint8>::roundScalar(x);
    }

    static inline quint8 lerp_mixed_u8_float(quint8 a, quint8 b, float alpha)
    {
        return round_float_to_u8(float(b - a) * alpha + float(a));
    }

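    /**
     * Load float_v::size mask values (one 8-bit value per pixel) and widen
     * them into a float vector.
     */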
    static inline float_v fetch_mask_8(const quint8 *data)
    {
        return xsimd::batch_cast<float>(xsimd::load_and_extend<int_v>(data));
    }

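    /**
     * Fetch the alpha values of float_v::size 32-bit (8888) pixels; alpha is
     * taken from the most significant byte of each pixel.
     *
     * \p aligned selects an aligned or unaligned load; only pass true when
     * \p data is known to be vector-aligned.
     */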
    template<bool aligned>
    static inline float_v fetch_alpha_32(const void *data)
    {
        using U = typename std::conditional<aligned, xsimd::aligned_mode, xsimd::unaligned_mode>::type;
        const auto data_i = uint_v::load(static_cast<const typename uint_v::value_type *>(data), U{});
        return xsimd::to_float(xsimd::bitwise_cast_compat<int>(data_i >> 24));
    }

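    /**
     * Fetch the three color channels of float_v::size 32-bit (8888) pixels
     * into separate float vectors, taken from the three least significant
     * bytes of each pixel. \p aligned works as in fetch_alpha_32().
     */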
    template<bool aligned>
    static inline void fetch_colors_32(const void *data, float_v &c1, float_v &c2, float_v &c3)
    {
        using U = typename std::conditional<aligned, xsimd::aligned_mode, xsimd::unaligned_mode>::type;

        const auto data_i = uint_v::load(static_cast<const typename uint_v::value_type *>(data), U{});

        const uint_v mask(0xFF);

        c1 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((data_i >> 16) & mask));
        c2 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((data_i >> 8) & mask));
        c3 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((data_i) & mask));
    }

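    /**
     * Pack alpha and color vectors back into float_v::size 32-bit (8888)
     * pixels and store them with an aligned store; \p data must be
     * vector-aligned.
     */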
    static inline void
    write_channels_32(void *data, const float_v alpha, const float_v c1, const float_v c2, const float_v c3)
    {
        const int_v mask(0xFF);

        const auto v1 = (xsimd::nearbyint_as_int(alpha)) << 24;
        const auto v2 = (xsimd::nearbyint_as_int(c1) & mask) << 16;
        const auto v3 = (xsimd::nearbyint_as_int(c2) & mask) << 8;
        const auto v4 = (xsimd::nearbyint_as_int(c3) & mask);
        xsimd::store_aligned(static_cast<typename int_v::value_type *>(data), (v1 | v2) | (v3 | v4));
    }

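    /**
     * Same as write_channels_32(), but uses an unaligned store so \p data
     * may have any alignment.
     */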
    static inline void
    write_channels_32_unaligned(void *data, const float_v alpha, const float_v c1, const float_v c2, const float_v c3)
    {
        const int_v mask(0xFF);

        const auto v1 = (xsimd::nearbyint_as_int(alpha)) << 24;
        const auto v2 = (xsimd::nearbyint_as_int(c1) & mask) << 16;
        const auto v3 = (xsimd::nearbyint_as_int(c2) & mask) << 8;
        const auto v4 = (xsimd::nearbyint_as_int(c3) & mask);
        xsimd::store_unaligned(static_cast<typename int_v::value_type *>(data), (v1 | v2) | (v3 | v4));
    }

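    /**
     * Composes src pixels into dst pixels using the \p Compositor strategy.
     * Each row is split into a scalar prologue (until dst is vector-aligned),
     * aligned/unaligned vector blocks and a scalar tail, so most pixels are
     * processed a whole vector at a time.
     */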
    template<bool useMask, bool useFlow, class Compositor, int pixelSize>
    static void genericComposite(const KoCompositeOp::ParameterInfo &params)
    {
        const int vectorSize = static_cast<int>(float_v::size);
        const qint32 vectorInc = pixelSize * vectorSize;
        const qint32 linearInc = pixelSize;
        qint32 srcVectorInc = vectorInc;
        qint32 srcLinearInc = pixelSize;

        quint8 *dstRowStart = params.dstRowStart;
        const quint8 *maskRowStart = params.maskRowStart;
        const quint8 *srcRowStart = params.srcRowStart;
        typename Compositor::ParamsWrapper paramsWrapper(params);

        if (!params.srcRowStride) {
            if (pixelSize == 4) {
                auto *buf = reinterpret_cast<uint_v *>(xsimd::vector_aligned_malloc<typename uint_v::value_type>(vectorSize));
                *buf = uint_v(*(reinterpret_cast<const quint32 *>(srcRowStart)));
                srcRowStart = reinterpret_cast<quint8 *>(buf);
                srcLinearInc = 0;
                srcVectorInc = 0;
            } else {
                auto *buf = xsimd::vector_aligned_malloc<quint8>(vectorInc);
                quint8 *ptr = buf;

                for (size_t i = 0; i < vectorSize; i++) {
                    memcpy(ptr, params.srcRowStart, pixelSize);
                    ptr += pixelSize;
                }

                srcRowStart = buf;
                srcLinearInc = 0;
                srcVectorInc = 0;
            }
        }
#if BLOCKDEBUG
        int totalBlockAlign = 0;
        int totalBlockAlignedVector = 0;
        int totalBlockUnalignedVector = 0;
        int totalBlockRest = 0;
#endif

        for (qint32 r = params.rows; r > 0; --r) {
            // Hint: Mask is allowed to be unaligned
            const quint8 *mask = maskRowStart;

            const quint8 *src = srcRowStart;
            quint8 *dst = dstRowStart;

            const int pixelsAlignmentMask = vectorSize * sizeof(float) - 1;
            auto srcPtrValue = reinterpret_cast<uintptr_t>(src);
            auto dstPtrValue = reinterpret_cast<uintptr_t>(dst);
            uintptr_t srcAlignment = srcPtrValue & pixelsAlignmentMask;
            uintptr_t dstAlignment = dstPtrValue & pixelsAlignmentMask;

            // Uncomment if facing problems with alignment:
            // Q_ASSERT_X(!(dstAlignment & 3), "Compositing",
            //            "Pixel data must be aligned on pixels borders!");

            int blockAlign = params.cols;
            int blockAlignedVector = 0;
            int blockUnalignedVector = 0;
            int blockRest = 0;

            int *vectorBlock =
                srcAlignment == dstAlignment || !srcVectorInc ? &blockAlignedVector : &blockUnalignedVector;

            if (!dstAlignment) {
                blockAlign = 0;
                *vectorBlock = params.cols / vectorSize;
                blockRest = params.cols % vectorSize;
            } else if (params.cols > 2 * vectorSize) {
                blockAlign = (vectorInc - dstAlignment) / pixelSize;
                const int restCols = params.cols - blockAlign;
                if (restCols > 0) {
                    *vectorBlock = restCols / vectorSize;
                    blockRest = restCols % vectorSize;
                } else {
                    blockAlign = params.cols;
                    *vectorBlock = 0;
                    blockRest = 0;
                }
            }
#if BLOCKDEBUG
            totalBlockAlign += blockAlign;
            totalBlockAlignedVector += blockAlignedVector;
            totalBlockUnalignedVector += blockUnalignedVector;
            totalBlockRest += blockRest;
#endif

            for (int i = 0; i < blockAlign; i++) {
                Compositor::template compositeOnePixelScalar<useMask, _impl>(src,
                                                                             dst,
                                                                             mask,
                                                                             params.opacity,
                                                                             paramsWrapper);
                src += srcLinearInc;
                dst += linearInc;

                if (useMask) {
                    mask++;
                }
            }

            for (int i = 0; i < blockAlignedVector; i++) {
                Compositor::template compositeVector<useMask, true, _impl>(src,
                                                                           dst,
                                                                           mask,
                                                                           params.opacity,
                                                                           paramsWrapper);
                src += srcVectorInc;
                dst += vectorInc;

                if (useMask) {
                    mask += vectorSize;
                }
            }

            for (int i = 0; i < blockUnalignedVector; i++) {
                Compositor::template compositeVector<useMask, false, _impl>(src,
                                                                            dst,
                                                                            mask,
                                                                            params.opacity,
                                                                            paramsWrapper);
                src += srcVectorInc;
                dst += vectorInc;

                if (useMask) {
                    mask += vectorSize;
                }
            }

            for (int i = 0; i < blockRest; i++) {
                Compositor::template compositeOnePixelScalar<useMask, _impl>(src,
                                                                             dst,
                                                                             mask,
                                                                             params.opacity,
                                                                             paramsWrapper);
                src += srcLinearInc;
                dst += linearInc;

                if (useMask) {
                    mask++;
                }
            }

            srcRowStart += params.srcRowStride;
            dstRowStart += params.dstRowStride;

            if (useMask) {
                maskRowStart += params.maskRowStride;
            }
        }

#if BLOCKDEBUG
        dbgPigment << "I"
                   << "rows:" << params.rows << "\tpad(S):" << totalBlockAlign << "\tbav(V):" << totalBlockAlignedVector
                   << "\tbuv(V):" << totalBlockUnalignedVector << "\tres(S)"
                   << totalBlockRest; // << srcAlignment << dstAlignment;
#endif

        if (!params.srcRowStride) {
            xsimd::vector_aligned_free(srcRowStart);
        }
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite32(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite<useMask, useFlow, Compositor, 4>(params);
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite128(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite<useMask, useFlow, Compositor, 16>(params);
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite64(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite<useMask, useFlow, Compositor, 8>(params);
    }
};

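/**
 * Snapshots the incoming color channels so they can be restored after
 * composition for lanes selected by a mask. The generic template is a no-op;
 * only the float specialization below actually stores and recovers values.
 */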
template<typename channels_type, class _impl>
struct PixelStateRecoverHelper {
    using float_v = xsimd::batch<float, _impl>;
    using float_m = typename float_v::batch_bool_type;

    ALWAYS_INLINE
    PixelStateRecoverHelper(const float_v &c1, const float_v &c2, const float_v &c3)
    {
        Q_UNUSED(c1);
        Q_UNUSED(c2);
        Q_UNUSED(c3);
    }

    ALWAYS_INLINE
    void recoverPixels(const float_m &mask, float_v &c1, float_v &c2, float_v &c3) const {
        Q_UNUSED(mask);
        Q_UNUSED(c1);
        Q_UNUSED(c2);
        Q_UNUSED(c3);
    }
};

template<class _impl>
struct PixelStateRecoverHelper<float, _impl> {
    using float_v = xsimd::batch<float, _impl>;
    using float_m = typename float_v::batch_bool_type;

    ALWAYS_INLINE
    PixelStateRecoverHelper(const float_v &c1, const float_v &c2, const float_v &c3)
        : m_orig_c1(c1),
          m_orig_c2(c2),
          m_orig_c3(c3)
    {
    }

    ALWAYS_INLINE
    void recoverPixels(const float_m &mask, float_v &c1, float_v &c2, float_v &c3) const {
        if (xsimd::any(mask)) {
            c1 = xsimd::select(mask, m_orig_c1, c1);
            c2 = xsimd::select(mask, m_orig_c2, c2);
            c3 = xsimd::select(mask, m_orig_c3, c3);
        }
    }

private:
    const float_v m_orig_c1;
    const float_v m_orig_c2;
    const float_v m_orig_c3;
};

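/**
 * Adapts a concrete channel type (quint8, quint16 or float) to the float
 * vectors used by the compositing code: interleaved reads/writes, rounding,
 * lerping and alpha (de)normalization. The primary template is intentionally
 * empty; only the specializations below are usable.
 */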
template<typename channels_type, class _impl>
struct PixelWrapper
{
};

template<class _impl>
struct PixelWrapper<quint16, _impl> {
    using int_v = xsimd::batch<int, _impl>;
    using uint_v = xsimd::batch<unsigned int, _impl>;
    using float_v = xsimd::batch<float, _impl>;

    static_assert(int_v::size == uint_v::size, "the selected architecture does not guarantee vector size equality!");
    static_assert(uint_v::size == float_v::size, "the selected architecture does not guarantee vector size equality!");

    ALWAYS_INLINE
    static quint16 lerpMixedUintFloat(quint16 a, quint16 b, float alpha)
    {
        return OptiRound<_impl, quint16>::roundScalar((float(b) - a) * alpha + float(a));
    }

    ALWAYS_INLINE
    static quint16 roundFloatToUint(float x)
    {
        return OptiRound<_impl, quint16>::roundScalar(x);
    }

    ALWAYS_INLINE
    static void normalizeAlpha(float &alpha)
    {
        const float uint16Rec1 = 1.0f / 65535.0f;
        alpha *= uint16Rec1;
    }

    ALWAYS_INLINE
    static void denormalizeAlpha(float &alpha)
    {
        const float uint16Max = 65535.0f;
        alpha *= uint16Max;
    }

    PixelWrapper()
        : mask(0xFFFF)
        , uint16Max(65535.0f)
        , uint16Rec1(1.0f / 65535.0f)
    {
    }

    ALWAYS_INLINE void read(const void *src, float_v &dst_c1, float_v &dst_c2, float_v &dst_c3, float_v &dst_alpha)
    {
        // struct PackedPixel {
        //     float rrgg;
        //     float bbaa;
        // }
#if XSIMD_VERSION_MAJOR < 10
        uint_v pixelsC1C2;
        uint_v pixelsC3Alpha;
        KoRgbaInterleavers<16>::deinterleave(src, pixelsC1C2, pixelsC3Alpha);
#else
        const auto *srcPtr = static_cast<const typename uint_v::value_type *>(src);
        const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 2; // stride == 2
        const auto idx2 = idx1 + 1; // offset 1 == 2nd members

        const auto pixelsC1C2 = uint_v::gather(srcPtr, idx1);
        const auto pixelsC3Alpha = uint_v::gather(srcPtr, idx2);
#endif

        dst_c1 = xsimd::to_float(xsimd::bitwise_cast_compat<int>(pixelsC1C2 & mask)); // r
        dst_c2 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((pixelsC1C2 >> 16) & mask)); // g
        dst_c3 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((pixelsC3Alpha & mask))); // b
        dst_alpha = xsimd::to_float(xsimd::bitwise_cast_compat<int>((pixelsC3Alpha >> 16) & mask)); // a

        dst_alpha *= uint16Rec1;
    }

    ALWAYS_INLINE void write(void *dst, const float_v &c1, const float_v &c2, const float_v &c3, const float_v &a)
    {
        const auto alpha = a * uint16Max;

        const auto v1 = xsimd::bitwise_cast_compat<unsigned int>(xsimd::nearbyint_as_int(c1));
        const auto v2 = xsimd::bitwise_cast_compat<unsigned int>(xsimd::nearbyint_as_int(c2));
        const auto v3 = xsimd::bitwise_cast_compat<unsigned int>(xsimd::nearbyint_as_int(c3));
        const auto v4 = xsimd::bitwise_cast_compat<unsigned int>(xsimd::nearbyint_as_int(alpha));

        const auto c1c2 = ((v2 & mask) << 16) | (v1 & mask);
        const auto c3ca = ((v4 & mask) << 16) | (v3 & mask);

#if XSIMD_VERSION_MAJOR < 10
        KoRgbaInterleavers<16>::interleave(dst, c1c2, c3ca);
#else
        auto dstPtr = reinterpret_cast<typename int_v::value_type *>(dst);

        const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 2;
        const auto idx2 = idx1 + 1;

        c1c2.scatter(dstPtr, idx1);
        c3ca.scatter(dstPtr, idx2);
#endif
    }

    ALWAYS_INLINE
    void clearPixels(quint8 *dataDst)
    {
        memset(dataDst, 0, float_v::size * sizeof(quint16) * 4);
    }

    ALWAYS_INLINE
    void copyPixels(const quint8 *dataSrc, quint8 *dataDst)
    {
        memcpy(dataDst, dataSrc, float_v::size * sizeof(quint16) * 4);
    }

    const uint_v mask;
    const float_v uint16Max;
    const float_v uint16Rec1;
};

template<typename _impl>
struct PixelWrapper<quint8, _impl> {
    using int_v = xsimd::batch<int, _impl>;
    using uint_v = xsimd::batch<unsigned int, _impl>;
    using float_v = xsimd::batch<float, _impl>;

    static_assert(int_v::size == uint_v::size, "the selected architecture does not guarantee vector size equality!");
    static_assert(uint_v::size == float_v::size, "the selected architecture does not guarantee vector size equality!");

    ALWAYS_INLINE
    static quint8 lerpMixedUintFloat(quint8 a, quint8 b, float alpha)
    {
        return KoStreamedMath<_impl>::lerp_mixed_u8_float(a, b, alpha);
    }

    ALWAYS_INLINE
    static quint8 roundFloatToUint(float x)
    {
        return KoStreamedMath<_impl>::round_float_to_u8(x);
    }

    ALWAYS_INLINE
    static void normalizeAlpha(float &alpha)
    {
        const float uint8Rec1 = 1.0f / 255.0f;
        alpha *= uint8Rec1;
    }

    ALWAYS_INLINE
    static void denormalizeAlpha(float &alpha)
    {
        const float uint8Max = 255.0f;
        alpha *= uint8Max;
    }

    PixelWrapper()
        : mask(quint32(0xFF))
        , uint8Max(255.0f)
        , uint8Rec1(1.0f / 255.0f)
    {
    }

    ALWAYS_INLINE void read(const void *src, float_v &dst_c1, float_v &dst_c2, float_v &dst_c3, float_v &dst_alpha)
    {
        dst_alpha = KoStreamedMath<_impl>::template fetch_alpha_32<false>(src);
        KoStreamedMath<_impl>::template fetch_colors_32<false>(src, dst_c1, dst_c2, dst_c3);

        dst_alpha *= uint8Rec1;
    }

    ALWAYS_INLINE
    void write(quint8 *dataDst, const float_v &c1, const float_v &c2, const float_v &c3, const float_v &a)
    {
        const auto alpha = a * uint8Max;

        KoStreamedMath<_impl>::write_channels_32_unaligned(dataDst, alpha, c1, c2, c3);
    }

    ALWAYS_INLINE
    void clearPixels(quint8 *dataDst)
    {
        memset(dataDst, 0, float_v::size * sizeof(quint8) * 4);
    }

    ALWAYS_INLINE
    void copyPixels(const quint8 *dataSrc, quint8 *dataDst)
    {
        memcpy(dataDst, dataSrc, float_v::size * sizeof(quint8) * 4);
    }

    const uint_v mask;
    const float_v uint8Max;
    const float_v uint8Rec1;
};

template<typename _impl>
struct PixelWrapper<float, _impl> {
    using int_v = xsimd::batch<int, _impl>;
    using uint_v = xsimd::batch<unsigned int, _impl>;
    using float_v = xsimd::batch<float, _impl>;

    static_assert(int_v::size == uint_v::size, "the selected architecture does not guarantee vector size equality!");
    static_assert(uint_v::size == float_v::size, "the selected architecture does not guarantee vector size equality!");

    struct Pixel {
        float red;
        float green;
        float blue;
        float alpha;
    };

    ALWAYS_INLINE
    static float lerpMixedUintFloat(float a, float b, float alpha)
    {
        return Arithmetic::lerp(a,b,alpha);
    }

    ALWAYS_INLINE
    static float roundFloatToUint(float x)
    {
        return x;
    }

    ALWAYS_INLINE
    static void normalizeAlpha(float &alpha)
    {
        Q_UNUSED(alpha);
    }

    ALWAYS_INLINE
    static void denormalizeAlpha(float &alpha)
    {
        Q_UNUSED(alpha);
    }

    PixelWrapper() = default;

    ALWAYS_INLINE void read(const void *src, float_v &dst_c1, float_v &dst_c2, float_v &dst_c3, float_v &dst_alpha)
    {
#if XSIMD_VERSION_MAJOR < 10
        KoRgbaInterleavers<32>::deinterleave(src, dst_c1, dst_c2, dst_c3, dst_alpha);
#else
        const auto srcPtr = reinterpret_cast<const typename float_v::value_type *>(src);
        const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 4; // stride == 4
        const auto idx2 = idx1 + 1;
        const auto idx3 = idx1 + 2;
        const auto idx4 = idx1 + 3;

        dst_c1 = float_v::gather(srcPtr, idx1);
        dst_c2 = float_v::gather(srcPtr, idx2);
        dst_c3 = float_v::gather(srcPtr, idx3);
        dst_alpha = float_v::gather(srcPtr, idx4);
#endif
    }

    ALWAYS_INLINE void
    write(void *dst, const float_v &src_c1, const float_v &src_c2, const float_v &src_c3, const float_v &src_alpha)
    {
#if XSIMD_VERSION_MAJOR < 10
        KoRgbaInterleavers<32>::interleave(dst, src_c1, src_c2, src_c3, src_alpha);
#else
        auto dstPtr = reinterpret_cast<typename float_v::value_type *>(dst);

        const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 4; // stride == 4
        const auto idx2 = idx1 + 1;
        const auto idx3 = idx1 + 2;
        const auto idx4 = idx1 + 3;

        src_c1.scatter(dstPtr, idx1);
        src_c2.scatter(dstPtr, idx2);
        src_c3.scatter(dstPtr, idx3);
        src_alpha.scatter(dstPtr, idx4);
#endif
    }

    ALWAYS_INLINE
    void clearPixels(quint8 *dataDst)
    {
        memset(dataDst, 0, float_v::size * sizeof(float) * 4);
    }

    ALWAYS_INLINE
    void copyPixels(const quint8 *dataSrc, quint8 *dataDst)
    {
        memcpy(dataDst, dataSrc, float_v::size * sizeof(float) * 4);
    }
};

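/**
 * Scalar helpers for clearing and copying a single pixel of a fixed
 * compile-time size.
 */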
namespace KoStreamedMathFunctions
{
template<int pixelSize>
ALWAYS_INLINE void clearPixel(quint8 *dst)
{
    std::memset(dst, 0, pixelSize);
}

template<int pixelSize>
ALWAYS_INLINE void copyPixel(const quint8 *src, quint8 *dst)
{
    std::memcpy(dst, src, pixelSize);
}
} // namespace KoStreamedMathFunctions

#endif /* __KOSTREAMED_MATH_H */