Krita Source Code Documentation
Loading...
Searching...
No Matches
KoStreamedMath.h
Go to the documentation of this file.
1/*
2 * SPDX-FileCopyrightText: 2012 Dmitry Kazakov <dimula73@gmail.com>
3 * SPDX-FileCopyrightText: 2020 Mathias Wein <lynx.mw+kde@gmail.com>
4 * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy@amyspark.me>
5 *
6 * SPDX-License-Identifier: LGPL-2.1-or-later
7 */
8
9#ifndef __KOSTREAMED_MATH_H
10#define __KOSTREAMED_MATH_H
11
12#include <cstdint>
13#include <cstring>
14#include <iostream>
15#include <type_traits>
17
18#if XSIMD_VERSION_MAJOR < 10
19#include <KoRgbaInterleavers.h>
20#endif
21
22#include <KoAlwaysInline.h>
23#include <KoCompositeOp.h>
24#include <KoColorSpaceMaths.h>
25
26#define BLOCKDEBUG 0
27
/**
 * OptiRound rounds a scalar float to the integral @p result_type using the
 * fastest round-to-nearest primitive available on the target architecture.
 *
 * Note on tie-breaking: the SSE and NEON paths implement the hardware's
 * round-half-to-even rule, while the generic fallback uses std::lroundf
 * (which rounds halves away from zero), so results on exact .5 inputs may
 * differ across platforms.
 */
template<typename _impl, typename result_type>
struct OptiRound {
    ALWAYS_INLINE static result_type roundScalar(const float value)
    {
#ifdef __SSE__
        // SSE/AVX instructions use round-to-even rounding rule so we
        // should reuse it when possible
        return _mm_cvtss_si32(_mm_set_ss(value));
#elif XSIMD_WITH_NEON64
        // AArch64: vrndiq_f32 rounds to integral, then vcvtnq_s32_f32
        // converts using the ties-to-even rule.
        return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vdupq_n_f32(value))),
                              0);
#elif XSIMD_WITH_NEON
        /* origin:
         * https://github.com/DLTcollab/sse2neon/blob/cad518a93b326f0f644b7972d488d04eaa2b0475/sse2neon.h#L4028-L4047
         */
        // Contributors to this work are:
        //   John W. Ratcliff <jratcliffscarab@gmail.com>
        //   Brandon Rowlett <browlett@nvidia.com>
        //   Ken Fast <kfast@gdeb.com>
        //   Eric van Beurden <evanbeurden@nvidia.com>
        //   Alexander Potylitsin <apotylitsin@nvidia.com>
        //   Hasindu Gamaarachchi <hasindu2008@gmail.com>
        //   Jim Huang <jserv@biilabs.io>
        //   Mark Cheng <marktwtn@biilabs.io>
        //   Malcolm James MacLeod <malcolm@gulden.com>
        //   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
        //   Sebastian Pop <spop@amazon.com>
        //   Developer Ecosystem Engineering
        //   <DeveloperEcosystemEngineering@apple.com> Danila Kutenin
        //   <danilak@google.com> François Turban (JishinMaster)
        //   <francois.turban@gmail.com> Pei-Hsuan Hung <afcidk@gmail.com>
        //   Yang-Hao Yuan <yanghau@biilabs.io>
        //   Syoyo Fujita <syoyo@lighttransport.com>
        //   Brecht Van Lommel <brecht@blender.org>

        /*
         * sse2neon is freely redistributable under the MIT License.
         *
         * Permission is hereby granted, free of charge, to any person obtaining
         * a copy of this software and associated documentation files (the
         * "Software"), to deal in the Software without restriction, including
         * without limitation the rights to use, copy, modify, merge, publish,
         * distribute, sublicense, and/or sell copies of the Software, and to
         * permit persons to whom the Software is furnished to do so, subject to
         * the following conditions:
         *
         * The above copyright notice and this permission notice shall be
         * included in all copies or substantial portions of the Software.
         *
         * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
         * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
         * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
         * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
         * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
         * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
         * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
         * SOFTWARE.
         */
        // 32-bit NEON has no ties-to-even float->int conversion;
        // emulate round-half-to-even by hand:
        const auto nearbyint_as_int = [](const float v) {
            const auto a = vdupq_n_f32(v);
            const auto signmask = vdupq_n_u32(0x80000000);
            const auto half =
                vbslq_f32(signmask, a, vdupq_n_f32(0.5f)); /* +/- 0.5 */
            const auto r_normal = vcvtq_s32_f32(
                vaddq_f32(a, half)); /* round to integer: [a + 0.5]*/
            const auto r_trunc =
                vcvtq_s32_f32(a); /* truncate to integer: [a] */
            const auto plusone = vreinterpretq_s32_u32(
                vshrq_n_u32(vreinterpretq_u32_s32(vnegq_s32(r_trunc)),
                            31)); /* 1 or 0 */
            const auto r_even =
                vbicq_s32(vaddq_s32(r_trunc, plusone),
                          vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
            const auto delta = vsubq_f32(
                a,
                vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
            const auto is_delta_half =
                vceqq_f32(delta, half); /* delta == +/- 0.5 */
            return vbslq_s32(is_delta_half, r_even, r_normal);
        };
        return vgetq_lane_s32(nearbyint_as_int(value), 0);
#else
        // Generic fallback: rounds halves away from zero.
        return std::lroundf(value);
#endif
    }
};
114
115#if !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE)
116
/**
 * OptiDiv performs fast (approximate) division via the hardware's
 * reciprocal-estimate instructions where available.
 *
 * NOTE: _mm_rcp_ss / vrecpeq_f32 are low-precision estimates (roughly
 * 12 bits of mantissa), so results are NOT bit-identical to a true IEEE
 * division — acceptable for 8/16-bit pixel math, by design.
 */
template<typename _impl>
struct OptiDiv {
    using float_v = xsimd::batch<float, _impl>;

    /// Approximate scalar division: divident / divisor.
    ALWAYS_INLINE static float divScalar(const float &divident, const float &divisor)
    {
#ifdef __SSE__
        float result = NAN;

        __m128 x = _mm_set_ss(divisor);
        __m128 y = _mm_set_ss(divident);
        // reciprocal estimate of the divisor, then multiply
        x = _mm_rcp_ss(x);
        x = _mm_mul_ss(x, y);

        _mm_store_ss(&result, x);
        return result;
#elif defined __ARM_NEON
        auto x = vdupq_n_f32(divisor);
        auto y = vdupq_n_f32(divident);
        x = vrecpeq_f32(x);
        x = vmulq_f32(x, y);

        return vgetq_lane_f32(x, 0);
#else
        // Portable fallback: exact IEEE division via reciprocal multiply.
        return (1.f / divisor) * divident;
#endif
    }

    /// Approximate vector division: divident / divisor, lane-wise.
    ALWAYS_INLINE static float_v divVector(const float_v &divident, const float_v &divisor)
    {
        return divident * xsimd::reciprocal(divisor);
    }
};
150
151template<typename _impl>
    // Signed, unsigned and float SIMD batches for the selected
    // architecture. All compositing code below assumes the three batch
    // types have the same number of lanes, which the static_asserts
    // enforce at compile time.
    using int_v = xsimd::batch<int, _impl>;
    using uint_v = xsimd::batch<unsigned int, _impl>;
    using float_v = xsimd::batch<float, _impl>;

    static_assert(int_v::size == uint_v::size, "the selected architecture does not guarantee vector size equality!");
    static_assert(uint_v::size == float_v::size, "the selected architecture does not guarantee vector size equality!");
159
163 template<bool useMask, bool useFlow, class Compositor, int pixelSize>
165 {
166 const qint32 linearInc = pixelSize;
167 qint32 srcLinearInc = params.srcRowStride ? pixelSize : 0;
168
169 quint8 *dstRowStart = params.dstRowStart;
170 const quint8 *maskRowStart = params.maskRowStart;
171 const quint8 *srcRowStart = params.srcRowStart;
172 typename Compositor::ParamsWrapper paramsWrapper(params);
173
174 for (qint32 r = params.rows; r > 0; --r) {
175 const quint8 *mask = maskRowStart;
176 const quint8 *src = srcRowStart;
177 quint8 *dst = dstRowStart;
178
179 int blockRest = params.cols;
180
181 for (int i = 0; i < blockRest; i++) {
182 Compositor::template compositeOnePixelScalar<useMask, _impl>(src,
183 dst,
184 mask,
185 params.opacity,
186 paramsWrapper);
187 src += srcLinearInc;
188 dst += linearInc;
189
190 if (useMask) {
191 mask++;
192 }
193 }
194
195 srcRowStart += params.srcRowStride;
196 dstRowStart += params.dstRowStride;
197
198 if (useMask) {
199 maskRowStart += params.maskRowStride;
200 }
201 }
202 }
203
204 template<bool useMask, bool useFlow, class Compositor>
206 {
207 genericComposite_novector<useMask, useFlow, Compositor, 4>(params);
208 }
209
210 template<bool useMask, bool useFlow, class Compositor>
212 {
213 genericComposite_novector<useMask, useFlow, Compositor, 16>(params);
214 }
215
216 template<bool useMask, bool useFlow, class Compositor>
218 {
219 genericComposite_novector<useMask, useFlow, Compositor, 8>(params);
220 }
221
222 static inline quint8 round_float_to_u8(float x)
223 {
225 }
226
227 static inline quint8 lerp_mixed_u8_float(quint8 a, quint8 b, float alpha)
228 {
229 return round_float_to_u8(float(b - a) * alpha + float(a));
230 }
231
    /**
     * Load float_v::size consecutive 8-bit mask values and widen them
     * to a float batch (one lane per mask byte).
     */
    static inline float_v fetch_mask_8(const quint8 *data)
    {
        return xsimd::batch_cast<float>(xsimd::load_and_extend<int_v>(data));
    }
240
    /**
     * Load float_v::size 32-bit pixels and return the alpha channel
     * (the most significant byte of each pixel, hence the >> 24) widened
     * to float. @p aligned selects an aligned or unaligned vector load.
     */
    template<bool aligned>
    static inline float_v fetch_alpha_32(const void *data)
    {
        using U = typename std::conditional<aligned, xsimd::aligned_mode, xsimd::unaligned_mode>::type;
        const auto data_i = uint_v::load(static_cast<const typename uint_v::value_type *>(data), U{});
        return xsimd::to_float(xsimd::bitwise_cast_compat<int>(data_i >> 24));
    }
260
    /**
     * Load float_v::size 32-bit pixels and unpack the three color channels
     * to float batches: c1 from bits 16-23, c2 from bits 8-15 and c3 from
     * bits 0-7. The top byte (alpha) is fetched separately by
     * fetch_alpha_32(). @p aligned selects the vector load mode.
     */
    template<bool aligned>
    static inline void fetch_colors_32(const void *data, float_v &c1, float_v &c2, float_v &c3)
    {
        using U = typename std::conditional<aligned, xsimd::aligned_mode, xsimd::unaligned_mode>::type;

        const auto data_i = uint_v::load(static_cast<const typename uint_v::value_type *>(data), U{});

        // Select one byte-wide channel out of each 32-bit pixel.
        const uint_v mask(0xFF);

        c1 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((data_i >> 16) & mask));
        c2 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((data_i >> 8) & mask));
        c3 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((data_i) & mask));
    }
286
    /**
     * Round the four float channel batches to integers and repack them into
     * 32-bit pixels (alpha in the top byte, then c1, c2, c3 — the inverse of
     * fetch_alpha_32/fetch_colors_32), storing with an aligned vector store.
     * Color channels are masked to 8 bits after rounding; alpha is not
     * masked and is assumed to already be within the 8-bit range.
     */
    static inline void
    write_channels_32(void *data, const float_v alpha, const float_v c1, const float_v c2, const float_v c3)
    {
        const int_v mask(0xFF);

        const auto v1 = (xsimd::nearbyint_as_int(alpha)) << 24;
        const auto v2 = (xsimd::nearbyint_as_int(c1) & mask) << 16;
        const auto v3 = (xsimd::nearbyint_as_int(c2) & mask) << 8;
        const auto v4 = (xsimd::nearbyint_as_int(c3) & mask);
        xsimd::store_aligned(static_cast<typename int_v::value_type *>(data), (v1 | v2) | (v3 | v4));
    }
306
    /**
     * Same packing as write_channels_32(), but stores with an unaligned
     * vector store so @p data may sit at any address.
     */
    static inline void
    write_channels_32_unaligned(void *data, const float_v alpha, const float_v c1, const float_v c2, const float_v c3)
    {
        const int_v mask(0xFF);

        const auto v1 = (xsimd::nearbyint_as_int(alpha)) << 24;
        const auto v2 = (xsimd::nearbyint_as_int(c1) & mask) << 16;
        const auto v3 = (xsimd::nearbyint_as_int(c2) & mask) << 8;
        const auto v4 = (xsimd::nearbyint_as_int(c3) & mask);
        xsimd::store_unaligned(static_cast<typename int_v::value_type *>(data), (v1 | v2) | (v3 | v4));
    }
318
324 template<bool useMask, bool useFlow, class Compositor, int pixelSize>
326 {
327 const int vectorSize = static_cast<int>(float_v::size);
328 const qint32 vectorInc = pixelSize * vectorSize;
329 const qint32 linearInc = pixelSize;
330 qint32 srcVectorInc = vectorInc;
331 qint32 srcLinearInc = pixelSize;
332
333 quint8 *dstRowStart = params.dstRowStart;
334 const quint8 *maskRowStart = params.maskRowStart;
335 const quint8 *srcRowStart = params.srcRowStart;
336 typename Compositor::ParamsWrapper paramsWrapper(params);
337
338 if (!params.srcRowStride) {
339 if (pixelSize == 4) {
340 auto *buf = reinterpret_cast<uint_v *>(xsimd::vector_aligned_malloc<typename uint_v::value_type>(vectorSize));
341 *buf = uint_v(*(reinterpret_cast<const quint32 *>(srcRowStart)));
342 srcRowStart = reinterpret_cast<quint8 *>(buf);
343 srcLinearInc = 0;
344 srcVectorInc = 0;
345 } else {
346 auto *buf = xsimd::vector_aligned_malloc<quint8>(vectorInc);
347 quint8 *ptr = buf;
348
349 for (size_t i = 0; i < vectorSize; i++) {
350 memcpy(ptr, params.srcRowStart, pixelSize);
351 ptr += pixelSize;
352 }
353
354 srcRowStart = buf;
355 srcLinearInc = 0;
356 srcVectorInc = 0;
357 }
358 }
359#if BLOCKDEBUG
360 int totalBlockAlign = 0;
361 int totalBlockAlignedVector = 0;
362 int totalBlockUnalignedVector = 0;
363 int totalBlockRest = 0;
364#endif
365
366 for (qint32 r = params.rows; r > 0; --r) {
367 // Hint: Mask is allowed to be unaligned
368 const quint8 *mask = maskRowStart;
369
370 const quint8 *src = srcRowStart;
371 quint8 *dst = dstRowStart;
372
373 const int pixelsAlignmentMask = vectorSize * sizeof(float) - 1;
374 auto srcPtrValue = reinterpret_cast<uintptr_t>(src);
375 auto dstPtrValue = reinterpret_cast<uintptr_t>(dst);
376 uintptr_t srcAlignment = srcPtrValue & pixelsAlignmentMask;
377 uintptr_t dstAlignment = dstPtrValue & pixelsAlignmentMask;
378
379 // Uncomment if facing problems with alignment:
380 // Q_ASSERT_X(!(dstAlignment & 3), "Compositing",
381 // "Pixel data must be aligned on pixels borders!");
382
383 int blockAlign = params.cols;
384 int blockAlignedVector = 0;
385 int blockUnalignedVector = 0;
386 int blockRest = 0;
387
388 int *vectorBlock =
389 srcAlignment == dstAlignment || !srcVectorInc ? &blockAlignedVector : &blockUnalignedVector;
390
391 if (!dstAlignment) {
392 blockAlign = 0;
393 *vectorBlock = params.cols / vectorSize;
394 blockRest = params.cols % vectorSize;
395 } else if (params.cols > 2 * vectorSize) {
396 blockAlign = (vectorInc - dstAlignment) / pixelSize;
397 const int restCols = params.cols - blockAlign;
398 if (restCols > 0) {
399 *vectorBlock = restCols / vectorSize;
400 blockRest = restCols % vectorSize;
401 } else {
402 blockAlign = params.cols;
403 *vectorBlock = 0;
404 blockRest = 0;
405 }
406 }
407#if BLOCKDEBUG
408 totalBlockAlign += blockAlign;
409 totalBlockAlignedVector += blockAlignedVector;
410 totalBlockUnalignedVector += blockUnalignedVector;
411 totalBlockRest += blockRest;
412#endif
413
414 for (int i = 0; i < blockAlign; i++) {
415 Compositor::template compositeOnePixelScalar<useMask, _impl>(src,
416 dst,
417 mask,
418 params.opacity,
419 paramsWrapper);
420 src += srcLinearInc;
421 dst += linearInc;
422
423 if (useMask) {
424 mask++;
425 }
426 }
427
428 for (int i = 0; i < blockAlignedVector; i++) {
429 Compositor::template compositeVector<useMask, true, _impl>(src,
430 dst,
431 mask,
432 params.opacity,
433 paramsWrapper);
434 src += srcVectorInc;
435 dst += vectorInc;
436
437 if (useMask) {
438 mask += vectorSize;
439 }
440 }
441
442 for (int i = 0; i < blockUnalignedVector; i++) {
443 Compositor::template compositeVector<useMask, false, _impl>(src,
444 dst,
445 mask,
446 params.opacity,
447 paramsWrapper);
448 src += srcVectorInc;
449 dst += vectorInc;
450
451 if (useMask) {
452 mask += vectorSize;
453 }
454 }
455
456 for (int i = 0; i < blockRest; i++) {
457 Compositor::template compositeOnePixelScalar<useMask, _impl>(src,
458 dst,
459 mask,
460 params.opacity,
461 paramsWrapper);
462 src += srcLinearInc;
463 dst += linearInc;
464
465 if (useMask) {
466 mask++;
467 }
468 }
469
470 srcRowStart += params.srcRowStride;
471 dstRowStart += params.dstRowStride;
472
473 if (useMask) {
474 maskRowStart += params.maskRowStride;
475 }
476 }
477
478#if BLOCKDEBUG
479 dbgPigment << "I"
480 << "rows:" << params.rows << "\tpad(S):" << totalBlockAlign << "\tbav(V):" << totalBlockAlignedVector
481 << "\tbuv(V):" << totalBlockUnalignedVector << "\tres(S)"
482 << totalBlockRest; // << srcAlignment << dstAlignment;
483#endif
484
485 if (!params.srcRowStride) {
486 xsimd::vector_aligned_free(srcRowStart);
487 }
488 }
489
490 template<bool useMask, bool useFlow, class Compositor>
492 {
493 genericComposite<useMask, useFlow, Compositor, 4>(params);
494 }
495
496 template<bool useMask, bool useFlow, class Compositor>
498 {
499 genericComposite<useMask, useFlow, Compositor, 16>(params);
500 }
501
502 template<bool useMask, bool useFlow, class Compositor>
504 {
505 genericComposite<useMask, useFlow, Compositor, 8>(params);
506 }
507};
508
509template<typename channels_type, class _impl>
511 using float_v = xsimd::batch<float, _impl>;
512 using float_m = typename float_v::batch_bool_type;
513
515 PixelStateRecoverHelper(const float_v &c1, const float_v &c2, const float_v &c3)
516 {
517 Q_UNUSED(c1);
518 Q_UNUSED(c2);
519 Q_UNUSED(c3);
520 }
521
523 void recoverPixels(const float_m &mask, float_v &c1, float_v &c2, float_v &c3) const {
524 Q_UNUSED(mask);
525 Q_UNUSED(c1);
526 Q_UNUSED(c2);
527 Q_UNUSED(c3);
528 }
529};
530
531template<class _impl>
532struct PixelStateRecoverHelper<float, _impl> {
533 using float_v = xsimd::batch<float, _impl>;
534 using float_m = typename float_v::batch_bool_type;
535
537 PixelStateRecoverHelper(const float_v &c1, const float_v &c2, const float_v &c3)
538 : m_orig_c1(c1),
539 m_orig_c2(c2),
540 m_orig_c3(c3)
541 {
542 }
543
545 void recoverPixels(const float_m &mask, float_v &c1, float_v &c2, float_v &c3) const {
546 if (xsimd::any(mask)) {
547 c1 = xsimd::select(mask, m_orig_c1, c1);
548 c2 = xsimd::select(mask, m_orig_c2, c2);
549 c3 = xsimd::select(mask, m_orig_c3, c3);
550 }
551 }
552
553private:
557};
558
/**
 * Per-channel-type pixel load/store helper used by the vectorized
 * compositing ops. The primary template is intentionally empty; only the
 * quint16, quint8 and float specializations below are usable.
 *
 * NOTE(review): the struct name line was dropped by the extraction and is
 * restored here (confirmed by the specializations below).
 */
template<typename channels_type, class _impl>
struct PixelWrapper
{
};
563
564template<class _impl>
565struct PixelWrapper<quint16, _impl> {
566 using int_v = xsimd::batch<int, _impl>;
567 using uint_v = xsimd::batch<unsigned int, _impl>;
568 using float_v = xsimd::batch<float, _impl>;
569
570 static_assert(int_v::size == uint_v::size, "the selected architecture does not guarantee vector size equality!");
571 static_assert(uint_v::size == float_v::size, "the selected architecture does not guarantee vector size equality!");
572
574 static quint16 lerpMixedUintFloat(quint16 a, quint16 b, float alpha)
575 {
576 return OptiRound<_impl, quint16>::roundScalar((float(b) - a) * alpha + float(a));
577 }
578
580 static quint16 roundFloatToUint(float x)
581 {
583 }
584
586 static void normalizeAlpha(float &alpha)
587 {
588 const float uint16Rec1 = 1.0f / 65535.0f;
589 alpha *= uint16Rec1;
590 }
591
593 static void denormalizeAlpha(float &alpha)
594 {
595 const float uint16Max = 65535.0f;
596 alpha *= uint16Max;
597 }
598
600 : mask(0xFFFF)
601 , uint16Max(65535.0f)
602 , uint16Rec1(1.0f / 65535.0f)
603 {
604 }
605
606 ALWAYS_INLINE void read(const void *src, float_v &dst_c1, float_v &dst_c2, float_v &dst_c3, float_v &dst_alpha)
607 {
608 // struct PackedPixel {
609 // float rrgg;
610 // float bbaa;
611 // }
612#if XSIMD_VERSION_MAJOR < 10
613 uint_v pixelsC1C2;
614 uint_v pixelsC3Alpha;
615 KoRgbaInterleavers<16>::deinterleave(src, pixelsC1C2, pixelsC3Alpha);
616#else
617 const auto *srcPtr = static_cast<const typename uint_v::value_type *>(src);
618 const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 2; // stride == 2
619 const auto idx2 = idx1 + 1; // offset 1 == 2nd members
620
621 const auto pixelsC1C2 = uint_v::gather(srcPtr, idx1);
622 const auto pixelsC3Alpha = uint_v::gather(srcPtr, idx2);
623#endif
624
625 dst_c1 = xsimd::to_float(xsimd::bitwise_cast_compat<int>(pixelsC1C2 & mask)); // r
626 dst_c2 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((pixelsC1C2 >> 16) & mask)); // g
627 dst_c3 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((pixelsC3Alpha & mask))); // b
628 dst_alpha = xsimd::to_float(xsimd::bitwise_cast_compat<int>((pixelsC3Alpha >> 16) & mask)); // a
629
630 dst_alpha *= uint16Rec1;
631 }
632
633 ALWAYS_INLINE void write(void *dst, const float_v &c1, const float_v &c2, const float_v &c3, const float_v &a)
634 {
635 const auto alpha = a * uint16Max;
636
637 const auto v1 = xsimd::bitwise_cast_compat<unsigned int>(xsimd::nearbyint_as_int(c1));
638 const auto v2 = xsimd::bitwise_cast_compat<unsigned int>(xsimd::nearbyint_as_int(c2));
639 const auto v3 = xsimd::bitwise_cast_compat<unsigned int>(xsimd::nearbyint_as_int(c3));
640 const auto v4 = xsimd::bitwise_cast_compat<unsigned int>(xsimd::nearbyint_as_int(alpha));
641
642 const auto c1c2 = ((v2 & mask) << 16) | (v1 & mask);
643 const auto c3ca = ((v4 & mask) << 16) | (v3 & mask);
644
645#if XSIMD_VERSION_MAJOR < 10
646 KoRgbaInterleavers<16>::interleave(dst, c1c2, c3ca);
647#else
648 auto dstPtr = reinterpret_cast<typename int_v::value_type *>(dst);
649
650 const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 2;
651 const auto idx2 = idx1 + 1;
652
653 c1c2.scatter(dstPtr, idx1);
654 c3ca.scatter(dstPtr, idx2);
655#endif
656 }
657
659 void clearPixels(quint8 *dataDst)
660 {
661 memset(dataDst, 0, float_v::size * sizeof(quint16) * 4);
662 }
663
665 void copyPixels(const quint8 *dataSrc, quint8 *dataDst)
666 {
667 memcpy(dataDst, dataSrc, float_v::size * sizeof(quint16) * 4);
668 }
669
673};
674
675template<typename _impl>
676struct PixelWrapper<quint8, _impl> {
677 using int_v = xsimd::batch<int, _impl>;
678 using uint_v = xsimd::batch<unsigned int, _impl>;
679 using float_v = xsimd::batch<float, _impl>;
680
681 static_assert(int_v::size == uint_v::size, "the selected architecture does not guarantee vector size equality!");
682 static_assert(uint_v::size == float_v::size, "the selected architecture does not guarantee vector size equality!");
683
685 static quint8 lerpMixedUintFloat(quint8 a, quint8 b, float alpha)
686 {
688 }
689
691 static quint8 roundFloatToUint(float x)
692 {
694 }
695
697 static void normalizeAlpha(float &alpha)
698 {
699 const float uint8Rec1 = 1.0f / 255.0f;
700 alpha *= uint8Rec1;
701 }
702
704 static void denormalizeAlpha(float &alpha)
705 {
706 const float uint8Max = 255.0f;
707 alpha *= uint8Max;
708 }
709
711 : mask(quint32(0xFF))
712 , uint8Max(255.0f)
713 , uint8Rec1(1.0f / 255.0f)
714 {
715 }
716
717 ALWAYS_INLINE void read(const void *src, float_v &dst_c1, float_v &dst_c2, float_v &dst_c3, float_v &dst_alpha)
718 {
719 dst_alpha = KoStreamedMath<_impl>::template fetch_alpha_32<false>(src);
720 KoStreamedMath<_impl>::template fetch_colors_32<false>(src, dst_c1, dst_c2, dst_c3);
721
722 dst_alpha *= uint8Rec1;
723 }
724
726 void write(quint8 *dataDst, const float_v &c1, const float_v &c2, const float_v &c3, const float_v &a)
727 {
728 const auto alpha = a * uint8Max;
729
730 KoStreamedMath<_impl>::write_channels_32_unaligned(dataDst, alpha, c1, c2, c3);
731 }
732
734 void clearPixels(quint8 *dataDst)
735 {
736 memset(dataDst, 0, float_v::size * sizeof(quint8) * 4);
737 }
738
740 void copyPixels(const quint8 *dataSrc, quint8 *dataDst)
741 {
742 memcpy(dataDst, dataSrc, float_v::size * sizeof(quint8) * 4);
743 }
744
748};
749
750template<typename _impl>
751struct PixelWrapper<float, _impl> {
752 using int_v = xsimd::batch<int, _impl>;
753 using uint_v = xsimd::batch<unsigned int, _impl>;
754 using float_v = xsimd::batch<float, _impl>;
755
756 static_assert(int_v::size == uint_v::size, "the selected architecture does not guarantee vector size equality!");
757 static_assert(uint_v::size == float_v::size, "the selected architecture does not guarantee vector size equality!");
758
759 struct Pixel {
760 float red;
761 float green;
762 float blue;
763 float alpha;
764 };
765
767 static float lerpMixedUintFloat(float a, float b, float alpha)
768 {
769 return Arithmetic::lerp(a,b,alpha);
770 }
771
773 static float roundFloatToUint(float x)
774 {
775 return x;
776 }
777
779 static void normalizeAlpha(float &alpha)
780 {
781 Q_UNUSED(alpha);
782 }
783
785 static void denormalizeAlpha(float &alpha)
786 {
787 Q_UNUSED(alpha);
788 }
789
790 PixelWrapper() = default;
791
792 ALWAYS_INLINE void read(const void *src, float_v &dst_c1, float_v &dst_c2, float_v &dst_c3, float_v &dst_alpha)
793 {
794#if XSIMD_VERSION_MAJOR < 10
795 KoRgbaInterleavers<32>::deinterleave(src, dst_c1, dst_c2, dst_c3, dst_alpha);
796#else
797 const auto srcPtr = reinterpret_cast<const typename float_v::value_type *>(src);
798 const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 4; // stride == 4
799 const auto idx2 = idx1 + 1;
800 const auto idx3 = idx1 + 2;
801 const auto idx4 = idx1 + 3;
802
803 dst_c1 = float_v::gather(srcPtr, idx1);
804 dst_c2 = float_v::gather(srcPtr, idx2);
805 dst_c3 = float_v::gather(srcPtr, idx3);
806 dst_alpha = float_v::gather(srcPtr, idx4);
807#endif
808 }
809
810 ALWAYS_INLINE void
811 write(void *dst, const float_v &src_c1, const float_v &src_c2, const float_v &src_c3, const float_v &src_alpha)
812 {
813#if XSIMD_VERSION_MAJOR < 10
814 KoRgbaInterleavers<32>::interleave(dst, src_c1, src_c2, src_c3, src_alpha);
815#else
816 auto dstPtr = reinterpret_cast<typename float_v::value_type *>(dst);
817
818 const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 4; // stride == 4
819 const auto idx2 = idx1 + 1;
820 const auto idx3 = idx1 + 2;
821 const auto idx4 = idx1 + 3;
822
823 src_c1.scatter(dstPtr, idx1);
824 src_c2.scatter(dstPtr, idx2);
825 src_c3.scatter(dstPtr, idx3);
826 src_alpha.scatter(dstPtr, idx4);
827#endif
828 }
829
831 void clearPixels(quint8 *dataDst)
832 {
833 memset(dataDst, 0, float_v::size * sizeof(float) * 4);
834 }
835
837 void copyPixels(const quint8 *dataSrc, quint8 *dataDst)
838 {
839 memcpy(dataDst, dataSrc, float_v::size * sizeof(float) * 4);
840 }
841};
842
843#endif /* !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) */
844
846{
847template<int pixelSize>
848ALWAYS_INLINE void clearPixel(quint8 *dst)
849{
850 std::memset(dst, 0, pixelSize);
851}
852
853template<int pixelSize>
854ALWAYS_INLINE void copyPixel(const quint8 *src, quint8 *dst)
855{
856 std::memcpy(dst, src, pixelSize);
857}
858} // namespace KoStreamedMathFunctions
859
860#endif /* __KOSTREAMED_MATH_H */
#define dbgPigment
float value(const T *src, size_t ch)
qreal v
#define ALWAYS_INLINE
T lerp(T a, T b, T alpha)
ALWAYS_INLINE void clearPixel(quint8 *dst)
ALWAYS_INLINE void copyPixel(const quint8 *src, quint8 *dst)
void vector_aligned_free(const T *ptr) noexcept
static void genericComposite32(const KoCompositeOp::ParameterInfo &params)
static void genericComposite_novector(const KoCompositeOp::ParameterInfo &params)
static void genericComposite64_novector(const KoCompositeOp::ParameterInfo &params)
xsimd::batch< unsigned int, _impl > uint_v
static void write_channels_32(void *data, const float_v alpha, const float_v c1, const float_v c2, const float_v c3)
static void genericComposite128(const KoCompositeOp::ParameterInfo &params)
static void write_channels_32_unaligned(void *data, const float_v alpha, const float_v c1, const float_v c2, const float_v c3)
static void genericComposite(const KoCompositeOp::ParameterInfo &params)
xsimd::batch< float, _impl > float_v
static float_v fetch_mask_8(const quint8 *data)
static quint8 lerp_mixed_u8_float(quint8 a, quint8 b, float alpha)
static void fetch_colors_32(const void *data, float_v &c1, float_v &c2, float_v &c3)
static float_v fetch_alpha_32(const void *data)
static void genericComposite64(const KoCompositeOp::ParameterInfo &params)
static quint8 round_float_to_u8(float x)
static void genericComposite32_novector(const KoCompositeOp::ParameterInfo &params)
xsimd::batch< int, _impl > int_v
static void genericComposite128_novector(const KoCompositeOp::ParameterInfo &params)
xsimd::batch< float, _impl > float_v
static ALWAYS_INLINE float divScalar(const float &divident, const float &divisor)
static ALWAYS_INLINE float_v divVector(const float_v &divident, const float_v &divisor)
static ALWAYS_INLINE result_type roundScalar(const float value)
xsimd::batch< float, _impl > float_v
typename float_v::batch_bool_type float_m
ALWAYS_INLINE void recoverPixels(const float_m &mask, float_v &c1, float_v &c2, float_v &c3) const
ALWAYS_INLINE PixelStateRecoverHelper(const float_v &c1, const float_v &c2, const float_v &c3)
typename float_v::batch_bool_type float_m
ALWAYS_INLINE void recoverPixels(const float_m &mask, float_v &c1, float_v &c2, float_v &c3) const
xsimd::batch< float, _impl > float_v
ALWAYS_INLINE PixelStateRecoverHelper(const float_v &c1, const float_v &c2, const float_v &c3)
xsimd::batch< int, _impl > int_v
xsimd::batch< unsigned int, _impl > uint_v
ALWAYS_INLINE void copyPixels(const quint8 *dataSrc, quint8 *dataDst)
xsimd::batch< float, _impl > float_v
static ALWAYS_INLINE float lerpMixedUintFloat(float a, float b, float alpha)
static ALWAYS_INLINE void denormalizeAlpha(float &alpha)
ALWAYS_INLINE void write(void *dst, const float_v &src_c1, const float_v &src_c2, const float_v &src_c3, const float_v &src_alpha)
ALWAYS_INLINE void clearPixels(quint8 *dataDst)
static ALWAYS_INLINE float roundFloatToUint(float x)
static ALWAYS_INLINE void normalizeAlpha(float &alpha)
ALWAYS_INLINE void read(const void *src, float_v &dst_c1, float_v &dst_c2, float_v &dst_c3, float_v &dst_alpha)
xsimd::batch< int, _impl > int_v
static ALWAYS_INLINE void denormalizeAlpha(float &alpha)
ALWAYS_INLINE void read(const void *src, float_v &dst_c1, float_v &dst_c2, float_v &dst_c3, float_v &dst_alpha)
ALWAYS_INLINE void clearPixels(quint8 *dataDst)
xsimd::batch< float, _impl > float_v
static ALWAYS_INLINE quint16 roundFloatToUint(float x)
static ALWAYS_INLINE void normalizeAlpha(float &alpha)
xsimd::batch< unsigned int, _impl > uint_v
ALWAYS_INLINE void copyPixels(const quint8 *dataSrc, quint8 *dataDst)
static ALWAYS_INLINE quint16 lerpMixedUintFloat(quint16 a, quint16 b, float alpha)
ALWAYS_INLINE void write(void *dst, const float_v &c1, const float_v &c2, const float_v &c3, const float_v &a)
xsimd::batch< int, _impl > int_v
static ALWAYS_INLINE void normalizeAlpha(float &alpha)
static ALWAYS_INLINE quint8 roundFloatToUint(float x)
xsimd::batch< float, _impl > float_v
static ALWAYS_INLINE void denormalizeAlpha(float &alpha)
ALWAYS_INLINE void write(quint8 *dataDst, const float_v &c1, const float_v &c2, const float_v &c3, const float_v &a)
xsimd::batch< unsigned int, _impl > uint_v
ALWAYS_INLINE void copyPixels(const quint8 *dataSrc, quint8 *dataDst)
ALWAYS_INLINE void clearPixels(quint8 *dataDst)
ALWAYS_INLINE void read(const void *src, float_v &dst_c1, float_v &dst_c2, float_v &dst_c3, float_v &dst_alpha)
static ALWAYS_INLINE quint8 lerpMixedUintFloat(quint8 a, quint8 b, float alpha)