Krita Source Code Documentation
KoRgbaInterleavers.h
/*
 * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy@amyspark.me>
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#ifndef KO_RGBA_INTERLEAVERS
#define KO_RGBA_INTERLEAVERS

#include <xsimd_extensions/xsimd.hpp>

#if XSIMD_VERSION_MAJOR >= 10
#error "The interleavers use per-lane zipping semantics, which are not compatible with xsimd 10"
#endif

using namespace xsimd;

template<typename T, size_t S>
using enable_sized_t = typename std::enable_if<sizeof(T) == S, int>::type;

template<typename T, size_t S>
using enable_sized_integral_t =
    typename std::enable_if<std::is_integral<T>::value && sizeof(T) == S,
                            int>::type;

template<typename T, typename A, size_t S>
using enable_sized_vector_t = typename std::enable_if<batch<T, A>::size == S, int>::type;

#if XSIMD_WITH_AVX2
template<typename A>
inline batch<float, A> exchange_mid_halves(batch<float, A> const &a, kernel::requires_arch<avx2>) noexcept
{
    return _mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd(a.data), 0xD8));
}

template<typename T, typename A, enable_sized_integral_t<T, 4> = 0>
inline batch<T, A> exchange_mid_halves(batch<T, A> const &a, kernel::requires_arch<avx2>) noexcept
{
    return _mm256_permute4x64_epi64(a.data, 0xD8);
}
#endif

#if XSIMD_WITH_AVX
template<typename A>
inline batch<float, A> merge_low(batch<float, A> const &a, batch<float, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_insertf128_ps(a, _mm256_castps256_ps128(b), 1);
}

template<typename T, typename A, enable_sized_integral_t<T, 4> = 0>
inline batch<T, A> merge_low(batch<T, A> const &a, batch<T, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_insertf128_si256(a, _mm256_castsi256_si128(b), 1);
}

template<typename A>
inline batch<float, A> merge_high(batch<float, A> const &a, batch<float, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_permute2f128_ps(a, b, 0x31);
}

template<typename T, typename A, enable_sized_integral_t<T, 4> = 0>
inline batch<T, A> merge_high(batch<T, A> const &a, batch<T, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_permute2f128_si256(a, b, 0x31);
}

template<typename A>
inline batch<float, A> duplicate_low_halves(batch<float, A> const &a, batch<float, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
}

template<typename T, typename A, enable_sized_integral_t<T, 4> = 0>
inline batch<T, A> duplicate_low_halves(batch<T, A> const &a, batch<T, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(2, 0, 2, 0)));
}

template<typename A>
inline batch<float, A> duplicate_high_halves(batch<float, A> const &a, batch<float, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
}

template<typename T, typename A, enable_sized_integral_t<T, 4> = 0>
inline batch<T, A> duplicate_high_halves(batch<T, A> const &a, batch<T, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(3, 1, 3, 1)));
}
#endif

template<size_t N>
struct KoRgbaInterleavers;

template<>
struct KoRgbaInterleavers<16> {
    template<bool aligned, typename T, typename A, enable_sized_integral_t<T, 4> = 0, enable_sized_vector_t<T, A, 4> = 0>
    static inline void interleave(void *dst, batch<T, A> const &a, batch<T, A> const &b, kernel::requires_arch<generic>)
    {
        auto *dstPtr = static_cast<T *>(dst);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;
        const auto t1 = zip_lo(a, b);
        const auto t2 = zip_hi(a, b);
        t1.store(dstPtr, U{});
        t2.store(dstPtr + batch<T, A>::size, U{});
    }

    // The AVX versions are handmade ports of the ones generated
    // by Clang 14.0.0: https://godbolt.org/z/Ts8MWosW3
    // Except for interleave(avx) which comes from GCC 11.2

#if XSIMD_WITH_AVX
    template<bool aligned, typename T, typename A, enable_sized_t<T, 4> = 0>
    static inline void interleave(void *dst, batch<T, A> const &a, batch<T, A> const &b, kernel::requires_arch<avx>)
    {
        auto *dstPtr = static_cast<T *>(dst);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;
        const auto t1 = zip_lo(a, b);
        const auto t2 = zip_hi(a, b);
        const auto src1 = merge_low(t1, t2, A{});
        const auto src2 = merge_high(t1, t2, A{});
        src1.store(dstPtr, U{});
        src2.store(dstPtr + batch<T, A>::size, U{});
    }
#endif

    template<typename T, typename A, bool aligned = false>
    static inline void interleave(void *dst, batch<T, A> const &a, batch<T, A> const &b)
    {
        return interleave<aligned>(dst, a, b, A{});
    }

    template<bool aligned, typename T, typename A, enable_sized_integral_t<T, 4> = 0, enable_sized_vector_t<T, A, 4> = 0>
    static inline void deinterleave(const void *src, batch<T, A> &dst1, batch<T, A> &dst2, kernel::requires_arch<generic>)
    {
        const auto *srcPtr = static_cast<const T *>(src);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;

        const auto a = batch<T, A>::load(srcPtr, U{});
        const auto b = batch<T, A>::load(srcPtr + batch<T, A>::size, U{});
        const auto t1 = zip_lo(a, b);
        const auto t2 = zip_hi(a, b);
        dst1 = zip_lo(t1, t2);
        dst2 = zip_hi(t1, t2);
    }

#if XSIMD_WITH_AVX2
    template<bool aligned, typename T, typename A, enable_sized_t<T, 4> = 0>
    static inline void deinterleave(const void *src, batch<T, A> &a, batch<T, A> &b, kernel::requires_arch<avx2>)
    {
        const auto *srcPtr = static_cast<const T *>(src);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;
        const auto src1 = batch<T, A>::load(srcPtr, U{});
        const auto src2 = batch<T, A>::load(srcPtr + batch<T, A>::size, U{});
        const auto t1 = duplicate_low_halves(src1, src2, A{});
        a = exchange_mid_halves(t1, A{});
        const auto t2 = duplicate_high_halves(src1, src2, A{});
        b = exchange_mid_halves(t2, A{});
    }
#endif
#if XSIMD_WITH_AVX
    template<bool aligned, typename T, typename A, enable_sized_t<T, 4> = 0>
    static inline void deinterleave(const void *src, batch<T, A> &a, batch<T, A> &b, kernel::requires_arch<avx>)
    {
        const auto *srcPtr = static_cast<const T *>(src);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;
        const auto src1 = batch<T, A>::load(srcPtr, U{});
        const auto src2 =
            batch<T, A>::load(srcPtr + batch<T, A>::size, U{});
        const auto t1 = merge_high(src1, src2, A{});
        const auto t2 = merge_low(src1, src2, A{});
        a.data = duplicate_low_halves(t2, t1, A{});
        b.data = duplicate_high_halves(t2, t1, A{});
    }
#endif

    template<typename T, typename A, bool aligned = false>
    static inline void deinterleave(const void *src, batch<T, A> &a, batch<T, A> &b)
    {
        return deinterleave<aligned>(src, a, b, A{});
    }
};
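Before the four-batch specialization that follows, a minimal usage sketch for the two-batch interleaver may help. It is not part of the header: the function name roundTripTwoBatches, the test values, and the reliance on xsimd::default_arch are illustrative assumptions, and the <16> tag follows the specialization as reconstructed above. The sketch assumes an SSE or AVX build (so one of the 32-bit overloads above is selected) and simply checks that interleave() followed by deinterleave() restores the original batches.

// Hypothetical usage sketch; KoRgbaInterleavers.h and xsimd are assumed to be included.
#include <cassert>
#include <cstddef>
#include <cstdint>

void roundTripTwoBatches()
{
    using arch = xsimd::default_arch;
    using batch_t = xsimd::batch<uint32_t, arch>;

    // Two source batches: 0, 1, 2, ... and 100, 101, 102, ...
    uint32_t plane_a[batch_t::size];
    uint32_t plane_b[batch_t::size];
    for (std::size_t i = 0; i < batch_t::size; ++i) {
        plane_a[i] = static_cast<uint32_t>(i);
        plane_b[i] = static_cast<uint32_t>(100 + i);
    }
    const auto a = batch_t::load_unaligned(plane_a);
    const auto b = batch_t::load_unaligned(plane_b);

    // interleave() writes a0, b0, a1, b1, ... into dst (unaligned stores by default).
    uint32_t interleaved[2 * batch_t::size];
    KoRgbaInterleavers<16>::interleave(interleaved, a, b);

    // deinterleave() recovers the two original batches from the packed buffer.
    batch_t a2, b2;
    KoRgbaInterleavers<16>::deinterleave(interleaved, a2, b2);

    uint32_t check[batch_t::size];
    a2.store_unaligned(check);
    for (std::size_t i = 0; i < batch_t::size; ++i) {
        assert(check[i] == plane_a[i]);
    }
}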

template<>
struct KoRgbaInterleavers<32> {
    template<typename T, typename A, bool aligned = false, enable_sized_t<T, 4> = 0, enable_sized_vector_t<T, A, 4> = 0>
    static inline void
    interleave(void *dst, batch<T, A> const &a, batch<T, A> const &b, batch<T, A> const &c, batch<T, A> const &d, kernel::requires_arch<generic>)
    {
        auto *dstPtr = static_cast<T *>(dst);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;

        const auto t1 = zip_lo(a, c);
        const auto t2 = zip_hi(a, c);
        const auto t3 = zip_lo(b, d);
        const auto t4 = zip_hi(b, d);
        const auto src1 = zip_lo(t1, t3);
        const auto src2 = zip_hi(t1, t3);
        const auto src3 = zip_lo(t2, t4);
        const auto src4 = zip_hi(t2, t4);
        src1.store(dstPtr, U{});
        src2.store(dstPtr + batch<T, A>::size, U{});
        src3.store(dstPtr + batch<T, A>::size * 2, U{});
        src4.store(dstPtr + batch<T, A>::size * 3, U{});
    }

#if XSIMD_WITH_AVX
    template<typename T, typename A, bool aligned = false, enable_sized_t<T, 4> = 0>
    static inline void
    interleave(void *dst, batch<T, A> const &a, batch<T, A> const &b, batch<T, A> const &c, batch<T, A> const &d, kernel::requires_arch<avx>)
    {
        auto *dstPtr = static_cast<T *>(dst);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;

        const auto t1 = zip_lo(a, c);
        const auto t2 = zip_lo(b, d);
        const auto t3 = zip_hi(a, c);
        const auto t4 = zip_hi(b, d);
        const auto t5 = zip_lo(t1, t2);
        const auto t6 = zip_hi(t1, t2);
        const auto t7 = zip_lo(t3, t4);
        const auto t8 = zip_hi(t3, t4);
        const auto src1 = merge_low(t5, t6, A{});
        const auto src2 = merge_low(t7, t8, A{});
        const auto src3 = merge_high(t5, t6, A{});
        const auto src4 = merge_high(t7, t8, A{});
        src1.store(dstPtr, U{});
        src2.store(dstPtr + batch<T, A>::size, U{});
        src3.store(dstPtr + batch<T, A>::size * 2, U{});
        src4.store(dstPtr + batch<T, A>::size * 3, U{});
    }
#endif

    template<typename T, typename A, bool aligned = false>
    static inline void interleave(void *dst, batch<T, A> const &a, batch<T, A> const &b, batch<T, A> const &c, batch<T, A> const &d)
    {
        return interleave<T, A, aligned>(dst, a, b, c, d, A{});
    }

    template<typename T, typename A, bool aligned = false, enable_sized_t<T, 4> = 0, enable_sized_vector_t<T, A, 4> = 0>
    static inline void deinterleave(const void *src, batch<T, A> &a, batch<T, A> &b, batch<T, A> &c, batch<T, A> &d, kernel::requires_arch<generic>)
    {
        const auto *srcPtr = static_cast<const T *>(src);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;

        const auto t1 = batch<T, A>::load(srcPtr, U{});
        const auto t2 = batch<T, A>::load(srcPtr + batch<T, A>::size, U{});
        const auto t3 = batch<T, A>::load(srcPtr + batch<T, A>::size * 2, U{});
        const auto t4 = batch<T, A>::load(srcPtr + batch<T, A>::size * 3, U{});
        const auto src1 = zip_lo(t1, t3);
        const auto src2 = zip_hi(t1, t3);
        const auto src3 = zip_lo(t2, t4);
        const auto src4 = zip_hi(t2, t4);
        a = zip_lo(src1, src3);
        b = zip_hi(src1, src3);
        c = zip_lo(src2, src4);
        d = zip_hi(src2, src4);
    }

#if XSIMD_WITH_AVX
    template<typename T, typename A, bool aligned = false, enable_sized_t<T, 4> = 0>
    static inline void deinterleave(const void *src, batch<T, A> &a, batch<T, A> &b, batch<T, A> &c, batch<T, A> &d, kernel::requires_arch<avx>)
    {
        const auto *srcPtr = static_cast<const T *>(src);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;

        const auto a0b0c0d0_a1b1c1d1 = batch<T, A>::load(srcPtr, U{});
        const auto a2b2c2d2_a3b3c3d3 =
            batch<T, A>::load(srcPtr + batch<T, A>::size, U{});
        const auto a4b4c4d4_a5b5c5d5 =
            batch<T, A>::load(srcPtr + batch<T, A>::size * 2, U{});
        const auto a6b6c6d6_a7b7c7d7 =
            batch<T, A>::load(srcPtr + batch<T, A>::size * 3, U{});

        const auto a0a2b0b2_a1a3b1b3 =
            zip_lo(a0b0c0d0_a1b1c1d1, a2b2c2d2_a3b3c3d3);
        const auto c0c2d0d2_c1c3d1d3 =
            zip_hi(a0b0c0d0_a1b1c1d1, a2b2c2d2_a3b3c3d3);
        const auto a0a2b0b2_c0c2d0d2 =
            merge_low(a0a2b0b2_a1a3b1b3, c0c2d0d2_c1c3d1d3, A{});
        const auto a1a3b1b3_c1c3d1d3 =
            merge_high(a0a2b0b2_a1a3b1b3, c0c2d0d2_c1c3d1d3, A{});
        const auto a0a1a2a3_c0c1c2c3 =
            zip_lo(a0a2b0b2_c0c2d0d2, a1a3b1b3_c1c3d1d3);
        const auto b0b1b2b3_d0d1d2d3 =
            zip_hi(a0a2b0b2_c0c2d0d2, a1a3b1b3_c1c3d1d3);

        const auto a4a6b4b6_a5a7b5b7 =
            zip_lo(a4b4c4d4_a5b5c5d5, a6b6c6d6_a7b7c7d7);
        const auto c4c6d4d6_c5c7d5d7 =
            zip_hi(a4b4c4d4_a5b5c5d5, a6b6c6d6_a7b7c7d7);
        const auto a4a6b4b6_c4c6d4d6 =
            merge_low(a4a6b4b6_a5a7b5b7, c4c6d4d6_c5c7d5d7, A{});
        const auto a5a7b5b7_c5c7d5d7 =
            merge_high(a4a6b4b6_a5a7b5b7, c4c6d4d6_c5c7d5d7, A{});
        const auto a4a5a6a7_c4c5c6c7 =
            zip_lo(a4a6b4b6_c4c6d4d6, a5a7b5b7_c5c7d5d7);
        const auto b4b5b6b7_d4d5d6d7 =
            zip_hi(a4a6b4b6_c4c6d4d6, a5a7b5b7_c5c7d5d7);

        a = merge_low(a0a1a2a3_c0c1c2c3, a4a5a6a7_c4c5c6c7, A{});
        b = merge_low(b0b1b2b3_d0d1d2d3, b4b5b6b7_d4d5d6d7, A{});
        c = merge_high(a0a1a2a3_c0c1c2c3, a4a5a6a7_c4c5c6c7, A{});
        d = merge_high(b0b1b2b3_d0d1d2d3, b4b5b6b7_d4d5d6d7, A{});
    }
#endif

    template<typename T, typename A, bool aligned = false>
    static inline void deinterleave(const void *src, batch<T, A> &a, batch<T, A> &b, batch<T, A> &c, batch<T, A> &d)
    {
        return deinterleave<T, A, aligned>(src, a, b, c, d, A{});
    }
};

#endif // KO_RGBA_INTERLEAVERS
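For completeness, here is a similarly hedged sketch of the four-batch specialization, which converts between planar R/G/B/A data and interleaved RGBA pixels with 32-bit channels (both overload sets are constrained to sizeof(T) == 4). The function names packPlanarToRgba and unpackRgbaToPlanar are hypothetical, the <32> tag follows the reconstruction above, and xsimd::default_arch plus the header are assumed to be available.

// Hypothetical sketch: one batch worth of planar RGBA data packed into
// interleaved pixels and back. Not part of the header; names are illustrative.
#include <cstdint>

void packPlanarToRgba(const uint32_t *r, const uint32_t *g, const uint32_t *b,
                      const uint32_t *a, uint32_t *interleavedOut)
{
    using arch = xsimd::default_arch;
    using batch_t = xsimd::batch<uint32_t, arch>;

    const auto vr = batch_t::load_unaligned(r);
    const auto vg = batch_t::load_unaligned(g);
    const auto vb = batch_t::load_unaligned(b);
    const auto va = batch_t::load_unaligned(a);

    // Writes r0, g0, b0, a0, r1, g1, b1, a1, ... — i.e. batch_t::size pixels
    // (4 * batch_t::size values) starting at interleavedOut.
    KoRgbaInterleavers<32>::interleave(interleavedOut, vr, vg, vb, va);
}

void unpackRgbaToPlanar(const uint32_t *interleavedIn, uint32_t *r, uint32_t *g,
                        uint32_t *b, uint32_t *a)
{
    using arch = xsimd::default_arch;
    using batch_t = xsimd::batch<uint32_t, arch>;

    batch_t vr, vg, vb, va;
    // Reads 4 * batch_t::size values and splits them back into planar batches.
    KoRgbaInterleavers<32>::deinterleave(interleavedIn, vr, vg, vb, va);

    vr.store_unaligned(r);
    vg.store_unaligned(g);
    vb.store_unaligned(b);
    va.store_unaligned(a);
}

How many pixels each call processes depends on the architecture xsimd selects (four for a 128-bit batch, eight for AVX); the AVX-specific overloads in the header exist so that the lane-local zip/unpack behaviour of the underlying intrinsics still yields a fully interleaved result.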