#ifndef KO_RGBA_INTERLEAVERS
#define KO_RGBA_INTERLEAVERS

// Assumption: the original header's include block was not preserved in this
// extraction; the code below relies on xsimd's batch/arch machinery and on
// <type_traits>, and uses the xsimd names unqualified.
#include <cstddef>
#include <type_traits>
#include <xsimd/xsimd.hpp>

using namespace xsimd;

#if XSIMD_VERSION_MAJOR >= 10
#error "The interleavers use per-lane zipping semantics, which are not compatible with xsimd 10"
#endif

template<typename T, size_t S>
using enable_sized_t = typename std::enable_if<sizeof(T) == S, int>::type;

template<typename T, size_t S>
using enable_sized_integral_t =
    typename std::enable_if<std::is_integral<T>::value && sizeof(T) == S, int>::type;

template<typename T, typename A, size_t S>
using enable_sized_vector_t = typename std::enable_if<batch<T, A>::size == S, int>::type;

// Swap the two middle 64-bit quarters: [q0 q1 q2 q3] -> [q0 q2 q1 q3].
template<typename A>
inline batch<float, A> exchange_mid_halves(batch<float, A> const &a, kernel::requires_arch<avx2>) noexcept
{
    return _mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd(a.data), 0xD8));
}

template<typename T, typename A, enable_sized_integral_t<T, 4> = 0>
inline batch<T, A> exchange_mid_halves(batch<T, A> const &a, kernel::requires_arch<avx2>) noexcept
{
    return _mm256_permute4x64_epi64(a.data, 0xD8);
}

// Concatenate the low 128-bit halves of a and b: [low(a) | low(b)].
template<typename A>
inline batch<float, A> merge_low(batch<float, A> const &a, batch<float, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_insertf128_ps(a, _mm256_castps256_ps128(b), 1);
}

template<typename T, typename A, enable_sized_integral_t<T, 4> = 0>
inline batch<T, A> merge_low(batch<T, A> const &a, batch<T, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_insertf128_si256(a, _mm256_castsi256_si128(b), 1);
}

// Concatenate the high 128-bit halves of a and b: [high(a) | high(b)].
template<typename A>
inline batch<float, A> merge_high(batch<float, A> const &a, batch<float, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_permute2f128_ps(a, b, 0x31);
}

template<typename T, typename A, enable_sized_integral_t<T, 4> = 0>
inline batch<T, A> merge_high(batch<T, A> const &a, batch<T, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_permute2f128_si256(a, b, 0x31);
}

// Per 128-bit lane, pick the even-indexed elements of a and then b: [a0 a2 b0 b2] in each lane.
template<typename A>
inline batch<float, A> duplicate_low_halves(batch<float, A> const &a, batch<float, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
}

template<typename T, typename A, enable_sized_integral_t<T, 4> = 0>
inline batch<T, A> duplicate_low_halves(batch<T, A> const &a, batch<T, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(2, 0, 2, 0)));
}

// Per 128-bit lane, pick the odd-indexed elements of a and then b: [a1 a3 b1 b3] in each lane.
template<typename A>
inline batch<float, A> duplicate_high_halves(batch<float, A> const &a, batch<float, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
}

template<typename T, typename A, enable_sized_integral_t<T, 4> = 0>
inline batch<T, A> duplicate_high_halves(batch<T, A> const &a, batch<T, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(3, 1, 3, 1)));
}

template<bool aligned, typename T, typename A, enable_sized_integral_t<T, 4> = 0, enable_sized_vector_t<T, A, 4> = 0>
static inline void interleave(void *dst, batch<T, A> const &a, batch<T, A> const &b, kernel::requires_arch<generic>)
{
    auto *dstPtr = static_cast<T *>(dst);
    using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;
    const auto t1 = zip_lo(a, b);
    const auto t2 = zip_hi(a, b);
    t1.store(dstPtr, U{});
    t2.store(dstPtr + batch<T, A>::size, U{});
}

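// Element flow of the generic overload, on the 4-element (128-bit) batches required
// by the enable_sized_vector_t<T, A, 4> constraint: zip_lo(a, b) = [a0 b0 a1 b1] and
// zip_hi(a, b) = [a2 b2 a3 b3], so the two stores write the fully interleaved stream
// a0 b0 a1 b1 a2 b2 a3 b3. On AVX the zips act per 128-bit lane, which is why the
// overload below has to re-merge the lanes afterwards.
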
template<bool aligned, typename T, typename A, enable_sized_t<T, 4> = 0>
static inline void interleave(void *dst, batch<T, A> const &a, batch<T, A> const &b, kernel::requires_arch<avx>)
{
    auto *dstPtr = static_cast<T *>(dst);
    using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;
    const auto t1 = zip_lo(a, b);
    const auto t2 = zip_hi(a, b);
    const auto src1 = merge_low(t1, t2, A{});
    const auto src2 = merge_high(t1, t2, A{});
    src1.store(dstPtr, U{});
    src2.store(dstPtr + batch<T, A>::size, U{});
}

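// Element flow on AVX (8 x 32-bit elements), given per-lane zips:
//   t1   = [a0 b0 a1 b1 | a4 b4 a5 b5]
//   t2   = [a2 b2 a3 b3 | a6 b6 a7 b7]
//   src1 = merge_low(t1, t2)  = [a0 b0 a1 b1 | a2 b2 a3 b3]
//   src2 = merge_high(t1, t2) = [a4 b4 a5 b5 | a6 b6 a7 b7]
// so the two stores again produce a0 b0 ... a7 b7 in memory.
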
template<typename T, typename A, bool aligned = false>
static inline void interleave(void *dst, batch<T, A> const &a, batch<T, A> const &b)
{
    return interleave<aligned>(dst, a, b, A{});
}

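// A rough usage sketch for the two-batch wrapper (hypothetical buffer names,
// assuming xsimd::avx2 is the arch in use):
//
//   using batch_t = xsimd::batch<float, xsimd::avx2>;
//   alignas(32) float reds[8]   = { /* r0..r7 */ };
//   alignas(32) float greens[8] = { /* g0..g7 */ };
//   alignas(32) float mixed[16];                        // r0 g0 r1 g1 ... r7 g7
//   const auto r = batch_t::load_aligned(reds);
//   const auto g = batch_t::load_aligned(greens);
//   interleave<float, xsimd::avx2, true>(mixed, r, g);  // aligned stores
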
template<bool aligned, typename T, typename A, enable_sized_integral_t<T, 4> = 0, enable_sized_vector_t<T, A, 4> = 0>
static inline void deinterleave(const void *src, batch<T, A> &dst1, batch<T, A> &dst2, kernel::requires_arch<generic>)
{
    const auto *srcPtr = static_cast<const T *>(src);
    using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;

    const auto a = batch<T, A>::load(srcPtr, U{});
    const auto b = batch<T, A>::load(srcPtr + batch<T, A>::size, U{});
    const auto t1 = zip_lo(a, b);
    const auto t2 = zip_hi(a, b);
    dst1 = zip_lo(t1, t2);
    dst2 = zip_hi(t1, t2);
}

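// Element flow of the generic overload on a 4-element batch, for an interleaved
// stream p0 q0 p1 q1 p2 q2 p3 q3 in memory:
//   a    = [p0 q0 p1 q1],  b = [p2 q2 p3 q3]
//   t1   = zip_lo(a, b) = [p0 p2 q0 q2]
//   t2   = zip_hi(a, b) = [p1 p3 q1 q3]
//   dst1 = zip_lo(t1, t2) = [p0 p1 p2 p3]
//   dst2 = zip_hi(t1, t2) = [q0 q1 q2 q3]
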
template<bool aligned, typename T, typename A, enable_sized_t<T, 4> = 0>
static inline void deinterleave(const void *src, batch<T, A> &a, batch<T, A> &b, kernel::requires_arch<avx2>)
{
    const auto *srcPtr = static_cast<const T *>(src);
    using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;
    const auto src1 = batch<T, A>::load(srcPtr, U{});
    const auto src2 = batch<T, A>::load(srcPtr + batch<T, A>::size, U{});
    const auto t1 = duplicate_low_halves(src1, src2, A{});
    a = exchange_mid_halves(t1, A{});
    const auto t2 = duplicate_high_halves(src1, src2, A{});
    b = exchange_mid_halves(t2, A{});
}

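// Element flow of the AVX2 overload, for an interleaved stream p0 q0 ... p7 q7:
//   src1 = [p0 q0 p1 q1 | p2 q2 p3 q3],  src2 = [p4 q4 p5 q5 | p6 q6 p7 q7]
//   t1   = duplicate_low_halves(src1, src2) = [p0 p1 p4 p5 | p2 p3 p6 p7]
//   a    = exchange_mid_halves(t1)          = [p0 p1 p2 p3 | p4 p5 p6 p7]
// and likewise with the odd-indexed elements for b.
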
template<bool aligned, typename T, typename A, enable_sized_t<T, 4> = 0>
static inline void deinterleave(const void *src, batch<T, A> &a, batch<T, A> &b, kernel::requires_arch<avx>)
{
    const auto *srcPtr = static_cast<const T *>(src);
    using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;
    const auto src1 = batch<T, A>::load(srcPtr, U{});
    const auto src2 = batch<T, A>::load(srcPtr + batch<T, A>::size, U{});
    const auto t1 = merge_high(src1, src2, A{});
    const auto t2 = merge_low(src1, src2, A{});
    a.data = duplicate_low_halves(t2, t1, A{});
    b.data = duplicate_high_halves(t2, t1, A{});
}

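// exchange_mid_halves above is an AVX2 kernel, so this AVX-only overload instead
// regroups whole 128-bit halves first and only then picks even/odd elements per lane:
//   t2 = merge_low(src1, src2)  = [p0 q0 p1 q1 | p4 q4 p5 q5]
//   t1 = merge_high(src1, src2) = [p2 q2 p3 q3 | p6 q6 p7 q7]
//   a  = duplicate_low_halves(t2, t1)  = [p0 p1 p2 p3 | p4 p5 p6 p7]
//   b  = duplicate_high_halves(t2, t1) = [q0 q1 q2 q3 | q4 q5 q6 q7]
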
template<typename T, typename A, bool aligned = false>
static inline void deinterleave(const void *src, batch<T, A> &a, batch<T, A> &b)
{
    return deinterleave<aligned>(src, a, b, A{});
}

template<typename T, typename A, bool aligned = false, enable_sized_t<T, 4> = 0, enable_sized_vector_t<T, A, 4> = 0>
static inline void interleave(void *dst, batch<T, A> const &a, batch<T, A> const &b, batch<T, A> const &c, batch<T, A> const &d, kernel::requires_arch<generic>)
{
    auto *dstPtr = static_cast<T *>(dst);
    using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;

    const auto t1 = zip_lo(a, c);
    const auto t2 = zip_hi(a, c);
    const auto t3 = zip_lo(b, d);
    const auto t4 = zip_hi(b, d);
    const auto src1 = zip_lo(t1, t3);
    const auto src2 = zip_hi(t1, t3);
    const auto src3 = zip_lo(t2, t4);
    const auto src4 = zip_hi(t2, t4);
    src1.store(dstPtr, U{});
    src2.store(dstPtr + batch<T, A>::size, U{});
    src3.store(dstPtr + batch<T, A>::size * 2, U{});
    src4.store(dstPtr + batch<T, A>::size * 3, U{});
}

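// On a 4-element batch this is a plain 4x4 transpose: with a, b, c, d holding one
// channel each for pixels 0..3, the two zip stages produce
//   src1 = [a0 b0 c0 d0], src2 = [a1 b1 c1 d1], src3 = [a2 b2 c2 d2], src4 = [a3 b3 c3 d3]
// so the four stores write pixels 0..3 back-to-back in channel order a b c d.
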
template<typename T, typename A, bool aligned = false, enable_sized_t<T, 4> = 0>
static inline void interleave(void *dst, batch<T, A> const &a, batch<T, A> const &b, batch<T, A> const &c, batch<T, A> const &d, kernel::requires_arch<avx>)
{
    auto *dstPtr = static_cast<T *>(dst);
    using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;

    const auto t1 = zip_lo(a, c);
    const auto t2 = zip_lo(b, d);
    const auto t3 = zip_hi(a, c);
    const auto t4 = zip_hi(b, d);
    const auto t5 = zip_lo(t1, t2);
    const auto t6 = zip_hi(t1, t2);
    const auto t7 = zip_lo(t3, t4);
    const auto t8 = zip_hi(t3, t4);
    const auto src1 = merge_low(t5, t6, A{});
    const auto src2 = merge_low(t7, t8, A{});
    const auto src3 = merge_high(t5, t6, A{});
    const auto src4 = merge_high(t7, t8, A{});
    src1.store(dstPtr, U{});
    src2.store(dstPtr + batch<T, A>::size, U{});
    src3.store(dstPtr + batch<T, A>::size * 2, U{});
    src4.store(dstPtr + batch<T, A>::size * 3, U{});
}

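// Element flow on AVX with per-lane zips: after the two zip stages each 128-bit lane
// holds one complete pixel, t5 = [px0 | px4], t6 = [px1 | px5], t7 = [px2 | px6],
// t8 = [px3 | px7]; merge_low/merge_high then reorder the lanes so the four stores
// emit pixels 0..7 in order.
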
template<typename T, typename A, bool aligned = false>
static inline void interleave(void *dst, batch<T, A> const &a, batch<T, A> const &b, batch<T, A> const &c, batch<T, A> const &d)
{
    return interleave<T, A, aligned>(dst, a, b, c, d, A{});
}

template<typename T, typename A, bool aligned = false, enable_sized_t<T, 4> = 0, enable_sized_vector_t<T, A, 4> = 0>
static inline void deinterleave(const void *src, batch<T, A> &a, batch<T, A> &b, batch<T, A> &c, batch<T, A> &d, kernel::requires_arch<generic>)
{
    const auto *srcPtr = static_cast<const T *>(src);
    using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;

    const auto t1 = batch<T, A>::load(srcPtr, U{});
    const auto t2 = batch<T, A>::load(srcPtr + batch<T, A>::size, U{});
    const auto t3 = batch<T, A>::load(srcPtr + batch<T, A>::size * 2, U{});
    const auto t4 = batch<T, A>::load(srcPtr + batch<T, A>::size * 3, U{});
    const auto src1 = zip_lo(t1, t3);
    const auto src2 = zip_hi(t1, t3);
    const auto src3 = zip_lo(t2, t4);
    const auto src4 = zip_hi(t2, t4);
    a = zip_lo(src1, src3);
    b = zip_hi(src1, src3);
    c = zip_lo(src2, src4);
    d = zip_hi(src2, src4);
}

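// This is the inverse of the generic four-batch interleave above: t1..t4 each hold
// one whole pixel, and the two zip stages transpose them so that a, b, c, d end up
// holding one channel each for pixels 0..3.
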
template<typename T, typename A, bool aligned = false, enable_sized_t<T, 4> = 0>
static inline void deinterleave(const void *src, batch<T, A> &a, batch<T, A> &b, batch<T, A> &c, batch<T, A> &d, kernel::requires_arch<avx>)
{
    const auto *srcPtr = static_cast<const T *>(src);
    using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;

    const auto a0b0c0d0_a1b1c1d1 = batch<T, A>::load(srcPtr, U{});
    const auto a2b2c2d2_a3b3c3d3 = batch<T, A>::load(srcPtr + batch<T, A>::size, U{});
    const auto a4b4c4d4_a5b5c5d5 = batch<T, A>::load(srcPtr + batch<T, A>::size * 2, U{});
    const auto a6b6c6d6_a7b7c7d7 = batch<T, A>::load(srcPtr + batch<T, A>::size * 3, U{});

    const auto a0a2b0b2_a1a3b1b3 = zip_lo(a0b0c0d0_a1b1c1d1, a2b2c2d2_a3b3c3d3);
    const auto c0c2d0d2_c1c3d1d3 = zip_hi(a0b0c0d0_a1b1c1d1, a2b2c2d2_a3b3c3d3);
    const auto a0a2b0b2_c0c2d0d2 = merge_low(a0a2b0b2_a1a3b1b3, c0c2d0d2_c1c3d1d3, A{});
    const auto a1a3b1b3_c1c3d1d3 = merge_high(a0a2b0b2_a1a3b1b3, c0c2d0d2_c1c3d1d3, A{});
    const auto a0a1a2a3_c0c1c2c3 = zip_lo(a0a2b0b2_c0c2d0d2, a1a3b1b3_c1c3d1d3);
    const auto b0b1b2b3_d0d1d2d3 = zip_hi(a0a2b0b2_c0c2d0d2, a1a3b1b3_c1c3d1d3);

    const auto a4a6b4b6_a5a7b5b7 = zip_lo(a4b4c4d4_a5b5c5d5, a6b6c6d6_a7b7c7d7);
    const auto c4c6d4d6_c5c7d5d7 = zip_hi(a4b4c4d4_a5b5c5d5, a6b6c6d6_a7b7c7d7);
    const auto a4a6b4b6_c4c6d4d6 = merge_low(a4a6b4b6_a5a7b5b7, c4c6d4d6_c5c7d5d7, A{});
    const auto a5a7b5b7_c5c7d5d7 = merge_high(a4a6b4b6_a5a7b5b7, c4c6d4d6_c5c7d5d7, A{});
    const auto a4a5a6a7_c4c5c6c7 = zip_lo(a4a6b4b6_c4c6d4d6, a5a7b5b7_c5c7d5d7);
    const auto b4b5b6b7_d4d5d6d7 = zip_hi(a4a6b4b6_c4c6d4d6, a5a7b5b7_c5c7d5d7);

    a = merge_low(a0a1a2a3_c0c1c2c3, a4a5a6a7_c4c5c6c7, A{});
    b = merge_low(b0b1b2b3_d0d1d2d3, b4b5b6b7_d4d5d6d7, A{});
    c = merge_high(a0a1a2a3_c0c1c2c3, a4a5a6a7_c4c5c6c7, A{});
    d = merge_high(b0b1b2b3_d0d1d2d3, b4b5b6b7_d4d5d6d7, A{});
}

template<typename T, typename A, bool aligned = false>
static inline void deinterleave(const void *src, batch<T, A> &a, batch<T, A> &b, batch<T, A> &c, batch<T, A> &d)
{
    return deinterleave<T, A, aligned>(src, a, b, c, d, A{});
}

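// A rough round-trip sketch for the four-batch wrappers (hypothetical buffer and
// variable names, assuming 8 RGBA float pixels and xsimd::avx2):
//
//   using batch_t = xsimd::batch<float, xsimd::avx2>;
//   alignas(32) float pixels[32];  // r0 g0 b0 a0 r1 g1 b1 a1 ... r7 g7 b7 a7
//   batch_t r, g, b, a;
//   deinterleave<float, xsimd::avx2, true>(pixels, r, g, b, a); // split into channel batches
//   /* ... per-channel arithmetic on r, g, b, a ... */
//   interleave<float, xsimd::avx2, true>(pixels, r, g, b, a);   // write pixels back
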
#endif // KO_RGBA_INTERLEAVERS