{

#if XSIMD_WITH_AVX2
    using uint16_avx_v = xsimd::batch<uint16_t, xsimd::avx2>;
    using uint16_v = xsimd::batch<uint16_t, xsimd::sse4_1>;

    // Split each row into an AVX2 part (32 channels per step), an SSE part
    // (16 channels per step) and a scalar remainder.
    const int channelsPerAvx2Block = 32;
    const int channelsPerSse2Block = 16;
    const int avx2Block = numColorChannels / channelsPerAvx2Block;
    const int rest = numColorChannels % channelsPerAvx2Block;
    const int sse2Block = rest / channelsPerSse2Block;
    const int scalarBlock = rest % channelsPerSse2Block;

    const auto offset1 = uint16_avx_v(128);
    const auto offset2 = uint16_v(128);

#elif XSIMD_WITH_SSE2 || XSIMD_WITH_NEON || XSIMD_WITH_NEON64

#if XSIMD_WITH_SSE2
    using uint16_v = xsimd::batch<uint16_t, xsimd::sse2>;
#elif XSIMD_WITH_NEON64
    using uint16_v = xsimd::batch<uint16_t, xsimd::neon64>;
#else
    using uint16_v = xsimd::batch<uint16_t, xsimd::neon>;
#endif

    const int channelsPerSse2Block = 16;
    const int avx2Block = 0;
    const int sse2Block = numColorChannels / channelsPerSse2Block;
    const int scalarBlock = numColorChannels % channelsPerSse2Block;

    const auto offset2 = uint16_v(128);
#else
    const int avx2Block = 0;
    const int sse2Block = 0;
    const int scalarBlock = numColorChannels;
#endif

    for (int row = 0; row < numRows; row++) {
        const quint16 *srcPtr = reinterpret_cast<const quint16 *>(src);
        quint8 *dstPtr = dst;

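        // Bulk of the row: 32 channels per iteration with AVX2.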
#if XSIMD_WITH_AVX2
        for (int i = 0; i < avx2Block; i++) {
            auto x1 = uint16_avx_v::load_unaligned(srcPtr);
            auto x2 = uint16_avx_v::load_unaligned(srcPtr + uint16_avx_v::size);

            // Downscale each channel from 16 to 8 bits:
            // x -> (x - (x >> 8) + 128) >> 8, which approximates x / 257
            // with rounding.
            const auto x1_shifted = x1 >> 8;
            const auto x2_shifted = x2 >> 8;

            x1 -= x1_shifted;
            x1 += offset1;
            x1 >>= 8;

            x2 -= x2_shifted;
            x2 += offset1;
            x2 >>= 8;

            // _mm256_packus_epi16 packs within each 128-bit lane, leaving the
            // two halves of x1 and x2 interleaved...
            x1.data = _mm256_packus_epi16(x1, x2);

            // ...so permute the 64-bit quadrants (0xd8 selects 0, 2, 1, 3)
            // to restore the original channel order.
            x1.data = _mm256_permute4x64_epi64(x1, 0xd8);

            x1.store_unaligned(reinterpret_cast<typename uint16_v::value_type *>(dstPtr));

            srcPtr += channelsPerAvx2Block;
            dstPtr += channelsPerAvx2Block;
        }
#else
        Q_UNUSED(avx2Block);
#endif

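        // Remaining 16-channel chunks with SSE2/NEON.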
#if (XSIMD_WITH_SSE2 || XSIMD_WITH_NEON || XSIMD_WITH_NEON64)
        for (int i = 0; i < sse2Block; i++) {
            auto x1 = uint16_v::load_unaligned(srcPtr);
            auto x2 = uint16_v::load_unaligned(srcPtr + uint16_v::size);

            const uint16_v x1_shifted = x1 >> 8;
            const uint16_v x2_shifted = x2 >> 8;

            x1 -= x1_shifted;
            x1 += offset2;
            x1 >>= 8;

            x2 -= x2_shifted;
            x2 += offset2;
            x2 >>= 8;
#if XSIMD_WITH_SSE2
            x1.data = _mm_packus_epi16(x1, x2);
#else
            // NEON: saturating-narrow each half to u8 and recombine; safe
            // because every lane is already <= 255 after the shift.
            x1.data = vreinterpretq_u16_u8(vcombine_u8(vqmovun_s16(vreinterpretq_s16_u16(x1)), vqmovun_s16(vreinterpretq_s16_u16(x2))));
#endif
            x1.store_unaligned(reinterpret_cast<typename uint16_v::value_type *>(dstPtr));
            srcPtr += channelsPerSse2Block;
            dstPtr += channelsPerSse2Block;
        }
#else
        Q_UNUSED(sse2Block);
#endif

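        // Scalar tail for the last few channels.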
        for (int i = 0; i < scalarBlock; i++) {
            const quint16 value = *srcPtr;

            // Same downscale as the vector paths above.
            *dstPtr = quint8((value - (value >> 8) + 128) >> 8);

            srcPtr++;
            dstPtr++;
        }

        src += srcRowStride; // assumed name: source counterpart of dstRowStride
        dst += dstRowStride;
    }
}
float value(const T *src, size_t ch)