    void convertU8ToU16(const quint8 *src,
                        int srcRowStride,
                        quint8 *dst,
                        int dstRowStride,
                        int numRows,
                        int numColumns) const override
    {
        // m_channelsPerPixel: per-pixel channel count stored on this scaler
        const int numColorChannels = m_channelsPerPixel * numColumns;
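
        // Pick vector widths at compile time: AVX2 if available, else
        // SSE4.1 / NEON, else a pure scalar fallback.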
#if XSIMD_WITH_AVX2
        using uint16_avx_v = xsimd::batch<uint16_t, xsimd::avx2>;
        using uint16_v = xsimd::batch<uint16_t, xsimd::sse4_1>;
        using uint8_v = xsimd::batch<uint8_t, xsimd::sse4_1>;

        const int channelsPerAvx2Block = 16;
        const int channelsPerSse2Block = 8;
        const int avx2Block = numColorChannels / channelsPerAvx2Block;
        const int rest = numColorChannels % channelsPerAvx2Block;
        const int sse2Block = rest / channelsPerSse2Block;
        const int scalarBlock = rest % channelsPerSse2Block;
#elif (XSIMD_WITH_SSE4_1 || XSIMD_WITH_NEON || XSIMD_WITH_NEON64)
#if XSIMD_WITH_SSE4_1
        using uint16_v = xsimd::batch<uint16_t, xsimd::sse4_1>;
        using uint8_v = xsimd::batch<uint8_t, xsimd::sse4_1>;
#elif XSIMD_WITH_NEON64
        using uint16_v = xsimd::batch<uint16_t, xsimd::neon64>;
        using uint8_v = xsimd::batch<uint8_t, xsimd::neon64>;
#else
        using uint16_v = xsimd::batch<uint16_t, xsimd::neon>;
        using uint8_v = xsimd::batch<uint8_t, xsimd::neon>;
#endif

        const int channelsPerSse2Block = 8;
        const int avx2Block = 0;
        const int sse2Block = numColorChannels / channelsPerSse2Block;
        const int scalarBlock = numColorChannels % channelsPerSse2Block;
#else
        const int avx2Block = 0;
        const int sse2Block = 0;
        const int scalarBlock = numColorChannels;
#endif
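
        // Per row: avx2Block chunks of 16 channels, then sse2Block chunks
        // of 8 channels, then a scalar tail for the remainder.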
        for (int row = 0; row < numRows; row++) {
            const quint8 *srcPtr = src;
            auto *dstPtr = reinterpret_cast<quint16 *>(dst);

#if XSIMD_WITH_AVX2
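            // Widening trick: (v << 8) | v == v * 257, which maps 0 -> 0
            // and 255 -> 65535 exactly.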
            for (int i = 0; i < avx2Block; i++) {
                const auto x = uint8_v::load_unaligned(srcPtr);

                uint16_avx_v y(_mm256_cvtepu8_epi16(x));
                const auto y_shifted = y << 8;
                y |= y_shifted;

                y.store_unaligned(
                    reinterpret_cast<typename uint16_avx_v::value_type *>(dstPtr));

                srcPtr += channelsPerAvx2Block;
                dstPtr += channelsPerAvx2Block;
            }
#endif
#if (XSIMD_WITH_SSE4_1 || XSIMD_WITH_NEON || XSIMD_WITH_NEON64)
            for (int i = 0; i < sse2Block; i++) {
#if XSIMD_WITH_SSE4_1
                const uint8_v x(_mm_loadl_epi64(
                    reinterpret_cast<const __m128i *>(srcPtr)));
#else
                // Load 8 channel bytes into the low half of the register,
                // zeroing the unused high half.
                const uint8_v x(vreinterpretq_u8_u32(vcombine_u32(
                    vld1_u32(reinterpret_cast<const uint32_t *>(srcPtr)),
                    vcreate_u32(0))));
#endif

#if XSIMD_WITH_SSE4_1
                uint16_v y(_mm_cvtepu8_epi16(x.data));
#else
                uint16_v y(vmovl_u8(vget_low_u8(x.data)));
#endif
                const auto y_shifted = y << 8;
                y |= y_shifted;

                y.store_unaligned(
                    reinterpret_cast<typename uint16_v::value_type *>(dstPtr));

                srcPtr += channelsPerSse2Block;
                dstPtr += channelsPerSse2Block;
            }
#endif
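
            // Scalar tail: the same v * 257 widening, one channel at a time.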
            for (int i = 0; i < scalarBlock; i++) {
                const quint16 value = *srcPtr;

                *dstPtr = static_cast<quint16>(value | (value << 8));

                srcPtr++;
                dstPtr++;
            }

            src += srcRowStride;
            dst += dstRowStride;
        }
    }
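
    // The 16 -> 8 bit direction divides by 257 with a cheap rounded
    // approximation: v8 = (v16 - (v16 >> 8) + 128) >> 8.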
    void convertU16ToU8(const quint8 *src,
                        int srcRowStride,
                        quint8 *dst,
                        int dstRowStride,
                        int numRows,
                        int numColumns) const override
    {
        const int numColorChannels = m_channelsPerPixel * numColumns;

#if XSIMD_WITH_AVX2
        using uint16_avx_v = xsimd::batch<uint16_t, xsimd::avx2>;
        using uint16_v = xsimd::batch<uint16_t, xsimd::sse4_1>;

        const int channelsPerAvx2Block = 32;
        const int channelsPerSse2Block = 16;
        const int avx2Block = numColorChannels / channelsPerAvx2Block;
        const int rest = numColorChannels % channelsPerAvx2Block;
        const int sse2Block = rest / channelsPerSse2Block;
        const int scalarBlock = rest % channelsPerSse2Block;

        const auto offset1 = uint16_avx_v(128);
        const auto offset2 = uint16_v(128);
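
        // offset1/offset2 hold the +128 rounding bias applied before the
        // final shift right by 8.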
#elif XSIMD_WITH_SSE2 || XSIMD_WITH_NEON || XSIMD_WITH_NEON64
#if XSIMD_WITH_SSE2
        using uint16_v = xsimd::batch<uint16_t, xsimd::sse2>;
#elif XSIMD_WITH_NEON64
        using uint16_v = xsimd::batch<uint16_t, xsimd::neon64>;
#else
        using uint16_v = xsimd::batch<uint16_t, xsimd::neon>;
#endif

        const int channelsPerSse2Block = 16;
        const int avx2Block = 0;
        const int sse2Block = numColorChannels / channelsPerSse2Block;
        const int scalarBlock = numColorChannels % channelsPerSse2Block;

        const auto offset2 = uint16_v(128);
#else
        const int avx2Block = 0;
        const int sse2Block = 0;
        const int scalarBlock = numColorChannels;
#endif
        for (int row = 0; row < numRows; row++) {
            const quint16 *srcPtr = reinterpret_cast<const quint16 *>(src);
            quint8 *dstPtr = dst;

#if XSIMD_WITH_AVX2
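            // Each iteration loads two u16 batches and packs them into one
            // batch of u8 results, halving the data width.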
            for (int i = 0; i < avx2Block; i++) {
                auto x1 = uint16_avx_v::load_unaligned(srcPtr);
                auto x2 = uint16_avx_v::load_unaligned(srcPtr + uint16_avx_v::size);

                const auto x1_shifted = x1 >> 8;
                const auto x2_shifted = x2 >> 8;

                x1 -= x1_shifted;
                x1 += offset1;
                x1 >>= 8;

                x2 -= x2_shifted;
                x2 += offset1;
                x2 >>= 8;

                x1.data = _mm256_packus_epi16(x1, x2);

                // AVX2 packs within 128-bit lanes, so permute the 64-bit
                // quarters back into linear order.
                x1.data = _mm256_permute4x64_epi64(x1, 0xd8);

                x1.store_unaligned(
                    reinterpret_cast<typename uint16_v::value_type *>(dstPtr));

                srcPtr += channelsPerAvx2Block;
                dstPtr += channelsPerAvx2Block;
            }
#endif
#if (XSIMD_WITH_SSE2 || XSIMD_WITH_NEON || XSIMD_WITH_NEON64)
            for (int i = 0; i < sse2Block; i++) {
                auto x1 = uint16_v::load_unaligned(srcPtr);
                auto x2 = uint16_v::load_unaligned(srcPtr + uint16_v::size);

                const uint16_v x1_shifted = x1 >> 8;
                const uint16_v x2_shifted = x2 >> 8;

                x1 -= x1_shifted;
                x1 += offset2;
                x1 >>= 8;

                x2 -= x2_shifted;
                x2 += offset2;
                x2 >>= 8;

#if XSIMD_WITH_SSE2
                x1.data = _mm_packus_epi16(x1, x2);
#else
                x1.data = vreinterpretq_u16_u8(
                    vcombine_u8(vqmovun_s16(vreinterpretq_s16_u16(x1)),
                                vqmovun_s16(vreinterpretq_s16_u16(x2))));
#endif

                x1.store_unaligned(
                    reinterpret_cast<typename uint16_v::value_type *>(dstPtr));

                srcPtr += channelsPerSse2Block;
                dstPtr += channelsPerSse2Block;
            }
#endif
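
            // Scalar tail: the same rounded division by 257 as the vector
            // paths above.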
            for (int i = 0; i < scalarBlock; i++) {
                const quint16 value = *srcPtr;

                *dstPtr = static_cast<quint8>((value - (value >> 8) + 128) >> 8);

                srcPtr++;
                dstPtr++;
            }

            src += srcRowStride;
            dst += dstRowStride;
        }
    }