Krita Source Code Documentation
Loading...
Searching...
No Matches
KoOptimizedPixelDataScalerU8ToU16< _impl > Class Template Reference

#include <KoOptimizedPixelDataScalerU8ToU16.h>

+ Inheritance diagram for KoOptimizedPixelDataScalerU8ToU16< _impl >:

Public Member Functions

void convertU16ToU8 (const quint8 *src, int srcRowStride, quint8 *dst, int dstRowStride, int numRows, int numColumns) const override
 
void convertU8ToU16 (const quint8 *src, int srcRowStride, quint8 *dst, int dstRowStride, int numRows, int numColumns) const override
 
 KoOptimizedPixelDataScalerU8ToU16 (int channelsPerPixel)
 
- Public Member Functions inherited from KoOptimizedPixelDataScalerU8ToU16Base
int channelsPerPixel () const
 
 KoOptimizedPixelDataScalerU8ToU16Base (int channelsPerPixel)
 
virtual ~KoOptimizedPixelDataScalerU8ToU16Base ()
 

Additional Inherited Members

- Protected Attributes inherited from KoOptimizedPixelDataScalerU8ToU16Base
int m_channelsPerPixel
 

Detailed Description

template<typename _impl = xsimd::current_arch>
class KoOptimizedPixelDataScalerU8ToU16< _impl >

Definition at line 18 of file KoOptimizedPixelDataScalerU8ToU16.h.

Constructor & Destructor Documentation

◆ KoOptimizedPixelDataScalerU8ToU16()

Member Function Documentation

◆ convertU16ToU8()

template<typename _impl = xsimd::current_arch>
void KoOptimizedPixelDataScalerU8ToU16< _impl >::convertU16ToU8 ( const quint8 * src,
int srcRowStride,
quint8 * dst,
int dstRowStride,
int numRows,
int numColumns ) const
inline, override, virtual

Implements KoOptimizedPixelDataScalerU8ToU16Base.

Definition at line 127 of file KoOptimizedPixelDataScalerU8ToU16.h.

128 {
129 const int numColorChannels = m_channelsPerPixel * numColumns;
130
131#if XSIMD_WITH_AVX2
132 using uint16_avx_v = xsimd::batch<uint16_t, xsimd::avx2>;
133 using uint16_v = xsimd::batch<uint16_t, xsimd::sse4_1>;
134
135 const int channelsPerAvx2Block = 32;
136 const int channelsPerSse2Block = 16;
137 const int avx2Block = numColorChannels / channelsPerAvx2Block;
138 const int rest = numColorChannels % channelsPerAvx2Block;
139 const int sse2Block = rest / channelsPerSse2Block;
140 const int scalarBlock = rest % channelsPerSse2Block;
141
142 const auto offset1 = uint16_avx_v(128);
143 const auto offset2 = uint16_v(128);
144
145#elif XSIMD_WITH_SSE2 || XSIMD_WITH_NEON || XSIMD_WITH_NEON64
146 // SSE2, unlike the previous function, is a perfectly valid option
147 // while under generic.
148#if XSIMD_WITH_SSE2
149 using uint16_v = xsimd::batch<uint16_t, xsimd::sse2>;
150#elif XSIMD_WITH_NEON64
151 using uint16_v = xsimd::batch<uint16_t, xsimd::neon64>;
152#else
153 using uint16_v = xsimd::batch<uint16_t, xsimd::neon>;
154#endif
155
156 const int channelsPerSse2Block = 16;
157 const int avx2Block = 0;
158 const int sse2Block = numColorChannels / channelsPerSse2Block;
159 const int scalarBlock = numColorChannels % channelsPerSse2Block;
160
161 const auto offset2 = uint16_v(128);
162#else
163 const int avx2Block = 0;
164 const int sse2Block = 0;
165 const int scalarBlock = numColorChannels;
166#endif
167
168 // qWarning() << ppVar(avx2Block) << ppVar(sse2Block);
169
170 for (int row = 0; row < numRows; row++) {
171 const quint16 *srcPtr = reinterpret_cast<const quint16 *>(src);
172 quint8 *dstPtr = dst;
173
174#if XSIMD_WITH_AVX2
175 for (int i = 0; i < avx2Block; i++) {
176 auto x1 = uint16_avx_v::load_unaligned(srcPtr);
177 auto x2 = uint16_avx_v::load_unaligned(srcPtr + uint16_avx_v::size);
178
179 const auto x1_shifted = x1 >> 8;
180 const auto x2_shifted = x2 >> 8;
181
182 x1 -= x1_shifted;
183 x1 += offset1;
184 x1 >>= 8;
185
186 x2 -= x2_shifted;
187 x2 += offset1;
188 x2 >>= 8;
189
190 x1.data = _mm256_packus_epi16(x1, x2);
191
192 // Packing in AVX2 does a bit different thing, not
193 // what you expect that after seeing a SSE2 version :)
194 // Therefore we need to permute the result...
195 x1.data = _mm256_permute4x64_epi64(x1, 0xd8);
196
197 x1.store_unaligned(reinterpret_cast<typename uint16_v::value_type *>(dstPtr));
198
199 srcPtr += channelsPerAvx2Block;
200 dstPtr += channelsPerAvx2Block;
201 }
202#else
203 Q_UNUSED(avx2Block);
204#endif
205
206#if (XSIMD_WITH_SSE2 || XSIMD_WITH_NEON || XSIMD_WITH_NEON64)
207 for (int i = 0; i < sse2Block; i++) {
208 auto x1 = uint16_v::load_unaligned(srcPtr);
209 auto x2 = uint16_v::load_unaligned(srcPtr + uint16_v::size);
210
211 const uint16_v x1_shifted = x1 >> 8;
212 const uint16_v x2_shifted = x2 >> 8;
213
214 x1 -= x1_shifted;
215 x1 += offset2;
216 x1 >>= 8;
217
218 x2 -= x2_shifted;
219 x2 += offset2;
220 x2 >>= 8;
221#if XSIMD_WITH_SSE2
222 x1.data = _mm_packus_epi16(x1, x2);
223#else
224 x1.data = vreinterpretq_u16_u8(vcombine_u8(vqmovun_s16(vreinterpretq_s16_u16(x1)), vqmovun_s16(vreinterpretq_s16_u16(x2))));
225#endif
226 x1.store_unaligned(reinterpret_cast<typename uint16_v::value_type *>(dstPtr));
227 srcPtr += channelsPerSse2Block;
228 dstPtr += channelsPerSse2Block;
229 }
230#else
231 Q_UNUSED(sse2Block);
232#endif
233
234 for (int i = 0; i < scalarBlock; i++) {
235 const quint16 value = *srcPtr;
236
237 *dstPtr = (value - (value >> 8) + 128) >> 8;
238
239 srcPtr++;
240 dstPtr++;
241 }
242
243 src += srcRowStride;
244 dst += dstRowStride;
245 }
246 }
References KoOptimizedPixelDataScalerU8ToU16Base::m_channelsPerPixel. (Note: the generated cross-reference to a free function value() is spurious — `value` in the listing above is a local variable in the scalar fallback loop, not a call to that function.)

◆ convertU8ToU16()

template<typename _impl = xsimd::current_arch>
void KoOptimizedPixelDataScalerU8ToU16< _impl >::convertU8ToU16 ( const quint8 * src,
int srcRowStride,
quint8 * dst,
int dstRowStride,
int numRows,
int numColumns ) const
inline, override, virtual

Implements KoOptimizedPixelDataScalerU8ToU16Base.

Definition at line 26 of file KoOptimizedPixelDataScalerU8ToU16.h.

27 {
28 const int numColorChannels = m_channelsPerPixel * numColumns;
29
30#if XSIMD_WITH_AVX2
31 using uint16_avx_v = xsimd::batch<uint16_t, xsimd::avx2>;
32 using uint16_v = xsimd::batch<uint16_t, xsimd::sse4_1>;
33 using uint8_v = xsimd::batch<uint8_t, xsimd::sse4_1>;
34
35 const int channelsPerAvx2Block = 16;
36 const int channelsPerSse2Block = 8;
37 const int avx2Block = numColorChannels / channelsPerAvx2Block;
38 const int rest = numColorChannels % channelsPerAvx2Block;
39 const int sse2Block = rest / channelsPerSse2Block;
40 const int scalarBlock = rest % channelsPerSse2Block;
41#elif (XSIMD_WITH_SSE4_1 || XSIMD_WITH_NEON || XSIMD_WITH_NEON64)
42#if XSIMD_WITH_SSE4_1
43 using uint16_v = xsimd::batch<uint16_t, xsimd::sse4_1>;
44 using uint8_v = xsimd::batch<uint8_t, xsimd::sse4_1>;
45#elif XSIMD_WITH_NEON64
46 using uint16_v = xsimd::batch<uint16_t, xsimd::neon64>;
47 using uint8_v = xsimd::batch<uint8_t, xsimd::neon64>;
48#else
49 using uint16_v = xsimd::batch<uint16_t, xsimd::neon>;
50 using uint8_v = xsimd::batch<uint8_t, xsimd::neon>;
51#endif
52
53 const int channelsPerSse2Block = 8;
54 const int avx2Block = 0;
55 const int sse2Block = numColorChannels / channelsPerSse2Block;
56 const int scalarBlock = numColorChannels % channelsPerSse2Block;
57#else
58 const int avx2Block = 0;
59 const int sse2Block = 0;
60 const int scalarBlock = numColorChannels;
61#endif
62
63 // qWarning() << ppVar(avx2Block) << ppVar(sse2Block);
64
65 for (int row = 0; row < numRows; row++) {
66 const quint8 *srcPtr = src;
67 auto *dstPtr = reinterpret_cast<quint16 *>(dst);
68
69#if XSIMD_WITH_AVX2
70 for (int i = 0; i < avx2Block; i++) {
71 const auto x = uint8_v::load_unaligned(srcPtr);
72
73 uint16_avx_v y(_mm256_cvtepu8_epi16(x));
74 const auto y_shifted = y << 8;
75 y |= y_shifted;
76
77 y.store_unaligned(
78 reinterpret_cast<typename uint16_avx_v::value_type *>(dstPtr));
79
80 srcPtr += channelsPerAvx2Block;
81 dstPtr += channelsPerAvx2Block;
82 }
83#else
84 Q_UNUSED(avx2Block);
85#endif
86
87#if (XSIMD_WITH_SSE4_1 || XSIMD_WITH_NEON || XSIMD_WITH_NEON64)
88 for (int i = 0; i < sse2Block; i++) {
89#if XSIMD_WITH_SSE4_1
90 const uint8_v x(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(srcPtr)));
91#else
92 const uint8_v x(vreinterpretq_u8_u32(vcombine_u32(
93 vld1_u32(reinterpret_cast<const uint32_t *>(srcPtr)),
94 vcreate_u32(0))));
95#endif
96#if XSIMD_WITH_SSE4_1
97 uint16_v y(_mm_cvtepu8_epi16(x.data));
98#else
99 uint16_v y(vmovl_u8(vget_low_u8(x.data)));
100#endif
101 const auto y_shifted = y << 8;
102 y |= y_shifted;
103
104 y.store_unaligned(reinterpret_cast<typename uint16_v::value_type *>(dstPtr));
105
106 srcPtr += channelsPerSse2Block;
107 dstPtr += channelsPerSse2Block;
108 }
109#else
110 Q_UNUSED(sse2Block);
111#endif
112
113 for (int i = 0; i < scalarBlock; i++) {
114 const quint16 value = *srcPtr;
115
116 *dstPtr = static_cast<quint16>(value | (value << 8));
117
118 srcPtr++;
119 dstPtr++;
120 }
121
122 src += srcRowStride;
123 dst += dstRowStride;
124 }
125 }

References KoOptimizedPixelDataScalerU8ToU16Base::m_channelsPerPixel. (Note: the generated cross-reference to a free function value() is spurious — `value` in the listing above is a local variable in the scalar fallback loop, not a call to that function.)


The documentation for this class was generated from the following file: