This version exploits the processor's internal parallelism and performs up to 20% better in KisStrokeBenchmark than the per-pixel memcpy version (measured gains: +20% on Sandy Bridge, +10% on Merom).
240{
248 int block1 =
size / 8;
249 int block2 =
size % 8;
250
251 for (int i = 0; i < block1; i++) {
252 quint8 *d1 = buf;
253 quint8 *d2 = buf + pixelSize;
254 quint8 *d3 = buf + 2 * pixelSize;
255 quint8 *d4 = buf + 3 * pixelSize;
256 quint8 *d5 = buf + 4 * pixelSize;
257 quint8 *d6 = buf + 5 * pixelSize;
258 quint8 *d7 = buf + 6 * pixelSize;
259 quint8 *d8 = buf + 7 * pixelSize;
260
261 for (int j = 0; j < pixelSize; j++) {
262 *(d1 + j) = color[j];
263 *(d2 + j) = color[j];
264 *(d3 + j) = color[j];
265 *(d4 + j) = color[j];
266 *(d5 + j) = color[j];
267 *(d6 + j) = color[j];
268 *(d7 + j) = color[j];
269 *(d8 + j) = color[j];
270 }
271
272 buf += 8 * pixelSize;
273 }
274
275 for (int i = 0; i < block2; i++) {
276 memcpy(buf, color, pixelSize);
277 buf += pixelSize;
278 }
279}