2016-11-17 41 views
-1

我正在编程一个完美的程序来并行化多媒体扩展。该程序包括转换图像,所以我通过一个矩阵,我修改其中的每个像素。为了加快速度,我使用了多媒体扩展:avx较慢,然后sse多媒体扩展

起初我使用了SSE3扩展并实现了2.5加速。接下来,我编程扩展了使用AVX扩展(双倍大小矢量)的sse算法,但是我没有得到SSE3的收益。用SSE执行程序的时间或多或少与AVX相同。 这里是SSE和AVX,分别代码的总结:

 for(i=0; i<lim; i+=12) { //tam vale n*n o n*n-12 dependiendo de si n*n es multiplo de 12. 12 ya que 3componentes*4pixeles(4 tamvector) 
      vectorR = _mm_set_ps(matrix[i+9], matrix[i+6], matrix[i+3], matrix[i]); 
      vectorG = _mm_set_ps(matrix[i+10], matrix[i+7], matrix[i+4], matrix[i+1]); 
      vectorB = _mm_set_ps(matrix[i+11], matrix[i+8], matrix[i+5], matrix[i+2]); 

      calcular_coeficientes_sse3(Ycoef0, Ycoef1, Ycoef2, Ucoef0, Vcoef0, vectorR, vectorG, vectorB, 
            values0, values255, values128, &vectorY, &vectorU, &vectorV); 

      _mm_store_ps(&y_aux[0], vectorY); 
      _mm_store_ps(&u_aux[0], vectorU); 
      _mm_store_ps(&v_aux[0], vectorV); 

      //Colocamos los datos en la matriz 
      //PIXEL 1 
      matrix[i] = y_aux[0]; 
      matrix[i+1]=u_aux[0]; 
      matrix[i+2]=v_aux[0]; 
      //PIXEL 2 
      matrix[i+3]=y_aux[1]; 
      matrix[i+4]=u_aux[1]; 
      matrix[i+5]=v_aux[1]; 
      //PIXEL 3 
      matrix[i+6] = y_aux[2];; 
      matrix[i+7]=u_aux[2]; 
      matrix[i+8]=v_aux[2]; 
      //PIXEL 4 
      matrix[i+9]=y_aux[3]; 
      matrix[i+10]=u_aux[3]; 
      matrix[i+11]=v_aux[3]; 
    } 

for(i=0; i<lim; i+=24) { //Vamos de 8 en 8 pixeles 
      vectorR = _mm256_set_ps(matrix[i+21], matrix[i+18], matrix[i+15] ,matrix[i+12],matrix[i+9], matrix[i+6], matrix[i+3], matrix[i]); 
      vectorG = _mm256_set_ps(matrix[i+22], matrix[i+19], matrix[i+16], matrix[i+13], matrix[i+10], matrix[i+7], matrix[i+4], matrix[i+1]); 
      vectorB = _mm256_set_ps(matrix[i+23], matrix[i+20], matrix[i+17], matrix[i+14], matrix[i+11], matrix[i+8], matrix[i+5], matrix[i+2]); 

      calcular_coeficientes_avx(Ycoef0, Ycoef1, Ycoef2, Ucoef0, Vcoef0, vectorR, vectorG, vectorB, 
            values0, values255, values128, &vectorY, &vectorU, &vectorV); 

      _mm256_store_ps(&y_aux[0], vectorY); 
      _mm256_store_ps(&u_aux[0], vectorU); 
      _mm256_store_ps(&v_aux[0], vectorV); 

      //Colocamos los datos en la matriz 
      //PIXEL 1 
      matrix[i] = y_aux[0]; 
      matrix[i+1]=u_aux[0]; 
      matrix[i+2]=v_aux[0]; 
      //PIXEL 2 
      matrix[i+3]=y_aux[1]; 
      matrix[i+4]=u_aux[1]; 
      matrix[i+5]=v_aux[1]; 
      //PIXEL 3 
      matrix[i+6] = y_aux[2];; 
      matrix[i+7]=u_aux[2]; 
      matrix[i+8]=v_aux[2]; 
      //PIXEL 4 
      matrix[i+9]=y_aux[3]; 
      matrix[i+10]=u_aux[3]; 
      matrix[i+11]=v_aux[3]; 
      //PIXEL 5 
      matrix[i+12]=y_aux[4]; 
      matrix[i+13]=u_aux[4]; 
      matrix[i+14]=v_aux[4]; 
      //PIXEL 6 
      matrix[i+15]=y_aux[5]; 
      matrix[i+16]=u_aux[5]; 
      matrix[i+17]=v_aux[5]; 
      //PIXEL 7 
      matrix[i+18]=y_aux[6]; 
      matrix[i+19]=u_aux[6]; 
      matrix[i+20]=v_aux[6]; 
      //PIXEL 8 
      matrix[i+21]=y_aux[7]; 
      matrix[i+22]=u_aux[7]; 
      matrix[i+23]=v_aux[7]; 
    } 

void calcular_coeficientes_sse3(__m128 Ycoef0, __m128 Ycoef1, __m128 Ycoef2, __m128 Ucoef0, __m128 Vcoef0, __m128 vectorR, 
          __m128 vectorG, __m128 vectorB, __m128 values0, __m128 values255, __m128 values128, 
            __m128 *vectorY, __m128 *vectorU, __m128 *vectorV) { 

    //CALCULO DE Y3, Y2, Y1, Y0 (Cuatro píxeles consecutivos) 
                //PRIMERA VUELta 
    __m128 valores1 = _mm_mul_ps(Ycoef0, vectorR); // valores1 = (0.299*R[3], 0.299*R[2], 0.299*R[1], 0.299*R[0]) 
    __m128 valores2 = _mm_mul_ps(Ycoef1, vectorG); // valores2 = (0.587*G[3], 0.587*G[2], 0.587*G[1], 0.587*G[0]) 
    __m128 valores3 = _mm_mul_ps(Ycoef2, vectorB); // valores3 = (0.114*B[3], 0.114*B[2], 0.114*B[1], 0.114*B[0]); 
    valores1 = _mm_add_ps(valores1, valores2);  // valores1 = (0.299*R[3] + 0.587*G[3], 0.299*R[2] + 0.587*G[2], 0.299*G[1]+ ..., ...) 
    *vectorY = _mm_add_ps(valores1, valores3);  // vectorY = (Y[3], Y[2], Y[1], Y[0]) 
    *vectorY = _mm_floor_ps(*vectorY); 
    //Calculo de U3, U2, U1, U0 
    //B-Y 
    valores1 = _mm_sub_ps(vectorB, *vectorY);  // valores1 = (B[3]-Y[3], B[2]-Y[2], B[1]-Y[1], B[0]-Y[0]) 
    valores1 = _mm_mul_ps(Ucoef0, valores1);  // valores1 = (0.492*(B[3]-Y[3]), 0.492*(B[2]-Y[2]), 0.492*(B[1]-Y[1]), 0.492*(...)) 
    *vectorU = _mm_add_ps(valores1, values128); // vectorU = (U[3], U[2], U[1], U[0]) 
    //CALCULO DE V3, V2, V1, V0 
    // R-Y 
    valores1 = _mm_sub_ps(vectorR, *vectorY);  // valores1 = (R[3]-Y[3], R[2]-Y[2], R[1]-Y[1], R[0]-Y[0]) 
    valores1 = _mm_mul_ps(Vcoef0, valores1);  // valores1 = (0.877*(R[3]-Y[3]), 0.877*(R[2]-Y[2]), 0.877*(R[1]-Y[1]), 0.877*(...)) 
    valores1 = _mm_add_ps(valores1, values128); // valores1 = (0.877*(R[3]-Y[3]) + 128, 0.877*(R[2]-Y[2]) + 128, ..., ...) 
    //valores1 pueden necesitar saturacion. 

    //SATURACIONES a 0 
    //Para evitar hacer comparaciones cogemos el mayor entre 0 y el valor V[i]: 
                  // Si V[i] > 0 se queda con V[i] pues es mayor que 0. 
                  // Si V[i] < 0 se queda con 0 pues es mayor que un número negativo. 
    valores1 = _mm_max_ps(valores1, values0);  // valores1 = (max(0.877*(R[3]-Y[3]) + 128,0), ..., ..., ...) 

    // SATURACIONES a 255 
    //Para evitar hacer comparacion cogemos el menor entre 255 y el valor V[i] 
                  // Si V[i] < 255 entonces se queda con el menor, V[i] 
                  // Si V[i] > 255 entonces se queda con el menor, 255. 
    *vectorV = _mm_min_ps(valores1, values255); //vectorV = (V[3], V[2], V[1], V[0]) 

    //NOTA: Al estar las operaciones implementadas en hardware se hacen las operaciones max y min en un 1 ciclo. 
    //por lo que solo en dos ciclos comprobamos la saturacion de 4 valores V. 

    return; //El procedimiento termina devolviendo vectorY, vectorU y vectorV 

}

void calcular_coeficientes_avx(__m256 Ycoef0, __m256 Ycoef1, __m256 Ycoef2, __m256 Ucoef0, __m256 Vcoef0, __m256 vectorR, 
          __m256 vectorG, __m256 vectorB, __m256 values0, __m256 values255, __m256 values128, 
            __m256 *vectorY, __m256 *vectorU, __m256 *vectorV) { 

    //CALCULO DE Y7, Y6, Y5, Y4, Y3, Y2, Y1, Y0 (Cuatro píxeles consecutivos) 

    __m256 valores1 = _mm256_mul_ps(Ycoef0, vectorR); 
    __m256 valores2 = _mm256_mul_ps(Ycoef1, vectorG); 
    __m256 valores3 = _mm256_mul_ps(Ycoef2, vectorB); 
    valores1 = _mm256_add_ps(valores1, valores2); 
    *vectorY = _mm256_add_ps(valores1, valores3); 
    *vectorY = _mm256_floor_ps(*vectorY); 
    //Calculo de U7, U6, U5, U4, U3, U2, U1, U0 
    valores1 = _mm256_sub_ps(vectorB, *vectorY); 
    valores1 = _mm256_mul_ps(Ucoef0, valores1); 
    *vectorU = _mm256_add_ps(valores1, values128); 
    //CALCULO DE V7, V6, V5, V4, V3, V2, V1, V0 
    // R-Y 
    valores1 = _mm256_sub_ps(vectorR, *vectorY); 
    valores1 = _mm256_mul_ps(Vcoef0, valores1); 
    valores1 = _mm256_add_ps(valores1, values128); 
    //valores1 pueden necesitar saturacion. 

    //SATURACIONES a 0 
    //Para evitar hacer comparaciones cogemos el mayor entre 0 y el valor V[i]: 


    valores1 = _mm256_max_ps(valores1, values0); 

    // SATURACIONES a 255 
    //Para evitar hacer comparacion cogemos el menor entre 255 y el valor V[i] 
                  // Si V[i] < 255 entonces se queda con el menor, V[i] 
                  // Si V[i] > 255 entonces se queda con el menor, 255. 
    *vectorV = _mm256_min_ps(valores1, values255); //vectorV = (V[3], V[2], V[1], V[0]) 

    //NOTA: Al estar las operaciones implementadas en hardware se hacen las operaciones max y min en un 1 ciclo. 
    //por lo que solo en dos ciclos comprobamos la saturacion de 4 valores V. 

    return; //El procedimiento termina devolviendo vectorY, vectorU y vectorV 

}

正如你可以看到两个SSE和AVX是相同的,但最后是扩展并使用更长的大小矢量。为什么执行时间相同?

注意:我已经在两台带有AVX支持的计算机上试过(显然),我也有同样的问题。

非常感谢。

+2

除了在商店瓶颈,工作是在'calcular_coeficientes_avx'和'calcular_coeficientes_sse'完成的,但这些功能都没有。另外,AVX512标签可能会引起误解。 –

+2

'_mm256_set_ps(矩阵[i + 21],矩阵[i + 18],...'请不要这样做,也不要在SSE中,每当你写这样的东西时,一只小狗就会死亡。 ,然后洗牌 – harold

+1

*我在两台不同的计算机上试过*:不同的是:它们都是Intel SnB系列,还是AMD?如果*你的代码写得很好,AMD CPU通常不会获得太多好处从使用256b AVX。请参阅http://agner.org/optimize/,和其他链接在[x86标签wiki](http://stackoverflow.com/tags/x86/info)。 –

回答

2

使用RGB到YUV的浮点算法是巨大的矫枉过正,所以我会在这里使用定点算法。输出格式非常烦人,可能有一种比我使用的方法更加合理的方法,但无论您做什么,当您想要对YUV数据执行任何操作时都会再次令人讨厌。使用[YYYYYYYY UUUUUUUU VVVVVVVV]等格式会更有意义。

虽然已经有很多RGB到YUV转换器可用,但是我仍然尝试了其他尝试。如果有更多的替代品可用,并且它们中的大多数不太匹配 - 例如进行色度子采样并且通常使用其他输出格式,这并不会造成什么影响。

我还没有测试它(除了检查它编译),所以任何有关正确性或性能的意见将不胜感激。

void toYUV(uint8_t* pixels, int size) 
{ 
    // r g b r g b r g b r g b r g b r | g b r g b r g b 

    short rw = 9798; // (short)round(0.299 * 2^15) 
    short gw = 19234;// (short)round(0.587 * 2^15) 
    short bw = 3736; // (short)round(0.114 * 2^15) 
    char _ = -1; 

    __m128i z = _mm_setzero_si128(); 

    for (int i = 0; i < size; i += 24) 
    { 
    __m128i p0 = _mm_loadu_si128((__m128i*)(pixels + i)); 
    __m128i p1 = _mm_loadl_epi64((__m128i*)(pixels + i + 16)); 

    // widen 
    // w0 = r0 g0 b0 r1 g1 b1 r2 g2 
    __m128i w0 = _mm_unpacklo_epi8(p0, z); 
    // w1 = b2 r3 g3 b3 r4 g4 b4 r5 
    __m128i w1 = _mm_unpackhi_epi8(p0, z); 
    // w2 = g5 b5 r6 g6 b6 r7 g7 b7 
    __m128i w2 = _mm_unpacklo_epi8(p1, z); 

    __m128i zRGBRGBz = _mm_setr_epi16(0, rw, gw, bw, rw, gw, bw, 0); 
    // calculate Y 
    // 0+r0 g0+b0 r1+g1 b1+0 
    __m128i s0 = _mm_madd_epi16(_mm_bslli_si128(w0, 2), zRGBRGBz); 
    // 0+r2 g2+b2 r3+g3 b3+0 
    __m128i s1 = _mm_madd_epi16(_mm_alignr_epi8(w1, w0, 10), zRGBRGBz); 
    // 0+r4 g4+b4 r5+g5 b5+0 
    __m128i s2 = _mm_madd_epi16(_mm_alignr_epi8(w2, w1, 6), zRGBRGBz); 
    // 0+r6 g6+b6 r7+g7 b7+0 
    __m128i s3 = _mm_madd_epi16(_mm_bsrli_si128(w2, 2), zRGBRGBz); 
    // the math works out to make the Y's in [0 .. 254] after scaling back, so 
    // y0 = y0 0 y1 0 y2 0 y3 0 
    // y1 = y4 0 y5 0 y6 0 y7 0 
    __m128i y0 = _mm_srli_epi32(_mm_hadd_epi32(s0, s1), 15); 
    __m128i y1 = _mm_srli_epi32(_mm_hadd_epi32(s2, s3), 15); 
    __m128i y = _mm_packus_epi32(y0, y1); 
    y = _mm_packus_epi16(y, y); 

    // calculate U 
    // todo: do smarter shuffles? 
    // b0 = 0 b0 0 b1 0 b2 0 b3 
    __m128i b0 = _mm_or_si128(
     _mm_shuffle_epi8(w0, _mm_setr_epi8(_, _, 4, 5, _, _, 10, 11, _, _, _, _, _, _, _, _)), 
     _mm_shuffle_epi8(w1, _mm_setr_epi8(_, _, _, _, _, _, _, _, _, _, 0, 1, _, _, 6, 7))); 
    // b1 = 0 b4 0 b5 0 b6 0 b7 
    __m128i b1 = _mm_or_si128(
     _mm_shuffle_epi8(w1, _mm_setr_epi8(_, _, 13, 14, _, _, _, _, _, _, _, _, _, _, _, _)), 
     _mm_shuffle_epi8(w2, _mm_setr_epi8(_, _, _, _, _, _, 2, 3, _, _, 8, 9, _, _, 14, 15))); 
    // b - y in [-225 .. 226] 
    // u = 18492 * (b - y) >> 15 -> (18492 * b + -18492 * y) >> 15 
    // .. so u in [-128 .. 127] 
    short us = 18492; 
    __m128i u0 = _mm_madd_epi16(_mm_or_si128(b0, y0), _mm_setr_epi16(-us, us, -us, us, -us, us, -us, us)); 
    u0 = _mm_srai_epi32(u0, 15); 
    __m128i u1 = _mm_madd_epi16(_mm_or_si128(b1, y1), _mm_setr_epi16(-us, us, -us, us, -us, us, -us, us)); 
    u1 = _mm_srai_epi32(u1, 15); 
    // pack to sbytes 
    __m128i u = _mm_packs_epi32(u0, u1); 
    u = _mm_packs_epi16(u, u); 

    // calculate V 
    // todo: do smarter shuffles? 
    // r0 = 0 r0 0 r1 0 r2 0 r3 
    __m128i r0 = _mm_or_si128(
     _mm_shuffle_epi8(w0, _mm_setr_epi8(_, _, 0, 1, _, _, 6, 7, _, _, 12, 13, _, _, _, _)), 
     _mm_shuffle_epi8(w1, _mm_setr_epi8(_, _, _, _, _, _, _, _, _, _, _, _, _, _, 2, 3))); 
    // r1 = 0 r4 0 r5 0 r6 0 r7 
    __m128i r1 = _mm_or_si128(
     _mm_shuffle_epi8(w1, _mm_setr_epi8(_, _, 8, 9, _, _, 14, 15, _, _, _, _, _, _, _, _)), 
     _mm_shuffle_epi8(w2, _mm_setr_epi8(_, _, _, _, _, _, _, _, _, _, 4, 5, _, _, 10, 11))); 
    // r - y in [-178 .. 179] 
    // v = 23372 * (r - y) >> 15 -> (23372 * r + -23372 * y) >> 15 
    // .. so v in [-128 .. 127] 
    short vs = 23372; 
    __m128i v0 = _mm_madd_epi16(_mm_or_si128(r0, y0), _mm_setr_epi16(-vs, vs, -vs, vs, -vs, vs, -vs, vs)); 
    v0 = _mm_srai_epi32(v0, 15); 
    __m128i v1 = _mm_madd_epi16(_mm_or_si128(r1, y1), _mm_setr_epi16(-vs, vs, -vs, vs, -vs, vs, -vs, vs)); 
    v1 = _mm_srai_epi32(v1, 15); 
    // pack to sbytes 
    __m128i v = _mm_packs_epi32(v0, v1); 
    v = _mm_packs_epi16(v, v); 

    // interleave :(
    // y0 u0 v0 y1 u1 v1 y2 u2 v2 y3 u3 v3 y4 u4 v4 y5 
    // u5 v5 y6 u6 v6 y7 u7 v7 
    __m128i shuf0 = _mm_setr_epi8(0, _, _, 1, _, _, 2, _, _, 3, _, _, 4, _, _, 5); 
    p0 = _mm_or_si128(_mm_or_si128(
     _mm_shuffle_epi8(y, shuf0), 
     _mm_shuffle_epi8(u, _mm_bslli_si128(shuf0, 1))), 
     _mm_shuffle_epi8(v, _mm_bslli_si128(shuf0, 2))); 
    __m128i shuf1 = _mm_setr_epi8(_, 5, _, _, 6, _, _, 7, _, _, _, _, _, _, _, _); 
    p1 = _mm_or_si128(_mm_or_si128(
     _mm_shuffle_epi8(y, _mm_srli_si128(shuf1, 2)), 
     _mm_shuffle_epi8(u, _mm_srli_si128(shuf1, 1))), 
     _mm_shuffle_epi8(v, shuf1)); 

    _mm_storeu_si128((__m128i*)(pixels + i), p0); 
    _mm_storel_epi64((__m128i*)(pixels + i + 16), p1); 
    } 
} 

这并不是很好,它对所有这些洗牌和(联合)打包都太重了。在常春藤这不是一个问题。使用较少交错的输出格式将会有很大帮助,节省6次洗牌。当然,由于这是固定点,你不能真正将其转换为AVX,但你可以试试AVX2。

这是一个“更垂直”的版本,基于pmulhrsw而不是pmaddwd,也未经过测试(可能已损坏)。 asm看起来更好,但我还没有测试性能。

void toYUV(uint8_t* pixels, int size) 
{ 
    // r g b r g b r g b r g b r g b r | g b r g b r g b 
    char _ = -1; 
    __m128i rshuf0 = _mm_setr_epi8(0, _, 3, _, 6, _, 9, _, 12, _, 15, _, _, _, _, _); 
    __m128i rshuf1 = _mm_setr_epi8(_, _, _, _, _, _, _, _, _, _, _, _, 2, _, 5, _); 
    __m128i gshuf0 = _mm_setr_epi8(1, _, 4, _, 7, _, 10, _, 13, _, _, _, _, _, _, _); 
    __m128i gshuf1 = _mm_setr_epi8(_, _, _, _, _, _, _, _, _, _, 0, _, 3, _, 6, _); 
    __m128i bshuf0 = _mm_setr_epi8(2, _, 5, _, 8, _, 11, _, 14, _, _, _, _, _, _, _); 
    __m128i bshuf1 = _mm_setr_epi8(_, _, _, _, _, _, _, _, _, _, 1, _, 2, _, 5, _); 

    __m128i ryweight = _mm_set1_epi16(9798); // (short)round(0.299 * 2^15) 
    __m128i gyweight = _mm_set1_epi16(19235);// (short)round(0.587 * 2^15) 
    __m128i byweight = _mm_set1_epi16(3736); // (short)round(0.114 * 2^15) 
    __m128i uscale = _mm_set1_epi16(18492); // (short)floor(2^15/(2^15 - byweight) * 0.5 * 2^15) 
    __m128i uofs = _mm_set1_epi16(128); 
    __m128i vscale = _mm_set1_epi16(23372); // (short)floor(2^15/(2^15 - ryweight) * 0.5 * 2^15) 
    __m128i vofs = _mm_set1_epi16(128); 

    __m128i yshuf0 = _mm_setr_epi8(0, _, _, 2, _, _, 4, _, _, 6, _, _, 8, _, _, _); 
    __m128i yshuf1 = _mm_setr_epi8(10, _, _, 12, _, _, 14, _, _, _, _, _, _, _, _, _); 
    __m128i ushuf0 = _mm_setr_epi8(_, 0, _, _, 1, _, _, 2, _, _, 3, _, _, 4, _, _); 
    __m128i ushuf1 = _mm_setr_epi8(5, _, _, 6, _, _, 7, _, _, _, _, _, _, _, _, _); 

    for (int i = 0; i < size; i += 24) 
    { 
    __m128i p0 = _mm_loadu_si128((__m128i*)(pixels + i)); 
    __m128i p1 = _mm_loadl_epi64((__m128i*)(pixels + i + 16)); 

    __m128i r = _mm_or_si128(_mm_shuffle_epi8(p0, rshuf0), _mm_shuffle_epi8(p1, rshuf1)); 
    __m128i g = _mm_or_si128(_mm_shuffle_epi8(p0, gshuf0), _mm_shuffle_epi8(p1, gshuf1)); 
    __m128i b = _mm_or_si128(_mm_shuffle_epi8(p0, bshuf0), _mm_shuffle_epi8(p1, bshuf1)); 

    __m128i scaledr = _mm_mulhrs_epi16(r, ryweight); 
    __m128i scaledg = _mm_mulhrs_epi16(g, gyweight); 
    __m128i scaledb = _mm_mulhrs_epi16(b, byweight); 
    __m128i y = _mm_add_epi16(_mm_add_epi16(scaledr, scaledg), scaledb); 
    __m128i u = _mm_mulhrs_epi16(_mm_sub_epi16(b, y), uscale); 
    __m128i v = _mm_mulhrs_epi16(_mm_sub_epi16(r, y), vscale); 
    // pack back to bytes and shuffle 
    // words in y are guaranteed to be in [0 - 255] so just shuffle them into place 
    // swords in u and v may be slightly wonky so pack with saturation first 
    u = _mm_packs_epi16(u, u); 
    v = _mm_packs_epi16(v, v); 
    p0 = _mm_or_si128(_mm_or_si128(
     _mm_shuffle_epi8(y, yshuf0), 
     _mm_shuffle_epi8(u, ushuf0)), 
     _mm_shuffle_epi8(v, _mm_bslli_si128(ushuf0, 1))); 
    p1 = _mm_or_si128(_mm_or_si128(
     _mm_shuffle_epi8(y, yshuf1), 
     _mm_shuffle_epi8(u, ushuf1)), 
     _mm_shuffle_epi8(v, _mm_bslli_si128(ushuf1, 1))); 

    _mm_storeu_si128((__m128i*)(pixels + i), p0); 
    _mm_storel_epi64((__m128i*)(pixels + i + 16), p1); 
    } 
}