2012-02-13 46 views
1

我试图利用英特尔编译器的自动矢量化以及SSE来加速一些代码。所有计算都是把一个struct node_t转换为另一个struct w_t(函数tr()和gen_tr())。当我尝试对函数gen_tr()进行矢量化时,没有产生任何效果。(问题标题:使用自动矢量化和SSE得到的加速取决于数据大小。)

如果更改数据存储格式,把struct的每个成员分别存储在各自独立的float数组中,自动矢量化就能很好地工作,请参阅函数genv_tr()。

使用SSE的函数称为ssev_tr(N应能被4整除)。

transform.c:

#include <stdio.h> 
#include <stdlib.h> 
#include <malloc.h> 
#include <xmmintrin.h> 

/*
 * Read the CPU time-stamp counter with the x86 RDTSC instruction.
 *
 * RDTSC delivers the counter in two 32-bit halves (low half in EAX, high
 * half in EDX). The halves are combined in a 64-bit intermediate so the
 * shift is well defined even where unsigned long is only 32 bits wide; in
 * that case the return type keeps just the low 32 bits.
 */
static __inline__ unsigned long getCC(void)
{
    unsigned int lo, hi;

    /* __asm__/__volatile__ are accepted in strict ISO modes, unlike `asm`. */
    __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
    return (unsigned long)((((unsigned long long)hi) << 32) | lo);
}

/* Input record of the transform: five single-precision fields x1..x5. */
typedef struct {
    float x1;
    float x2;
    float x3;
    float x4;
    float x5;
} node_t;

/* Output record of the transform: four single-precision fields w1..w4. */
typedef struct {
    float w1;
    float w2;
    float w3;
    float w4;
} w_t;

/*
 * Transform one input record into one output record:
 *   w1 = x1 - x3 * c1,   w2 = x1 + x3 * c1,
 *   w3 = x2 - x4 * c2,   w4 = x2 + x4 * c2.
 * All inputs are read before any output field is written; n->x5 is not used.
 */
void tr(node_t *n, float c1, float c2, w_t *w)
{
    const float base_a = n->x1;
    const float base_b = n->x2;
    const float delta_a = n->x3 * c1;
    const float delta_b = n->x4 * c2;

    w->w1 = base_a - delta_a;
    w->w2 = base_a + delta_a;
    w->w3 = base_b - delta_b;
    w->w4 = base_b + delta_b;
}

/*
 * Array-of-structs driver: applies tr() to each of the N element pairs
 * (n[i], w[i]) with the same coefficients c1 and c2.
 *
 * Only `#pragma ivdep` is kept as a hint on the loop. `#pragma vector
 * aligned` was dropped on purpose: node_t holds five floats (20 bytes with
 * 4-byte float), so consecutive elements of n cannot all start on a 16-byte
 * boundary and asserting aligned accesses for this loop would be a false
 * promise to the compiler.
 */
__attribute__ ((noinline))
void gen_tr(node_t *n, w_t *w, const int N, float c1, float c2)
{
    int i;
    #pragma ivdep
    for (i = 0; i < N; i++) {
        tr(n + i, c1, c2, w + i);
    }
}

/*
 * Structure-of-arrays form of the transform. For every k in [0, N):
 *   w1[k] = x1[k] - x3[k] * c1,   w2[k] = x1[k] + x3[k] * c1,
 *   w3[k] = x2[k] - x4[k] * c2,   w4[k] = x2[k] + x4[k] * c2.
 * x5 is accepted but never accessed. The two pragmas are vectorization
 * hints placed on the loop for the Intel compiler.
 */
__attribute__ ((noinline))
void genv_tr(float *x1, float *x2, float *x3, float *x4, float *x5, float *w1, float *w2, float *w3, float *w4, const int N, float c1, float c2)
{
    int k;
    #pragma vector aligned
    #pragma ivdep
    for (k = 0; k < N; ++k) {
        const float scaled3 = x3[k] * c1;
        const float scaled4 = x4[k] * c2;

        w1[k] = x1[k] - scaled3;
        w2[k] = x1[k] + scaled3;
        w3[k] = x2[k] - scaled4;
        w4[k] = x2[k] + scaled4;
    }
}

/*
 * Hand-written SSE form of the structure-of-arrays transform. For every
 * i in [0, N):
 *   w1[i] = x1[i] - x3[i] * c1,   w2[i] = x1[i] + x3[i] * c1,
 *   w3[i] = x2[i] - x4[i] * c2,   w4[i] = x2[i] + x4[i] * c2.
 *
 * The first N/4 groups of four floats are processed through __m128
 * pointers, so all eight accessed arrays must be 16-byte aligned. Any
 * remaining N % 4 elements are handled by a scalar tail loop instead of
 * being silently left unwritten. x5 is accepted but never accessed.
 */
__attribute__ ((noinline))
void ssev_tr(float *x1, float *x2, float *x3, float *x4, float *x5, float *w1, float *w2, float *w3, float *w4, const int N, float c1, float c2)
{
    __m128 *ws1 = (__m128*)w1;
    __m128 *ws2 = (__m128*)w2;
    __m128 *ws3 = (__m128*)w3;
    __m128 *ws4 = (__m128*)w4;

    __m128 *xs1 = (__m128*)x1;
    __m128 *xs2 = (__m128*)x2;
    __m128 *xs3 = (__m128*)x3;
    __m128 *xs4 = (__m128*)x4;

    /* Broadcast each coefficient into all four lanes. */
    const __m128 cs1 = _mm_set1_ps(c1);
    const __m128 cs2 = _mm_set1_ps(c2);

    const int nvec = N / 4;   /* number of complete 4-float groups */
    int i;

    (void)x5;

    #pragma vector aligned
    #pragma ivdep
    for (i = 0; i < nvec; i++) {
        const __m128 N00T = _mm_mul_ps(xs3[i], cs1);
        const __m128 N01T = _mm_mul_ps(xs4[i], cs2);

        ws1[i] = _mm_sub_ps(xs1[i], N00T);
        ws2[i] = _mm_add_ps(xs1[i], N00T);
        ws3[i] = _mm_sub_ps(xs2[i], N01T);
        ws4[i] = _mm_add_ps(xs2[i], N01T);
    }

    /* Scalar tail: elements past the last complete group of four. */
    for (i = nvec * 4; i < N; i++) {
        const float N00T = x3[i] * c1;
        const float N01T = x4[i] * c2;

        w1[i] = x1[i] - N00T;
        w2[i] = x1[i] + N00T;
        w3[i] = x2[i] - N01T;
        w4[i] = x2[i] + N01T;
    }
}

/*
 * Benchmark harness for an array-of-structs kernel `func`.
 *
 * Expands in a scope that must provide: i, n, rep (loop/size counters),
 * x and w (the struct arrays handed to func), c1, c2 (coefficients) and
 * t1, t2 (getCC() readings). It fills x[0..n) with fixed values, calls
 * func(x, w, n, c1, c2) rep times, and prints a tab followed by the
 * elapsed getCC() ticks divided by n and by rep.
 *
 * Wrapped in do { } while (0) so that `test(f);` is exactly one statement.
 */
#define test(func) \
    do { \
        for (i = 0; i < n; i++) { \
            x[i].x1 = 1.0; \
            x[i].x2 = 2.0; \
            x[i].x3 = 2.0; \
            x[i].x4 = 2.0; \
            x[i].x5 = 2.0; \
        } \
        \
        t1 = getCC(); \
        for (i = 0; i < rep; i++) { \
            func(x, w, n, c1, c2); \
        } \
        t2 = getCC(); \
        printf("\t%f", ((double)(t2 - t1))/n/rep); \
    } while (0)

/*
 * Benchmark harness for a structure-of-arrays kernel `func`.
 *
 * Expands in a scope that must provide: i, n, rep (loop/size counters),
 * x1..x5 and w1..w4 (the float arrays handed to func), c1, c2
 * (coefficients) and t1, t2 (getCC() readings). It fills x1..x5 over
 * [0, n) with fixed values, calls
 * func(x1, x2, x3, x4, x5, w1, w2, w3, w4, n, c1, c2) rep times, and prints
 * a tab followed by the elapsed getCC() ticks divided by n and by rep.
 *
 * Wrapped in do { } while (0) so that `test1(f);` is exactly one statement.
 */
#define test1(func) \
    do { \
        for (i = 0; i < n; i++) { \
            x1[i] = 1.0; \
            x2[i] = 2.0; \
            x3[i] = 2.0; \
            x4[i] = 2.0; \
            x5[i] = 2.0; \
        } \
        \
        t1 = getCC(); \
        for (i = 0; i < rep; i++) { \
            func(x1, x2, x3, x4, x5, w1, w2, w3, w4, n, c1, c2); \
        } \
        t2 = getCC(); \
        printf("\t%f", ((double)(t2 - t1))/n/rep); \
    } while (0)

/*
 * Allocate `bytes` bytes on a 16-byte boundary (what the __m128 accesses in
 * ssev_tr need); print a message and terminate the program on failure.
 * Memory obtained here must be released with _mm_free().
 */
static void *alloc_aligned16(size_t bytes)
{
    void *p = _mm_malloc(bytes, 16);

    if (p == NULL) {
        fprintf(stderr, "out of memory (%lu bytes)\n", (unsigned long)bytes);
        exit(EXIT_FAILURE);
    }
    return p;
}

/*
 * Benchmark driver.
 *
 * Usage: <program> vector_size
 *
 * Prints vector_size, then for gen_tr, genv_tr and ssev_tr the measured
 * getCC() ticks per element (tab separated), then a newline. Each kernel is
 * repeated 100000000 / vector_size times.
 *
 * Returns 0 on success and EXIT_FAILURE when the argument is missing or is
 * not an integer in [1, 100000000] (the upper bound keeps the repetition
 * count at least 1).
 */
int main(int argc, char *argv[])
{
    if (argc < 2) {
        printf("Usage %s vector_size\n", argv[0]);
        return EXIT_FAILURE;   /* argv[1] does not exist; do not go on */
    }

    char *end = NULL;
    const long requested = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || requested < 1 || requested > 100000000L) {
        fprintf(stderr, "vector_size must be an integer in [1, 100000000]\n");
        return EXIT_FAILURE;
    }

    int n = (int)requested;
    printf("%d", n);
    int rep = 100000000/n;
    int i;
    float c1 = 2.0f, c2 = 1.0f;
    unsigned long t1, t2;
    const size_t count = (size_t)n;

    /* Array-of-structs buffers used by gen_tr. */
    node_t *x = (node_t*)alloc_aligned16(count * sizeof(node_t));
    w_t *w = (w_t*)alloc_aligned16(count * sizeof(w_t));

    /* Structure-of-arrays buffers used by genv_tr and ssev_tr. */
    float *x1 = (float*)alloc_aligned16(count * sizeof(float));
    float *x2 = (float*)alloc_aligned16(count * sizeof(float));
    float *x3 = (float*)alloc_aligned16(count * sizeof(float));
    float *x4 = (float*)alloc_aligned16(count * sizeof(float));
    float *x5 = (float*)alloc_aligned16(count * sizeof(float));

    float *w1 = (float*)alloc_aligned16(count * sizeof(float));
    float *w2 = (float*)alloc_aligned16(count * sizeof(float));
    float *w3 = (float*)alloc_aligned16(count * sizeof(float));
    float *w4 = (float*)alloc_aligned16(count * sizeof(float));

    test(gen_tr);
    test1(genv_tr);
    test1(ssev_tr);

    printf("\n");

    _mm_free(w4);
    _mm_free(w3);
    _mm_free(w2);
    _mm_free(w1);
    _mm_free(x5);
    _mm_free(x4);
    _mm_free(x3);
    _mm_free(x2);
    _mm_free(x1);
    _mm_free(w);
    _mm_free(x);
    return 0;
}

编译选项:icc -O3 -Wall -W -vec-report6 transform.c -o transform

icc版本 - 12.1.2,操作系统 - Fedora 16 x86_64,CPU - Intel Core2 Quad CPU Q8200。

然后我用不同的数据大小运行它,大小从16到3000,步长为64,脚本如下:

#!/bin/bash

# Start run.log over with a single empty line.
echo "" > run.log

# Run ./transform for sizes 16, 80, 144, ... (step 64) while size < 3000,
# showing each result line and appending it to run.log.
c=16
while (( c < 3000 ))
do
./transform $c | tee -a run.log
(( c += 64 ))
done

下面是这个脚本运行得到的部分结果(各列依次为:大小、gen_tr、genv_tr、ssev_tr),所有时间都是平均到每个数组元素的耗时:

16  7.710743  3.168577  3.253829 
272  7.166493  1.983918  2.618569 
528  7.121866  1.920195  2.567109 
784  7.115007  1.899451  2.549645 
1040 8.104026  2.481062  2.944317 
1296 8.137537  5.105032  5.104614 
1552 8.118534  5.068812  5.064211 
1808 8.138309  5.077831  5.085015 
2064 8.149699  5.107503  5.069958 
2320 8.164556  5.080981  5.099313 
2576 8.151524  5.086056  5.089294 
2832 8.212946  5.061927  5.072261 

为什么使用矢量化版本的函数时,性能对数据大小的依赖如此明显?这是因为缓存未命中吗?是否可以在整个数据大小范围内保持相同的速度?

回答

1

你有8个浮点数组。当每个数组的大小为1000时,您的测试要操作大约32kB的数据。即使您的L1缓存可能更大一些(64kB),由于缓存相联度的限制,L1缓存也可能无法同时容纳全部32kB数据。

您的测试迭代,一遍又一遍地处理相同的数据。考虑两种情况:

  • 大小 = 528:8个数组可以轻松地全部放入L1缓存。每次测试迭代(第一次除外)都可以快速访问数据。
  • 大小 = 1268:8个数组不能同时放入L1缓存。每次测试迭代都会把数据从L1中挤出去,因此所有读取和写入实际上都要访问L2。

所以,在输入大小1000附近出现的性能跳变,部分是您的测试方法造成的假象,但也不完全是。在现实世界中,如果所需的数据已经全部在L1缓存中,genv_tr会非常快。但是对于大于1000的输入,全部数据无法同时放入L1缓存,所以必然有一部分访问要落到L2。