我试图将一个函数转换为AVX版本。该函数本身基本上只是比较若干float值,并根据比较结果返回true/false。但AVX版本并没有预期的那么快。
这是原来的函数:
/**
 * Scalar reference test.
 *
 * Treats each row as two halves of COL_COUNT/2 values and walks both halves
 * in parallel. Returns true as soon as, for some index k in the lower half,
 * either
 *   thisFloat[k] < -otherFloat[COL_COUNT/2 + k]   or
 *   -thisFloat[COL_COUNT/2 + k] > otherFloat[k]
 * holds; returns false when no index satisfies either condition.
 *
 * @param thisFloat   row of at least COL_COUNT floats
 * @param otherFloat  row of at least COL_COUNT floats
 */
bool testSingle(float* thisFloat, float* otherFloat)
{
    const unsigned int half = COL_COUNT/2;
    unsigned int idx = 0;
    while (idx < half)
    {
        // First condition: lower half of "this" against the negated upper half of "other".
        if (thisFloat[idx] < -otherFloat[half + idx])
            return true;
        // Second condition is only evaluated when the first one failed.
        if (-thisFloat[half + idx] > otherFloat[idx])
            return true;
        ++idx;
    }
    return false;
}
这是AVX版本:
/**
 * AVX counterpart of the scalar test: evaluates eight comparisons per step,
 * one per float lane of a __m256.
 *
 * For each k in [0, COL_COUNT/2) the scalar thisFloat[k] is broadcast and
 * compared (<) against otherFloatInAVX[COL_COUNT/2 + k], and the negated
 * thisFloat[COL_COUNT/2 + k] is broadcast and compared (>) against
 * otherFloatInAVX[k]. Unlike the scalar version, the upper-half operand is
 * used as-is here, so the caller presumably stores already-negated values in
 * otherFloatInAVX[COL_COUNT/2 + k] -- verify against the packing code.
 *
 * @param thisFloat        row of at least COL_COUNT floats
 * @param otherFloatInAVX  row of at least COL_COUNT packed vectors
 * @return per-lane comparison mask accumulated with OR over all processed k
 */
__m256 testAVX(float* thisFloat, __m256* otherFloatInAVX)
{
    const unsigned int half = COL_COUNT/2;
    __m256 laneHits = _mm256_set1_ps(0.0f);
    for (unsigned int k = 0; k < half; ++k)
    {
        const __m256 lowerHit = _mm256_cmp_ps(_mm256_set1_ps(thisFloat[k]), otherFloatInAVX[half + k], _CMP_LT_OQ);
        const __m256 upperHit = _mm256_cmp_ps(_mm256_set1_ps(-thisFloat[half + k]), otherFloatInAVX[k], _CMP_GT_OQ);
        laneHits = _mm256_or_ps(_mm256_or_ps(lowerHit, upperHit), laneHits);
        // Each lane is an independent test, so stopping early is only valid
        // once all eight lanes have already reported a hit (mask 0xFF == 255).
        if (_mm256_movemask_ps(laneHits) == 0xFF)
            return laneHits;
    }
    return laneHits;
}
下面是完整的代码。我在开始时生成了一些随机浮点数,并将其打包保存到__m256变量中,以便在AVX版本中进行计算。变量thisFloat中的值将分别与otherFloat1、otherFloat2、……、otherFloat8进行比较。
// Number of rows allocated for every table in main.
#define ROW_COUNT 1000000
// Number of floats per row; the test functions treat a row as two halves of COL_COUNT/2 values.
#define COL_COUNT 46
/**
 * Returns a pseudo-random float obtained by scaling rand() into the
 * closed range [Min, Max].
 *
 * @param Min  lower bound of the range
 * @param Max  upper bound of the range
 */
float randomNumberFloat(float Min, float Max)
{
    // rand() / RAND_MAX lies in [0, 1].
    const float unit = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
    const float span = Max - Min;
    return unit * span + Min;
}
/**
 * Benchmark driver.
 *
 * Fills one "this" table and eight "other" tables (ROW_COUNT rows of
 * COL_COUNT floats each) with random values, packs the eight "other" tables
 * lane-wise into __m256 rows, then times testSingle (eight calls per row)
 * against testAVX (one call per row) and prints both durations in
 * milliseconds.
 */
int main(int argc, char** argv)
{
    // Allocates a table of ROW_COUNT separately heap-allocated rows of COL_COUNT floats.
    auto allocRows = []() -> float**
    {
        float** rows = new float*[ROW_COUNT];
        for (int i = 0; i < ROW_COUNT; ++i)
            rows[i] = new float[COL_COUNT];
        return rows;
    };

    // None of the buffers below are freed; main returns right after the measurements.
    float** thisFloat = allocRows();
    float** otherFloat1 = allocRows();
    float** otherFloat2 = allocRows();
    float** otherFloat3 = allocRows();
    float** otherFloat4 = allocRows();
    float** otherFloat5 = allocRows();
    float** otherFloat6 = allocRows();
    float** otherFloat7 = allocRows();
    float** otherFloat8 = allocRows();

    // save to AVX
    // NOTE(review): plain new[] only honours the 32-byte alignment of __m256
    // from C++17 on -- confirm the build uses -std=c++17 or later.
    __m256** otherFloatInAVX = new __m256*[ROW_COUNT];
    for (int i = 0; i < ROW_COUNT; ++i)
        otherFloatInAVX[i] = new __m256[COL_COUNT];

    // variable for results
    unsigned int* resultsSingle = new unsigned int[ROW_COUNT];
    __m256* resultsAVX = new __m256[ROW_COUNT];

    // Generate Random Values
    for (unsigned int i = 0; i < ROW_COUNT; i++)
    {
        for (unsigned int j = 0; j < COL_COUNT; j++)
        {
            thisFloat[i][j] = randomNumberFloat(-1000.0f, 1000.0f);
            otherFloat1[i][j] = randomNumberFloat(-1000.0f, 1000.0f);
            otherFloat2[i][j] = randomNumberFloat(-1000.0f, 1000.0f);
            otherFloat3[i][j] = randomNumberFloat(-1000.0f, 1000.0f);
            otherFloat4[i][j] = randomNumberFloat(-1000.0f, 1000.0f);
            otherFloat5[i][j] = randomNumberFloat(-1000.0f, 1000.0f);
            otherFloat6[i][j] = randomNumberFloat(-1000.0f, 1000.0f);
            otherFloat7[i][j] = randomNumberFloat(-1000.0f, 1000.0f);
            otherFloat8[i][j] = randomNumberFloat(-1000.0f, 1000.0f);
        }
        // Pack lane n of every vector from otherFloat<n+1>, so that one
        // testAVX call covers all eight "other" rows.
        for (unsigned int j = 0; j < COL_COUNT/2; j++)
        {
            const unsigned int upper = COL_COUNT/2 + j;
            otherFloatInAVX[i][j] = _mm256_setr_ps(otherFloat1[i][j], otherFloat2[i][j], otherFloat3[i][j], otherFloat4[i][j], otherFloat5[i][j], otherFloat6[i][j], otherFloat7[i][j], otherFloat8[i][j]);
            // testSingle compares against -otherFloat[COL_COUNT/2 + k], so the
            // pre-negated vector must be built from the upper half of each
            // row (index `upper`), not from index j.
            otherFloatInAVX[i][upper] = _mm256_setr_ps(-otherFloat1[i][upper], -otherFloat2[i][upper], -otherFloat3[i][upper], -otherFloat4[i][upper], -otherFloat5[i][upper], -otherFloat6[i][upper], -otherFloat7[i][upper], -otherFloat8[i][upper]);
        }
    }

    // do normal test
    auto start_normal = std::chrono::high_resolution_clock::now();
    for (unsigned int i = 0; i < ROW_COUNT; i++)
    {
        // Keep all eight outcomes (bit n <-> otherFloat<n+1>, i.e. lane n of
        // the AVX result) instead of overwriting the same slot eight times;
        // otherwise seven of the eight calls produce unused values that the
        // optimizer may drop, which would distort the timing.
        unsigned int hits = 0;
        hits |= static_cast<unsigned int>(testSingle(thisFloat[i], otherFloat1[i])) << 0;
        hits |= static_cast<unsigned int>(testSingle(thisFloat[i], otherFloat2[i])) << 1;
        hits |= static_cast<unsigned int>(testSingle(thisFloat[i], otherFloat3[i])) << 2;
        hits |= static_cast<unsigned int>(testSingle(thisFloat[i], otherFloat4[i])) << 3;
        hits |= static_cast<unsigned int>(testSingle(thisFloat[i], otherFloat5[i])) << 4;
        hits |= static_cast<unsigned int>(testSingle(thisFloat[i], otherFloat6[i])) << 5;
        hits |= static_cast<unsigned int>(testSingle(thisFloat[i], otherFloat7[i])) << 6;
        hits |= static_cast<unsigned int>(testSingle(thisFloat[i], otherFloat8[i])) << 7;
        resultsSingle[i] = hits;
    }
    auto end_normal = std::chrono::high_resolution_clock::now();
    auto duration_normal = std::chrono::duration_cast<std::chrono::milliseconds>(end_normal - start_normal);
    std::cout << "Duration of normal test: " << duration_normal.count() << " ms \n";

    // do AVX test
    auto start_avx = std::chrono::high_resolution_clock::now();
    for (unsigned int i = 0; i < ROW_COUNT; i++)
    {
        resultsAVX[i] = testAVX(thisFloat[i], otherFloatInAVX[i]);
    }
    auto end_avx = std::chrono::high_resolution_clock::now();
    auto duration_avx = std::chrono::duration_cast<std::chrono::milliseconds>(end_avx - start_avx);
    std::cout << "Duration of AVX test: " << duration_avx.count() << " ms";
    return 0;
}
然后,我测量了两者的运行时间,得到如下结果:
Duration of normal test: 290 ms
Duration of AVX test: 159 ms
AVX版本的速度是原始版本的1.82倍。
AVX版本是否还有改进的余地?还是说我使用AVX的方式不对?由于我同时进行8次计算,我原本预计它会快5到6倍。
你有没有检查过,瓶颈是否只是内存/高速缓存带宽? – Elalfer
SIMD性能的关键是对寄存器中的数据进行大量计算,以帮助掩盖从内存加载大量数据的开销。否则,你只是写了一个非常复杂的 `memcpy`。 –
我认为你的AVX例程有一个错误——它不会在与标量代码相同的条件下“提前退出”——你需要将测试从 `== 255` 改为 `!= 0`。(注意:现在是一大早,我还没喝咖啡,但粗略检查了一下,这*看起来*像是一个bug。) –