问

Haswell上的AVX2比SSE慢

菜蔸蔸发布于 2023-01-16 14:46

int

我有以下代码(普通,SSE和AVX):

int testSSE(const aligned_vector & ghs, const aligned_vector & lhs) {
    int result[4] __attribute__((aligned(16))) = {0};
    __m128i vresult = _mm_set1_epi32(0);
    __m128i v1, v2, vmax;

    for (int k = 0; k < ghs.size(); k += 4) {
        v1 = _mm_load_si128((__m128i *) & lhs[k]);
        v2 = _mm_load_si128((__m128i *) & ghs[k]);
        vmax = _mm_add_epi32(v1, v2);
        vresult = _mm_max_epi32(vresult, vmax);
    }
    _mm_store_si128((__m128i *) result, vresult);
    int mymax = result[0];
    for (int k = 1; k < 4; k++) {
        if (result[k] > mymax) {
            mymax = result[k];
        }
    }
    return mymax;
}

 int testAVX(const aligned_vector & ghs, const aligned_vector & lhs) {
    int result[8] __attribute__((aligned(32))) = {0};
    __m256i vresult = _mm256_set1_epi32(0);
    __m256i v1, v2, vmax;

    for (int k = 0; k < ghs.size(); k += 8) {
        v1 = _mm256_load_si256((__m256i *) & ghs[ k]);
        v2 = _mm256_load_si256((__m256i *) & lhs[k]);
        vmax = _mm256_add_epi32(v1, v2);
        vresult = _mm256_max_epi32(vresult, vmax);
    }
    _mm256_store_si256((__m256i *) result, vresult);
    int mymax = result[0];
    for (int k = 1; k < 8; k++) {
        if (result[k] > mymax) {
            mymax = result[k];
        }
    }
    return mymax;
}

int testNormal(const aligned_vector & ghs, const aligned_vector & lhs) {
    int max = 0;
    int tempMax;
    for (int k = 0; k < ghs.size(); k++) {
        tempMax = lhs[k] + ghs[k];
        if (max < tempMax) {
            max = tempMax;
        }
    }
    return max;
}

所有这些功能都使用以下代码进行测试:

void alignTestSSE() {
    aligned_vector lhs;
    aligned_vector ghs;

    int mySize = 4096;
    int FinalResult;
    int nofTestCases = 1000;
    double time, time1, time2, time3;
    vector lhs2;
    vector ghs2;

    lhs.resize(mySize);
    ghs.resize(mySize);
    lhs2.resize(mySize);
    ghs2.resize(mySize);

    srand(1);
    for (int k = 0; k < mySize; k++) {
        lhs[k] = randomNodeID(1000000);
        lhs2[k] = lhs[k];
        ghs[k] = randomNodeID(1000000);
        ghs2[k] = ghs[k];
    }
    /* Warming UP */
    for (int k = 0; k < nofTestCases; k++) {
        FinalResult = testNormal(lhs, ghs);
    }

    for (int k = 0; k < nofTestCases; k++) {
        FinalResult = testSSE(lhs, ghs);
    }

    for (int k = 0; k < nofTestCases; k++) {
        FinalResult = testAVX(lhs, ghs);
    }

    cout << "===========================" << endl;
    time = timestamp();
    for (int k = 0; k < nofTestCases; k++) {
        FinalResult = testSSE(lhs, ghs);
    }
    time = timestamp() - time;
    time1 = time;
    cout << "SSE took " << time << " s" << endl;
    cout << "SSE Result: " << FinalResult << endl;

    time = timestamp();
    for (int k = 0; k < nofTestCases; k++) {
        FinalResult = testAVX(lhs, ghs);
    }
    time = timestamp() - time;
    time3 = time;
    cout << "AVX took " << time << " s" << endl;
    cout << "AVX Result: " << FinalResult << endl;



    time = timestamp();
    for (int k = 0; k < nofTestCases; k++) {
        FinalResult = testNormal(lhs, ghs);
    }
    time = timestamp() - time;
    cout << "Normal took " << time << " s" << endl;
    cout << "Normal Result: " << FinalResult << endl;
    cout << "SpeedUP SSE= " << time / time1 << " s" << endl;
    cout << "SpeedUP AVX= " << time / time3 << " s" << endl;
    cout << "===========================" << endl;
    ghs.clear();
    lhs.clear();
}

哪里

inline double timestamp() {
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return double(tp.tv_sec) + tp.tv_usec / 1000000.;
}

和

typedef vector > aligned_vector;

是使用https://gist.github.com/donny-dont/1471329的AlignedAllocator的对齐向量

我有一个intel-i7 haswell 4771,以及最新的Ubuntu 14.04 64bit和gcc 4.8.2.一切都是最新的.我用-march = native -mtune = native -O3 -m64编译.

结果是:

SSE took 0.000375986 s
SSE Result: 1982689
AVX took 0.000459909 s
AVX Result: 1982689
Normal took 0.00315714 s
Normal Result: 1982689
SpeedUP SSE= 8.39696 s
SpeedUP AVX= 6.8647 s

这表明完全相同的代码在AVX2上比SSE慢22%.我做错了什么还是这种正常行为？

撰写答案

今天，你开发时遇到什么问题呢？

立即提问

热门标签