I try to use temporary variables p1aux, p2aux, and psumaux because the Reduction clause cannot be used with pointers or intrinsic functions. But as I said the result is incorrect.
When I say "result is not correct", I would say: The calculations of array1 + array2 are stored in the matrix sumsse. The calculations are correct until the component sumsse[50] [50] [50], more or less, but the other components of the matrix are always 0.
Do You have any idea what might happen?.
If anyone can help me, thank you very much.
Expand|Select|Wrap|Line Numbers
- #include <stdio.h>
- #ifdef __SSE2__
- #include <emmintrin.h>
- #endif
- #include <omp.h>
- #include <stdlib.h>
- #include <math.h>
- double __attribute__((aligned(16))) array1[256][256][256],array2[256][256][256];
- double __attribute__((aligned(16))) sumsse[256][256][256];
- int main ()
- {
- int k,j,i,dim;
- dim=256;
- __m128d *p1= (__m128d*) &array1[0][0][0];
- __m128d *p2= (__m128d*) &array2[0][0][0];
- __m128d *psum= (__m128d*) &sumsse[0][0][0];
- __m128d *p1aux= (__m128d*) &array1[0][0][0];
- __m128d *p2aux= (__m128d*) &array2[0][0][0];
- __m128d *psumaux= (__m128d*) &sumsse[0][0][0];
- int nthreads =(8);
- omp_set_num_threads(nthreads);
- //initializa array2
- for(k = 0; k < dim; ++k){
- for(j = 0; j < dim; ++j){
- for(i = 0; i < dim; ++i){
- array2[k][j][i] = 1.0;
- }
- }
- }
- //initialize array1
- for(k = 0; k < dim; ++k){
- for(j = 0; j < dim; ++j){
- for(i = 0; i < dim; ++i){
- array1[k][j][i] = i + j + k;
- }
- }
- }
- //initialize array sumsse
- for(k = 0; k < dim; ++k){
- for(j = 0; j < dim; ++j){
- for(i = 0; i < dim; ++i){
- sumsse[k][j][i] = 0.0;
- }
- }
- }
- //add array1 + array2 with sse
- #pragma omp parallel firstprivate(p1,p2,p1aux,p2aux)
- {
- for(k = 0; k < dim; ++k){
- for(j = 0; j < dim; ++j){
- #pragma omp for private(i)
- for(i = 0; i < dim; i += 2){
- *psum = _mm_add_pd(*p1,*p2);
- psum = psumaux + 1;
- p1 = p1aux+1;
- p2 = p2aux+1;
- psumaux = psum;
- p1aux = p1;
- p2aux = p2;
- }
- }
- }
- }// end parallel
- //show some datas
- printf("Elementosse=10 sumsse=%f",sumsse[10][10][10]);
- printf("\n");
- printf("Elementosse=50 sumsse=%f",sumsse[50][50][50]);
- printf("Elementosse=100 sumsse=%f",sumsse[100][100][100]);
- printf("\n");
- printf("Elementosse=120 sumsse=%f",sumsse[120][120][120]);
- printf("\n");
- printf("Elementosse=200 sumsse=%f",sumsse[200][200][200]);
- printf("\n");
- return 0;
- }