I wanted to bring to AltiVec the same SIMD optimizations that the Blender render engine already has for SSE (Intel).

The SSE code (Blender 2.5alpha0) is the following:

/*
 * Clips the interval [0, isec->labda] against three pairs of bounds, four
 * SIMD lanes at a time, and reports which lanes still have a non-empty
 * interval.
 *
 * bb_group : array of __m128 indexed through isec->bv_index[0..5]; even
 *            entries feed the lower bound, odd entries the upper bound
 *            (presumably one bound of four boxes per entry -- confirm
 *            against the caller).
 * isec     : supplies start[], idot_axis[], labda and bv_index[].
 *
 * Returns the 4-bit mask produced by _mm_movemask_ps: bit i is set when,
 * in lane i, the final upper bound is >= the final lower bound.
 */
inline int test_bb_group4(__m128 *bb_group, const Isect *isec)
{
        /* Running interval, identical in every lane to begin with. */
        __m128 t_near = _mm_setzero_ps();
        __m128 t_far  = _mm_load1_ps(&isec->labda);
        int axis;

        for (axis = 0; axis < 3; axis++) {
                /* Broadcast this axis' start and idot_axis values to all lanes. */
                const __m128 origin  = _mm_load1_ps(&isec->start[axis]);
                const __m128 inv_dir = _mm_load1_ps(&isec->idot_axis[axis]);

                const __m128 t_lo = _mm_mul_ps(_mm_sub_ps(bb_group[isec->bv_index[2 * axis]], origin), inv_dir);
                const __m128 t_hi = _mm_mul_ps(_mm_sub_ps(bb_group[isec->bv_index[2 * axis + 1]], origin), inv_dir);

                /* Accumulator stays the first operand so NaN handling is unchanged. */
                t_near = _mm_max_ps(t_near, t_lo);
                t_far  = _mm_min_ps(t_far, t_hi);
        }

        return _mm_movemask_ps(_mm_cmpge_ps(t_far, t_near));
}

And this is my first working Altivec version:

/*
 * AltiVec counterpart of the SSE test_bb_group4: clips an interval against
 * three pairs of bounds in four lanes at once and returns a 4-bit mask
 * (bit i set when lane i still has upper bound >= lower bound).
 *
 * bb_group : array of vectors indexed through isec->bv_index[0..5].
 * isec     : supplies start[], idot_axis[] and bv_index[].
 *
 * NOTE: isec->start and isec->idot_axis are read with vec_ld, which needs
 * 16-byte aligned addresses.
 * NOTE(review): the initial upper bound is element 3 of the vector loaded
 * from isec->start; this presumably relies on isec->labda being stored right
 * after start[2] -- verify against the Isect layout.
 */
inline int test_bb_group4(vector float *bb_group, const Isect *isec)
{
        int lanes[4] ALIGNED_16;

        /* Doubles as the initial lower bound and as the addend of vec_madd. */
        const vector float zero = (vector float) vec_splat_u32(0);

        const vector float origin  = vec_ld(0, &isec->start[0]);
        const vector float inv_dir = vec_ld(0, &isec->idot_axis[0]);

        /* Per-axis broadcasts of start[] and idot_axis[]. */
        const vector float origin_x = vec_splat(origin, 0);
        const vector float origin_y = vec_splat(origin, 1);
        const vector float origin_z = vec_splat(origin, 2);
        const vector float inv_x    = vec_splat(inv_dir, 0);
        const vector float inv_y    = vec_splat(inv_dir, 1);
        const vector float inv_z    = vec_splat(inv_dir, 2);

        /* Initial upper bound: element 3 of the start vector (see NOTE above). */
        const vector float far_0 = vec_splat(origin, 3);

        /* (bound - start) * idot_axis, one axis after the other. */
        const vector float near_x   = vec_max(zero,    vec_madd(vec_sub(bb_group[isec->bv_index[0]], origin_x), inv_x, zero));
        const vector float far_x    = vec_min(far_0,   vec_madd(vec_sub(bb_group[isec->bv_index[1]], origin_x), inv_x, zero));
        const vector float near_xy  = vec_max(near_x,  vec_madd(vec_sub(bb_group[isec->bv_index[2]], origin_y), inv_y, zero));
        const vector float far_xy   = vec_min(far_x,   vec_madd(vec_sub(bb_group[isec->bv_index[3]], origin_y), inv_y, zero));
        const vector float near_xyz = vec_max(near_xy, vec_madd(vec_sub(bb_group[isec->bv_index[4]], origin_z), inv_z, zero));
        const vector float far_xyz  = vec_min(far_xy,  vec_madd(vec_sub(bb_group[isec->bv_index[5]], origin_z), inv_z, zero));

        /*
         * Emulate _mm_movemask_ps: keep one distinct bit per passing lane,
         * then let vec_sums add the four lanes into element 3.
         */
        const vector unsigned int lane_bits = (vector unsigned int){0x1, 0x2, 0x4, 0x8};
        const vector unsigned int hit_bits  = vec_and(vec_cmpge(far_xyz, near_xyz), lane_bits);
        const vector signed int   packed    = vec_sums((vector signed int)hit_bits, vec_splat_s32(0));

        /* Go through memory to move the result into a scalar register. */
        vec_st(packed, 0, (signed int *)&lanes[0]);
        return lanes[3];
}

To use it, we must also take care to align the following variables on 16 bytes:

  • isec->start
  • isec->idot_axis
  • isec->labda

Unfortunately, the expected speed-up did not show up :-(
I only gained 10% in render time on the scene from the benchmark .blender file here

Today I tried to optimize further by changing how I emulate the SSE predicate _mm_movemask_ps in AltiVec... which is a bit complex. But I also need to change how this function is used! Packing 4 unsigned ints into 4 bits is a bit... overkill.

To be continued ;-)