I wanted to port to AltiVec the SIMD optimizations that the Blender render
engine already has for SSE (Intel).
The SSE code (Blender 2.5 alpha 0) is the following:
/*
 * Four-lane interval test (SSE).
 *
 * For each axis a (0..2), two vectors are picked from bb_group through
 * isec->bv_index[2a] and isec->bv_index[2a+1]; each is turned into a
 * parametric value as (bb - isec->start[a]) * isec->idot_axis[a].
 * The even-indexed values are folded into a running maximum that starts at 0,
 * the odd-indexed ones into a running minimum that starts at isec->labda.
 *
 * Returns a 4-bit mask: bit i is set when, in lane i, the final minimum is
 * greater than or equal to the final maximum.
 *
 * NOTE(review): this reads like a ray/bounding-box slab test over four boxes
 * at once (start = ray origin, idot_axis = reciprocal direction, labda = max
 * distance) -- confirm against the Isect definition and the callers.
 */
inline int test_bb_group4(__m128 *bb_group, const Isect *isec)
{
	/* Broadcast each scalar across all four lanes once, up front. */
	const __m128 start_x = _mm_load1_ps(&isec->start[0]);
	const __m128 start_y = _mm_load1_ps(&isec->start[1]);
	const __m128 start_z = _mm_load1_ps(&isec->start[2]);

	const __m128 idot_x = _mm_load1_ps(&isec->idot_axis[0]);
	const __m128 idot_y = _mm_load1_ps(&isec->idot_axis[1]);
	const __m128 idot_z = _mm_load1_ps(&isec->idot_axis[2]);

	/* Parametric value for each of the six vectors selected by bv_index. */
	const __m128 t_x0 = _mm_mul_ps(_mm_sub_ps(bb_group[isec->bv_index[0]], start_x), idot_x);
	const __m128 t_x1 = _mm_mul_ps(_mm_sub_ps(bb_group[isec->bv_index[1]], start_x), idot_x);
	const __m128 t_y0 = _mm_mul_ps(_mm_sub_ps(bb_group[isec->bv_index[2]], start_y), idot_y);
	const __m128 t_y1 = _mm_mul_ps(_mm_sub_ps(bb_group[isec->bv_index[3]], start_y), idot_y);
	const __m128 t_z0 = _mm_mul_ps(_mm_sub_ps(bb_group[isec->bv_index[4]], start_z), idot_z);
	const __m128 t_z1 = _mm_mul_ps(_mm_sub_ps(bb_group[isec->bv_index[5]], start_z), idot_z);

	/* Running interval [t_lo, t_hi], initially [0, labda]. */
	__m128 t_lo = _mm_setzero_ps();
	__m128 t_hi = _mm_load1_ps(&isec->labda);

	/* Operand order (accumulator first) is kept as in the reference code. */
	t_lo = _mm_max_ps(t_lo, t_x0);
	t_hi = _mm_min_ps(t_hi, t_x1);
	t_lo = _mm_max_ps(t_lo, t_y0);
	t_hi = _mm_min_ps(t_hi, t_y1);
	t_lo = _mm_max_ps(t_lo, t_z0);
	t_hi = _mm_min_ps(t_hi, t_z1);

	/* Collapse the per-lane comparison result into the low four bits. */
	const __m128 lanes_ok = _mm_cmpge_ps(t_hi, t_lo);
	return _mm_movemask_ps(lanes_ok);
}
And this is my first working Altivec version:
/*
 * Four-lane interval test (AltiVec).
 *
 * For each axis a (0..2), two vectors are picked from bb_group through
 * isec->bv_index[2a] and isec->bv_index[2a+1]; each is turned into a
 * parametric value as (bb - start[a]) * idot_axis[a] + 0.
 * The even-indexed values are folded into a running maximum that starts at 0,
 * the odd-indexed ones into a running minimum that starts at element 3 of the
 * 16-byte vector loaded from &isec->start[0].
 *
 * Returns a 4-bit mask: bit i is set when, in lane i, the final minimum is
 * greater than or equal to the final maximum.
 *
 * NOTE(review): element 3 of the start vector is whatever float follows
 * start[2] in memory -- presumably isec->labda; confirm against the Isect
 * layout. vec_ld also requires isec->start and isec->idot_axis to be
 * 16-byte aligned.
 */
inline int test_bb_group4(vector float *bb_group, const Isect *isec)
{
	const vector float zero = (vector float) vec_splat_u32(0);

	/* One aligned load per array; individual components are splatted below. */
	const vector float start_v = vec_ld(0, &isec->start[0]);
	const vector float idot_v = vec_ld(0, &isec->idot_axis[0]);

	const vector float start_x = vec_splat(start_v, 0);
	const vector float start_y = vec_splat(start_v, 1);
	const vector float start_z = vec_splat(start_v, 2);

	const vector float idot_x = vec_splat(idot_v, 0);
	const vector float idot_y = vec_splat(idot_v, 1);
	const vector float idot_z = vec_splat(idot_v, 2);

	/* Multiply through vec_madd with a zero addend. */
	const vector float t_x0 = vec_madd(vec_sub(bb_group[isec->bv_index[0]], start_x), idot_x, zero);
	const vector float t_x1 = vec_madd(vec_sub(bb_group[isec->bv_index[1]], start_x), idot_x, zero);
	const vector float t_y0 = vec_madd(vec_sub(bb_group[isec->bv_index[2]], start_y), idot_y, zero);
	const vector float t_y1 = vec_madd(vec_sub(bb_group[isec->bv_index[3]], start_y), idot_y, zero);
	const vector float t_z0 = vec_madd(vec_sub(bb_group[isec->bv_index[4]], start_z), idot_z, zero);
	const vector float t_z1 = vec_madd(vec_sub(bb_group[isec->bv_index[5]], start_z), idot_z, zero);

	/* Running interval [t_lo, t_hi]. */
	vector float t_lo = zero;
	vector float t_hi = vec_splat(start_v, 3);

	t_lo = vec_max(t_lo, t_x0);
	t_hi = vec_min(t_hi, t_x1);
	t_lo = vec_max(t_lo, t_y0);
	t_hi = vec_min(t_hi, t_y1);
	t_lo = vec_max(t_lo, t_z0);
	t_hi = vec_min(t_hi, t_z1);

	/*
	 * Emulate _mm_movemask_ps: keep one distinct bit per passing lane, then
	 * sum the four lanes so the combined mask lands in element 3.
	 */
	const vector unsigned int lane_bit = (vector unsigned int){0x1, 0x2, 0x4, 0x8};
	const vector unsigned int lane_hits = vec_and(vec_cmpge(t_hi, t_lo), lane_bit);
	const vector signed int packed = vec_sums((vector signed int)lane_hits, vec_splat_s32(0));

	/* Spill the vector to aligned memory to read the scalar result. */
	int out[4] ALIGNED_16;
	vec_st(packed, 0, (signed int *)&out[0]);
	return out[3];
}
To use it, we must also take care to align the following variables on
16 bytes:
isec->start
isec->idot_axis
isec->labda
Unfortunately, the speed-up was not what I hoped for.
I only got a 10% gain in render time on the scene from the benchmark .blender file
here
Today I tried to optimize further by changing how I emulate the
SSE intrinsic _mm_movemask_ps in AltiVec... it is a bit complex. But I
also need to change how this function is used! Packing 4 unsigned ints into 4
bits is a bit ... overkill.
To be continued