To prepare for the 256-bit SIMD (double x 4) of Intel's AVX (a future Intel CPU feature), I'm trying to implement a double x 4 feature in MUDA.
A new type, dvec, which represents double x 4, has been introduced in MUDA.
// input.mu
dvec bora_func(dvec a)
{
return a * a * a;
}
First, I wrote an SSE backend which translates dvec-typed expressions in almost the same manner as is done for the vec (float x 4) type.
$ mudah input.mu > bora.c
dvec bora (const double * a)
{
const __muda_m256 t_dvec2 = (*((__muda_m256 *)(a))) ;
const __muda_m256 t_dvec1 = (*((__muda_m256 *)(a))) ;
const __muda_m256 t_dvec3 = _muda_mul_4d( t_dvec2 , t_dvec1 ) ;
const __muda_m256 t_dvec4 = (*((__muda_m256 *)(a))) ;
const __muda_m256 t_dvec5 = _muda_mul_4d( t_dvec3 , t_dvec4 ) ;
return t_dvec5 ;
}
Here __muda_m256 and _muda_mul_4d are a simple wrapper type and a wrapper C function which emulate 256-bit SIMD on a current 128-bit SIMD machine, defined as follows.
typedef union {
struct { __m128d v[2]; };
double f[4];
} __muda_m256 __attribute__((aligned(16)));
static inline __muda_m256 _muda_mul_4d(__muda_m256 a, __muda_m256 b)
{
__muda_m256 ret;
ret.v[0] = _mm_mul_pd(a.v[0], b.v[0]);
ret.v[1] = _mm_mul_pd(a.v[1], b.v[1]);
return ret;
}
But the gcc compiler translates this code into the following unoptimized assembly.
$ gcc -msse2 -O3 -c bora.c
$ otool -v -t bora.o
_bora:
00000000 pushl %ebp
00000001 movl %esp,%ebp
00000003 pushl %edi
00000004 pushl %esi
00000005 subl $0x00000150,%esp
0000000b movl 0x0c(%ebp),%eax
0000000e movl (%eax),%edx
00000010 movl %edx,0xfffffed4(%ebp)
...
00000111 movl 0xfffffec4(%ebp),%eax
00000117 mulpd 0xffffff38(%ebp),%xmm0
0000011f movapd %xmm0,0xffffff18(%ebp)
00000127 movapd 0xffffff08(%ebp),%xmm0
...
(total 170 instructions)
Doh! lots of mov*!
Even when using the latest llvm-gcc (llvm-gcc4.2-2.2-x86-darwin8), some redundant mov instructions still remain in the output.
_bora:
00000000 pushl %ebp
00000001 movl %esp,%ebp
00000003 subl $0x000000e8,%esp
00000009 movl 0x0c(%ebp),%eax
0000000c movapd 0x10(%eax),%xmm0
00000011 movapd (%eax),%xmm1
00000015 movapd %xmm0,0xffffff68(%ebp)
0000001d movapd %xmm1,0xffffff58(%ebp)
00000025 movapd %xmm0,0xffffff48(%ebp)
0000002d movapd %xmm1,0xffffff38(%ebp)
00000035 movapd 0xffffff58(%ebp),%xmm2
0000003d mulpd 0xffffff38(%ebp),%xmm2
00000045 movapd %xmm2,0xffffff78(%ebp)
0000004d movapd 0xffffff68(%ebp),%xmm2
00000055 mulpd 0xffffff48(%ebp),%xmm2
0000005d movapd %xmm2,0x88(%ebp)
00000062 movapd 0xffffff78(%ebp),%xmm3
0000006a movapd %xmm2,0xc8(%ebp)
0000006f movapd %xmm3,0xb8(%ebp)
00000074 movapd %xmm0,0xa8(%ebp)
00000079 movapd %xmm1,0x98(%ebp)
0000007e movapd 0xb8(%ebp),%xmm0
00000083 mulpd 0x98(%ebp),%xmm0
00000088 movapd %xmm0,0xd8(%ebp)
0000008d movapd 0xc8(%ebp),%xmm0
00000092 mulpd 0xa8(%ebp),%xmm0
00000097 movapd %xmm0,0xe8(%ebp)
0000009c movapd 0xd8(%ebp),%xmm1
000000a1 movapd %xmm0,0xffffff28(%ebp)
000000a9 movapd %xmm1,0xffffff18(%ebp)
000000b1 movl 0x08(%ebp),%eax
000000b4 movapd 0xffffff28(%ebp),%xmm0
000000bc movapd 0xffffff18(%ebp),%xmm1
000000c4 movapd %xmm1,(%eax)
000000c8 movapd %xmm0,0x10(%eax)
000000cd addl $0x000000e8,%esp
000000d3 popl %ebp
000000d4 ret $0x0004
(38 instructions)
I also got almost the same result from Intel's icc compiler.
It seems that this code is difficult for a C compiler to optimize.
I think I have to translate MUDA code into C code in a much flatter manner, without using any macros or inlined wrapper functions
(i.e. directly emit two _mm_mul_pd() calls for a dvec-typed multiply).
How about LLVM IR?
Next, I also added initial double x 4 support to the LLVM IR version.
$ mudah --llvm input.mu
define <4xdouble> @bora (<4xdouble> %a)
{
%a.addr = alloca <4xdouble> ;
store <4xdouble> %a, <4xdouble>* %a.addr ;
%t_dvec2 = load <4xdouble>* %a.addr ;
%t_dvec1 = load <4xdouble>* %a.addr ;
%t_dvec3 = mul <4xdouble> %t_dvec2 , %t_dvec1 ;
%t_dvec4 = load <4xdouble>* %a.addr ;
%t_dvec5 = mul <4xdouble> %t_dvec3 , %t_dvec4 ;
ret <4xdouble> %t_dvec5 ;
}
$ llvm-as bora.ll -f
$ llc bora.bc -f
$ cat bora.s
_bora:
Leh_func_begin3:
Llabel3:
subl $44, %esp
movapd %xmm0, (%esp)
movapd %xmm1, 16(%esp)
movaps %xmm1, %xmm2
mulpd %xmm2, %xmm2
mulpd %xmm2, %xmm1
movaps %xmm0, %xmm2
mulpd %xmm2, %xmm2
mulpd %xmm2, %xmm0
addl $44, %esp
ret
(11 instructions)
The output assembly is almost fully optimized!
The LLVM infrastructure does a good job when we use vector expressions!