This is a blog for MUDA development. MUDA is a (short) vector language for CPUs.

Thursday, April 10, 2008

Initial tryout on double x 4 in MUDA.

To prepare for 256-bit SIMD (double x 4) in Intel's AVX (a feature of Intel's future CPUs), I'm trying to implement a double x 4 feature in MUDA.

New type dvec is introduced for MUDA, which represents double x 4.

dvec bora_func(dvec a)
return a * a * a;

First, I wrote an SSE backend which translates dvec-typed expressions in almost the same manner as is done for the vec (float x 4) type.

$ mudah > bora.c
dvec bora (const double * a)
const __muda_m256 t_dvec2 = (*((__muda_m256 *)(a))) ;
const __muda_m256 t_dvec1 = (*((__muda_m256 *)(a))) ;
const __muda_m256 t_dvec3 = _muda_mul_4d( t_dvec2 , t_dvec1 ) ;
const __muda_m256 t_dvec4 = (*((__muda_m256 *)(a))) ;
const __muda_m256 t_dvec5 = _muda_mul_4d( t_dvec3 , t_dvec4 ) ;
return t_dvec5 ;

Here __muda_m256 is a simple wrapper type and _muda_mul_4d is a simple wrapper C function; together they emulate 256-bit SIMD on current 128-bit SIMD machines, as defined below.

typedef union {
struct { __m128d v[2]; };
double f[4];
} __muda_m256 __attribute__((aligned(16)));

static inline __muda_m256 _muda_mul_4d(__muda_m256 a, __muda_m256 b)
__muda_m256 ret;

ret.v[0] = _mm_mul_pd(a.v[0], b.v[0]);
ret.v[1] = _mm_mul_pd(a.v[1], b.v[1]);

return ret;

But the gcc compiler translates this code into the following unoptimized assembly.

$ gcc -msse2 -O3 -c bora.c
$ otool -v -t bora.o
00000000 pushl %ebp
00000001 movl %esp,%ebp
00000003 pushl %edi
00000004 pushl %esi
00000005 subl $0x00000150,%esp
0000000b movl 0x0c(%ebp),%eax
0000000e movl (%eax),%edx
00000010 movl %edx,0xfffffed4(%ebp)
00000111 movl 0xfffffec4(%ebp),%eax
00000117 mulpd 0xffffff38(%ebp),%xmm0
0000011f movapd %xmm0,0xffffff18(%ebp)
00000127 movapd 0xffffff08(%ebp),%xmm0
(total 170 instructions)

Doh! lots of mov*!

Even when using the latest llvm-gcc (llvm-gcc4.2-2.2-x86-darwin8), some redundant mov instructions still remain in the output.

00000000 pushl %ebp
00000001 movl %esp,%ebp
00000003 subl $0x000000e8,%esp
00000009 movl 0x0c(%ebp),%eax
0000000c movapd 0x10(%eax),%xmm0
00000011 movapd (%eax),%xmm1
00000015 movapd %xmm0,0xffffff68(%ebp)
0000001d movapd %xmm1,0xffffff58(%ebp)
00000025 movapd %xmm0,0xffffff48(%ebp)
0000002d movapd %xmm1,0xffffff38(%ebp)
00000035 movapd 0xffffff58(%ebp),%xmm2
0000003d mulpd 0xffffff38(%ebp),%xmm2
00000045 movapd %xmm2,0xffffff78(%ebp)
0000004d movapd 0xffffff68(%ebp),%xmm2
00000055 mulpd 0xffffff48(%ebp),%xmm2
0000005d movapd %xmm2,0x88(%ebp)
00000062 movapd 0xffffff78(%ebp),%xmm3
0000006a movapd %xmm2,0xc8(%ebp)
0000006f movapd %xmm3,0xb8(%ebp)
00000074 movapd %xmm0,0xa8(%ebp)
00000079 movapd %xmm1,0x98(%ebp)
0000007e movapd 0xb8(%ebp),%xmm0
00000083 mulpd 0x98(%ebp),%xmm0
00000088 movapd %xmm0,0xd8(%ebp)
0000008d movapd 0xc8(%ebp),%xmm0
00000092 mulpd 0xa8(%ebp),%xmm0
00000097 movapd %xmm0,0xe8(%ebp)
0000009c movapd 0xd8(%ebp),%xmm1
000000a1 movapd %xmm0,0xffffff28(%ebp)
000000a9 movapd %xmm1,0xffffff18(%ebp)
000000b1 movl 0x08(%ebp),%eax
000000b4 movapd 0xffffff28(%ebp),%xmm0
000000bc movapd 0xffffff18(%ebp),%xmm1
000000c4 movapd %xmm1,(%eax)
000000c8 movapd %xmm0,0x10(%eax)
000000cd addl $0x000000e8,%esp
000000d3 popl %ebp
000000d4 ret $0x0004
(38 instructions)

I also got almost the same result from Intel's icc compiler.
It seems that this code is difficult for a C compiler to optimize.
I think I have to translate MUDA code into C code in a much flatter manner, without using any macros or inlined wrapper functions
(i.e. directly emit two _mm_mul_pd() calls for a dvec-typed multiply).

How about LLVM IR?

Then, I also added initial double x 4 support for LLVM backend of MUDA.

LLVM IR version.

$ mudah --llvm
define <4xdouble> @bora (<4xdouble> %a)
%a.addr = alloca <4xdouble> ;
store <4xdouble> %a, <4xdouble>* %a.addr ;
%t_dvec2 = load <4xdouble>* %a.addr ;

%t_dvec1 = load <4xdouble>* %a.addr ;

%t_dvec3 = mul <4xdouble> %t_dvec2 , %t_dvec1 ;
%t_dvec4 = load <4xdouble>* %a.addr ;

%t_dvec5 = mul <4xdouble> %t_dvec3 , %t_dvec4 ;
ret <4xdouble> %t_dvec5 ;

$ llvm-as bora.ll -f
$ llc bora.bc -f
$ cat bora.s

subl $44, %esp
movapd %xmm0, (%esp)
movapd %xmm1, 16(%esp)
movaps %xmm1, %xmm2
mulpd %xmm2, %xmm2
mulpd %xmm2, %xmm1
movaps %xmm0, %xmm2
mulpd %xmm2, %xmm2
mulpd %xmm2, %xmm0
addl $44, %esp
(11 instructions)

The output assembly is almost optimal!
The LLVM infrastructure does a good job when we use vector expressions!


RPG said...

How about checking out something like Eigen

instead of doing all this superfluous work?

Disclaimer: I have contributed to Eigen.

syoyo said...

Eigen is

- C++ library. Not a language.
- LGPL-licensed, so it might cause licensing problems for closed-source projects.

Thus, Eigen doesn't match what I am trying to do in MUDA.

Anyway, I am suspending development of MUDA for a while because good alternatives for SIMD programming have recently appeared: OpenCL and LLVM/clang.

mohan said...

Hi.. how's the development of lucille going on? I couldn't comment on your blog even after registering.. Eagerly waiting..


syoyo said...

Hello mohan,

We are now working on a commercial version of lucille. We are planning to show it at SIGGRAPH 2010.


My life to be a renderer writer & quant. Here is my main blog.