This
page contains dco's optimization results for the double
precision convolution
optimizing code generated by the gcc version
4.2.2 on x86-64 and IA-32
systems. Variations
of the double precision
convolution are found
in many DSP
kernels.
For each instance of the <code
to
be optimized> the
following program was created: <initialization>
startTime = start_timer(); for ( count = 0; count < NUMBER_OF_ITERATIONS; count++ ) { execute <code to be optimized> }
endTime = end_timer(); <report the execution time 'endTime - startTime'>
where:
- <initialization>
- initializes the data utilized by the <code
to be
optimized>
- start_timer(),
end_timer() - routines that
return an accurate ( double precision
) value of the system clock
- NUMBER_OF_ITERATIONS
depends on the <code
to be optimized> and
is chosen to smooth the inaccuracies of the
system clock and to produce a reasonably large and accurate total
execution
time
Read this
to understand how the benchmarks were executed.
Convolution
results for x86-64
64-bit code The gcc version 4.2.2 compiler, used to process
the benchmark, was invoked with the
following command line options:
-S -O3
-fomit-frame-pointer -funroll-all-loops -ffast-math -march=nocona
-mfpmath=sse -msse3
On
the
64-bit Linux operating system
running on the 2.66GHz Core2 computer we tested the double precision
convolution on arrays of 1000 points. NUMBER_OF_ITERATIONS was
set to 10000. gcc
( version 4.2.2 )
generated code ran for
2.9106 seconds, dco
( version 1.1.0 ) optimized code ran for 1.7927 seconds, thus a 38% improvement.
double *ap, *bp, tmp1, tmp2, sum; int i;
sum = 0.; for ( i = 0; i < ARRAY_DIM; i++ ) { tmp1 = *(ap += I); tmp2 = *(bp += J); sum = sum + tmp1 * tmp2; }
|
movq %rdx,%r10 andq $15,%r10 jne ___dcox86_wl_3_ ___dcox86_wl_5_: movsd b(,%rax,8),%xmm8 addq $10,%rax movhpd b+-72(,%rax,8),%xmm8 subq $1,%rbx movsd b+-64(,%rax,8),%xmm7 movhpd b+-56(,%rax,8),%xmm7 movsd b+-48(,%rax,8),%xmm12 movhpd b+-40(,%rax,8),%xmm12 movsd b+-32(,%rax,8),%xmm13 movhpd b+-24(,%rax,8),%xmm13 movsd b+-16(,%rax,8),%xmm10 movhpd b+-8(,%rax,8),%xmm10 mulpd -80(%rdx,%rax,8),%xmm8 mulpd -64(%rdx,%rax,8),%xmm7 mulpd -48(%rdx,%rax,8),%xmm12 mulpd -32(%rdx,%rax,8),%xmm13 mulpd -16(%rdx,%rax,8),%xmm10 addpd %xmm8,%xmm1 addpd %xmm7,%xmm4 addpd %xmm12,%xmm5 addpd %xmm13,%xmm11 addpd %xmm10,%xmm6 jg ___dcox86_wl_5_ jmp ___dcox86_cc_7_ ___dcox86_wl_3_: movsd (%rdx,%rax,8),%xmm9 addq $10,%rax movhpd -72(%rdx,%rax,8),%xmm9 subq $1,%rbx movsd b+-80(,%rax,8),%xmm13 movhpd b+-72(,%rax,8),%xmm13 movsd -64(%rdx,%rax,8),%xmm8 movhpd -56(%rdx,%rax,8),%xmm8 movsd b+-64(,%rax,8),%xmm7 movhpd b+-56(,%rax,8),%xmm7 movsd -48(%rdx,%rax,8),%xmm15 movhpd -40(%rdx,%rax,8),%xmm15 movsd b+-48(,%rax,8),%xmm14 movhpd b+-40(,%rax,8),%xmm14 movsd -32(%rdx,%rax,8),%xmm2 movhpd -24(%rdx,%rax,8),%xmm2 mulpd %xmm13,%xmm9 movsd b+-32(,%rax,8),%xmm13 movhpd b+-24(,%rax,8),%xmm13 movsd -16(%rdx,%rax,8),%xmm12 movhpd -8(%rdx,%rax,8),%xmm12 mulpd %xmm7,%xmm8 movsd b+-16(,%rax,8),%xmm10 addpd %xmm9,%xmm1 movhpd b+-8(,%rax,8),%xmm10 mulpd %xmm14,%xmm15 addpd %xmm8,%xmm4 mulpd %xmm13,%xmm2 addpd %xmm15,%xmm5 mulpd %xmm10,%xmm12 addpd %xmm2,%xmm11 addpd %xmm12,%xmm6 jg ___dcox86_wl_3_ ___dcox86_cc_7_:
|
.L50: movsd (%rdx,%rax,8), %xmm13 mulsd b(,%rax,8), %xmm13 addsd %xmm13, %xmm1 movsd 8(%rdx,%rax,8), %xmm12 mulsd b+8(,%rax,8), %xmm12 addsd %xmm12, %xmm1 movsd 16(%rdx,%rax,8), %xmm11 mulsd b+16(,%rax,8), %xmm11 addsd %xmm11, %xmm1 movsd 24(%rdx,%rax,8), %xmm10 mulsd b+24(,%rax,8), %xmm10 addsd %xmm10, %xmm1 movsd 32(%rdx,%rax,8), %xmm9 mulsd b+32(,%rax,8), %xmm9 addsd %xmm9, %xmm1 movsd 40(%rdx,%rax,8), %xmm8 mulsd b+40(,%rax,8), %xmm8 addsd %xmm8, %xmm1 movsd 48(%rdx,%rax,8), %xmm7 mulsd b+48(,%rax,8), %xmm7 addsd %xmm7, %xmm1 movsd 56(%rdx,%rax,8), %xmm6 mulsd b+56(,%rax,8), %xmm6 addsd %xmm6, %xmm1 movsd 64(%rdx,%rax,8), %xmm5 mulsd b+64(,%rax,8), %xmm5 addsd %xmm5, %xmm1 movsd 72(%rdx,%rax,8), %xmm4 mulsd b+72(,%rax,8), %xmm4 addsd %xmm4, %xmm1 addq $10, %rax cmpq $512, %rax jne .L50
| See this for more optimization results showing how to achieve a 54% improvement of the 64-bit x86-64 code for the double precision convolution.
Convolution results
for IA-32 32-bit code The gcc version 4.3.0 compiler, used to process
the benchmark, was invoked with the
following command line options:
-S -O3
-fomit-frame-pointer -funroll-all-loops -ffast-math -march=pentium4
-mfpmath=sse -msse2
On the
32-bit Linux operating system
running on the 2.8GHz Pentium4 computer we tested the double precision
convolution on arrays of 512 points. NUMBER_OF_ITERATIONS was
set to 10000. gcc
( version 4.3.0 )
generated code ran for 0.5619 seconds, dco
( version 1.1.1 ) optimized code ran for 0.4209 seconds, thus a 25% improvement.
double *ap, *bp, tmp1, tmp2, sum; int i;
sum = 0.; for ( i = 0; i < ARRAY_DIM; i++ ) { tmp1 = *(ap += I); tmp2 = *(bp += J); sum = sum + tmp1 * tmp2; }
|
movl %edx,%ebx pxor %xmm1,%xmm1 andl $15,%ebx pxor %xmm0,%xmm0 pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 pxor %xmm6,%xmm6 jne .L43 ___dcox86_wl_2_: movapd (%edx,%eax),%xmm7 mulpd b(%eax),%xmm7 addpd %xmm7,%xmm2 movapd 16(%edx,%eax),%xmm7 mulpd b+16(%eax),%xmm7 addpd %xmm7,%xmm1 movapd 32(%edx,%eax),%xmm7 mulpd b+32(%eax),%xmm7 addpd %xmm7,%xmm0 movapd 48(%edx,%eax),%xmm7 mulpd b+48(%eax),%xmm7 addpd %xmm7,%xmm4 movapd 64(%edx,%eax),%xmm7 mulpd b+64(%eax),%xmm7 addpd %xmm7,%xmm3 movapd 80(%edx,%eax),%xmm7 mulpd b+80(%eax),%xmm7 addpd %xmm7,%xmm5 movapd 96(%edx,%eax),%xmm7 mulpd b+96(%eax),%xmm7 addpd %xmm7,%xmm6 movapd 112(%edx,%eax),%xmm7 subl $-128,%eax mulpd b+-16(%eax),%xmm7 cmpl $2048,%eax addpd %xmm7,%xmm4 jne ___dcox86_wl_2_ jmp ___dcox86_cc_4_ .L43: movsd (%edx,%eax),%xmm7 movhpd 8(%edx,%eax),%xmm7 mulpd b(%eax),%xmm7 addpd %xmm7,%xmm2 movsd 16(%edx,%eax),%xmm7 movhpd 24(%edx,%eax),%xmm7 mulpd b+16(%eax),%xmm7 addpd %xmm7,%xmm1 movsd 32(%edx,%eax),%xmm7 movhpd 40(%edx,%eax),%xmm7 mulpd b+32(%eax),%xmm7 addpd %xmm7,%xmm0 movsd 48(%edx,%eax),%xmm7 movhpd 56(%edx,%eax),%xmm7 mulpd b+48(%eax),%xmm7 addpd %xmm7,%xmm4 movsd 64(%edx,%eax),%xmm7 movhpd 72(%edx,%eax),%xmm7 mulpd b+64(%eax),%xmm7 addpd %xmm7,%xmm3 movsd 80(%edx,%eax),%xmm7 movhpd 88(%edx,%eax),%xmm7 mulpd b+80(%eax),%xmm7 addpd %xmm7,%xmm5 movsd 96(%edx,%eax),%xmm7 movhpd 104(%edx,%eax),%xmm7 mulpd b+96(%eax),%xmm7 addpd %xmm7,%xmm6 movsd 112(%edx,%eax),%xmm7 movhpd 120(%edx,%eax),%xmm7 subl $-128,%eax cmpl $2048,%eax mulpd b+-16(%eax),%xmm7 addpd %xmm7,%xmm4 jne .L43 ___dcox86_cc_4_: addpd %xmm6,%xmm0 addpd %xmm5,%xmm1 addpd %xmm3,%xmm2 addpd %xmm4,%xmm1 addpd %xmm0,%xmm2 addpd %xmm1,%xmm2
|
.L43: movsd (%edx,%eax), %xmm1 movhpd 8(%edx,%eax), %xmm1 movapd %xmm1, %xmm7 mulpd b(%eax), %xmm7 addpd %xmm7, %xmm2 movsd 16(%edx,%eax), %xmm6 movhpd 24(%edx,%eax), %xmm6 movapd %xmm6, %xmm5 mulpd b+16(%eax), %xmm5 addpd %xmm5, %xmm2 movsd 32(%edx,%eax), %xmm0 movhpd 40(%edx,%eax), %xmm0 movapd %xmm0, %xmm3 mulpd b+32(%eax), %xmm3 addpd %xmm3, %xmm2 movsd 48(%edx,%eax), %xmm4 movhpd 56(%edx,%eax), %xmm4 movapd %xmm4, %xmm7 mulpd b+48(%eax), %xmm7 addpd %xmm7, %xmm2 movsd 64(%edx,%eax), %xmm1 movhpd 72(%edx,%eax), %xmm1 movapd %xmm1, %xmm6 mulpd b+64(%eax), %xmm6 addpd %xmm6, %xmm2 movsd 80(%edx,%eax), %xmm5 movhpd 88(%edx,%eax), %xmm5 movapd %xmm5, %xmm0 mulpd b+80(%eax), %xmm0 addpd %xmm0, %xmm2 movsd 96(%edx,%eax), %xmm3 movhpd 104(%edx,%eax), %xmm3 movapd %xmm3, %xmm4 mulpd b+96(%eax), %xmm4 addpd %xmm4, %xmm2 movsd 112(%edx,%eax), %xmm7 movhpd 120(%edx,%eax), %xmm7 movapd %xmm7, %xmm6 mulpd b+112(%eax), %xmm6 addpd %xmm6, %xmm2 subl $-128, %eax cmpl $2048, %eax jne .L43
| Note
the use of the SIMD instructions in the compiler generated
code ( e.g. mulpd
). Note also the use
of conditionally generated aligned memory accesses by the dco
generated code ( loop at ___dcox86_wl_2_
).
|