This page presents dco's optimization results for double precision convolution code generated by gcc on x86-64 and IA-32 systems. Variations of the double precision convolution are found in many DSP kernels.

Preparing the benchmark

For each instance of the <code to be optimized> the following benchmark program was created:
<initialization>
startTime = start_timer();
for ( count = 0; count < NUMBER_OF_ITERATIONS; count++ )
{
    execute <code to be optimized>
}

endTime = end_timer();
<report the execution time 'endTime - startTime'>
Read this to understand how the benchmarks were executed.
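
For illustration, a minimal self-contained version of this harness in C might look as follows; now_seconds() is a hypothetical stand-in for the start_timer()/end_timer() pair, and the real benchmark may have used a different clock source.

#include <stdio.h>
#include <time.h>

#define NUMBER_OF_ITERATIONS 10000

/* hypothetical stand-in for start_timer()/end_timer() */
static double now_seconds( void )
{
    return (double)clock() / CLOCKS_PER_SEC;
}

int main( void )
{
    /* <initialization> goes here */
    double startTime = now_seconds();
    for ( int count = 0; count < NUMBER_OF_ITERATIONS; count++ )
    {
        /* execute <code to be optimized> */
    }
    double endTime = now_seconds();
    printf( "execution time: %f seconds\n", endTime - startTime );
    return 0;
}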

Convolution

Results for x86-64 64-bit code

The benchmark was compiled with the gcc version 4.2.2 compiler, invoked with the following command-line options:
-S -O3 -fomit-frame-pointer -funroll-all-loops -ffast-math -march=nocona -mfpmath=sse -msse3

On a 64-bit Linux system running on a 2.66 GHz Core 2 computer we tested the double precision convolution on arrays of 1000 points, with NUMBER_OF_ITERATIONS set to 10000. The gcc ( version 4.2.2 ) generated code ran for 2.9106 seconds; the dco ( version 1.1.0 ) optimized code ran for 1.7927 seconds, a 38% improvement.

code to be optimized ( the inner loop )
double *ap, *bp, tmp1, tmp2, sum;
int i;

sum = 0.;
for ( i = 0; i < ARRAY_DIM; i++ ) {
    tmp1 = *(ap += I);    /* advance ap by stride I, then load */
    tmp2 = *(bp += J);    /* advance bp by stride J, then load */
    sum = sum + tmp1 * tmp2;
}
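
To make the fragment self-contained, one can fix the strides and array sizes. The following sketch assumes unit strides ( I = J = 1 ) and 1000-point arrays as in this test; these values are assumptions for illustration, not taken from the benchmark sources:

#define ARRAY_DIM 1000
#define I 1
#define J 1

double a[ARRAY_DIM + 1], b[ARRAY_DIM + 1];

double convolve( void )
{
    /* the loop pre-increments the pointers, so the first elements
       read are a[1] and b[1]; the arrays carry one extra element
       to keep the last access in bounds */
    double *ap = a, *bp = b;
    double sum = 0.;
    for ( int i = 0; i < ARRAY_DIM; i++ ) {
        double tmp1 = *(ap += I);
        double tmp2 = *(bp += J);
        sum = sum + tmp1 * tmp2;
    }
    return sum;
}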
dco generated code ( only the loop body is shown )
movq %rdx,%r10
andq $15,%r10
jne ___dcox86_wl_3_
___dcox86_wl_5_:
movsd b(,%rax,8),%xmm8
addq $10,%rax
movhpd b+-72(,%rax,8),%xmm8
subq $1,%rbx
movsd b+-64(,%rax,8),%xmm7
movhpd b+-56(,%rax,8),%xmm7
movsd b+-48(,%rax,8),%xmm12
movhpd b+-40(,%rax,8),%xmm12
movsd b+-32(,%rax,8),%xmm13
movhpd b+-24(,%rax,8),%xmm13
movsd b+-16(,%rax,8),%xmm10
movhpd b+-8(,%rax,8),%xmm10
mulpd -80(%rdx,%rax,8),%xmm8
mulpd -64(%rdx,%rax,8),%xmm7
mulpd -48(%rdx,%rax,8),%xmm12
mulpd -32(%rdx,%rax,8),%xmm13
mulpd -16(%rdx,%rax,8),%xmm10
addpd %xmm8,%xmm1
addpd %xmm7,%xmm4
addpd %xmm12,%xmm5
addpd %xmm13,%xmm11
addpd %xmm10,%xmm6
jg ___dcox86_wl_5_
jmp ___dcox86_cc_7_
___dcox86_wl_3_:
movsd (%rdx,%rax,8),%xmm9
addq $10,%rax
movhpd -72(%rdx,%rax,8),%xmm9
subq $1,%rbx
movsd b+-80(,%rax,8),%xmm13
movhpd b+-72(,%rax,8),%xmm13
movsd -64(%rdx,%rax,8),%xmm8
movhpd -56(%rdx,%rax,8),%xmm8
movsd b+-64(,%rax,8),%xmm7
movhpd b+-56(,%rax,8),%xmm7
movsd -48(%rdx,%rax,8),%xmm15
movhpd -40(%rdx,%rax,8),%xmm15
movsd b+-48(,%rax,8),%xmm14
movhpd b+-40(,%rax,8),%xmm14
movsd -32(%rdx,%rax,8),%xmm2
movhpd -24(%rdx,%rax,8),%xmm2
mulpd %xmm13,%xmm9
movsd b+-32(,%rax,8),%xmm13
movhpd b+-24(,%rax,8),%xmm13
movsd -16(%rdx,%rax,8),%xmm12
movhpd -8(%rdx,%rax,8),%xmm12
mulpd %xmm7,%xmm8
movsd b+-16(,%rax,8),%xmm10
addpd %xmm9,%xmm1
movhpd b+-8(,%rax,8),%xmm10
mulpd %xmm14,%xmm15
addpd %xmm8,%xmm4
mulpd %xmm13,%xmm2
addpd %xmm15,%xmm5
mulpd %xmm10,%xmm12
addpd %xmm2,%xmm11
addpd %xmm12,%xmm6
jg ___dcox86_wl_3_
___dcox86_cc_7_:
gcc 4.2.2 generated code

L50:
movsd (%rdx,%rax,8), %xmm13
mulsd b(,%rax,8), %xmm13
addsd %xmm13, %xmm1
movsd 8(%rdx,%rax,8), %xmm12
mulsd b+8(,%rax,8), %xmm12
addsd %xmm12, %xmm1
movsd 16(%rdx,%rax,8), %xmm11
mulsd b+16(,%rax,8), %xmm11
addsd %xmm11, %xmm1 
movsd 24(%rdx,%rax,8), %xmm10
mulsd b+24(,%rax,8), %xmm10
addsd %xmm10, %xmm1
movsd 32(%rdx,%rax,8), %xmm9
mulsd b+32(,%rax,8), %xmm9
addsd %xmm9, %xmm1
movsd 40(%rdx,%rax,8), %xmm8
mulsd b+40(,%rax,8), %xmm8
addsd %xmm8, %xmm1
movsd 48(%rdx,%rax,8), %xmm7
mulsd b+48(,%rax,8), %xmm7
addsd %xmm7, %xmm1
movsd 56(%rdx,%rax,8), %xmm6
mulsd b+56(,%rax,8), %xmm6
addsd %xmm6, %xmm1
movsd 64(%rdx,%rax,8), %xmm5
mulsd b+64(,%rax,8), %xmm5
addsd %xmm5, %xmm1
movsd 72(%rdx,%rax,8), %xmm4
mulsd b+72(,%rax,8), %xmm4
addsd %xmm4, %xmm1
addq $10, %rax
cmpq $512, %rax
jne .L50
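
Every addsd in the gcc loop above accumulates into the single register %xmm1, so the additions form one serial dependency chain. The dco loop instead keeps five independent packed accumulators ( %xmm1, %xmm4, %xmm5, %xmm11, %xmm6 ) and combines them after the loop. In C the same idea looks roughly like the following sketch ( hypothetical names; it assumes ARRAY_DIM is a multiple of 4 and that ap and bp index the arrays directly ). Note that this reassociates the floating point additions, which is why it requires -ffast-math:

double sum0 = 0., sum1 = 0., sum2 = 0., sum3 = 0.;
for ( i = 0; i < ARRAY_DIM; i += 4 ) {
    /* four independent dependency chains can be in flight at once */
    sum0 += ap[i]     * bp[i];
    sum1 += ap[i + 1] * bp[i + 1];
    sum2 += ap[i + 2] * bp[i + 2];
    sum3 += ap[i + 3] * bp[i + 3];
}
sum = ( sum0 + sum1 ) + ( sum2 + sum3 );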

Further optimization results show how to achieve a 54% improvement of the 64-bit x86-64 code for the double precision convolution.

Results for IA-32 32-bit code

The benchmark was compiled with the gcc version 4.3.0 compiler, invoked with the following command-line options:
-S -O3 -fomit-frame-pointer -funroll-all-loops -ffast-math -march=pentium4 -mfpmath=sse -msse2

On a 32-bit Linux system running on a 2.8 GHz Pentium 4 computer we tested the double precision convolution on arrays of 512 points, with NUMBER_OF_ITERATIONS set to 10000. The gcc ( version 4.3.0 ) generated code ran for 0.5619 seconds; the dco ( version 1.1.1 ) optimized code ran for 0.4209 seconds, a 25% improvement.

code to be optimized ( the inner loop )
double *ap, *bp, tmp1, tmp2, sum;
int i;

sum = 0.;
for ( i = 0; i < ARRAY_DIM; i++ ) {
    tmp1 = *(ap += I);    /* advance ap by stride I, then load */
    tmp2 = *(bp += J);    /* advance bp by stride J, then load */
    sum = sum + tmp1 * tmp2;
}
dco generated code
movl %edx,%ebx
pxor %xmm1,%xmm1
andl $15,%ebx
pxor %xmm0,%xmm0
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
pxor %xmm6,%xmm6
jne .L43
___dcox86_wl_2_:
movapd (%edx,%eax),%xmm7
mulpd b(%eax),%xmm7
addpd %xmm7,%xmm2
movapd 16(%edx,%eax),%xmm7
mulpd b+16(%eax),%xmm7
addpd %xmm7,%xmm1
movapd 32(%edx,%eax),%xmm7
mulpd b+32(%eax),%xmm7
addpd %xmm7,%xmm0
movapd 48(%edx,%eax),%xmm7
mulpd b+48(%eax),%xmm7
addpd %xmm7,%xmm4
movapd 64(%edx,%eax),%xmm7
mulpd b+64(%eax),%xmm7
addpd %xmm7,%xmm3
movapd 80(%edx,%eax),%xmm7
mulpd b+80(%eax),%xmm7
addpd %xmm7,%xmm5
movapd 96(%edx,%eax),%xmm7
mulpd b+96(%eax),%xmm7
addpd %xmm7,%xmm6
movapd 112(%edx,%eax),%xmm7
subl $-128,%eax
mulpd b+-16(%eax),%xmm7
cmpl $2048,%eax
addpd %xmm7,%xmm4
jne ___dcox86_wl_2_
jmp ___dcox86_cc_4_
.L43:
movsd (%edx,%eax),%xmm7
movhpd 8(%edx,%eax),%xmm7
mulpd b(%eax),%xmm7
addpd %xmm7,%xmm2
movsd 16(%edx,%eax),%xmm7
movhpd 24(%edx,%eax),%xmm7
mulpd b+16(%eax),%xmm7
addpd %xmm7,%xmm1
movsd 32(%edx,%eax),%xmm7
movhpd 40(%edx,%eax),%xmm7
mulpd b+32(%eax),%xmm7
addpd %xmm7,%xmm0
movsd 48(%edx,%eax),%xmm7
movhpd 56(%edx,%eax),%xmm7
mulpd b+48(%eax),%xmm7
addpd %xmm7,%xmm4
movsd 64(%edx,%eax),%xmm7
movhpd 72(%edx,%eax),%xmm7
mulpd b+64(%eax),%xmm7
addpd %xmm7,%xmm3
movsd 80(%edx,%eax),%xmm7
movhpd 88(%edx,%eax),%xmm7
mulpd b+80(%eax),%xmm7
addpd %xmm7,%xmm5
movsd 96(%edx,%eax),%xmm7
movhpd 104(%edx,%eax),%xmm7
mulpd b+96(%eax),%xmm7
addpd %xmm7,%xmm6
movsd 112(%edx,%eax),%xmm7
movhpd 120(%edx,%eax),%xmm7
subl $-128,%eax
cmpl $2048,%eax
mulpd b+-16(%eax),%xmm7
addpd %xmm7,%xmm4
jne .L43
___dcox86_cc_4_:
addpd %xmm6,%xmm0
addpd %xmm5,%xmm1
addpd %xmm3,%xmm2
addpd %xmm4,%xmm1
addpd %xmm0,%xmm2
addpd %xmm1,%xmm2
gcc 4.3.0 generated code
.L43:
movsd (%edx,%eax), %xmm1
movhpd 8(%edx,%eax), %xmm1
movapd %xmm1, %xmm7
mulpd b(%eax), %xmm7
addpd %xmm7, %xmm2
movsd 16(%edx,%eax), %xmm6
movhpd 24(%edx,%eax), %xmm6
movapd %xmm6, %xmm5
mulpd b+16(%eax), %xmm5
addpd %xmm5, %xmm2
movsd 32(%edx,%eax), %xmm0
movhpd 40(%edx,%eax), %xmm0
movapd %xmm0, %xmm3
mulpd b+32(%eax), %xmm3
addpd %xmm3, %xmm2
movsd 48(%edx,%eax), %xmm4
movhpd 56(%edx,%eax), %xmm4
movapd %xmm4, %xmm7
mulpd b+48(%eax), %xmm7
addpd %xmm7, %xmm2
movsd 64(%edx,%eax), %xmm1
movhpd 72(%edx,%eax), %xmm1
movapd %xmm1, %xmm6
mulpd b+64(%eax), %xmm6
addpd %xmm6, %xmm2
movsd 80(%edx,%eax), %xmm5
movhpd 88(%edx,%eax), %xmm5
movapd %xmm5, %xmm0
mulpd b+80(%eax), %xmm0
addpd %xmm0, %xmm2
movsd 96(%edx,%eax), %xmm3
movhpd 104(%edx,%eax), %xmm3
movapd %xmm3, %xmm4
mulpd b+96(%eax), %xmm4
addpd %xmm4, %xmm2
movsd 112(%edx,%eax), %xmm7
movhpd 120(%edx,%eax), %xmm7
movapd %xmm7, %xmm6
mulpd b+112(%eax), %xmm6
addpd %xmm6, %xmm2
subl $-128, %eax
cmpl $2048, %eax
jne .L43

Note the use of SIMD instructions in the compiler-generated code ( e.g. mulpd ). Note also the conditionally generated aligned memory accesses in the dco-generated code ( the loop at ___dcox86_wl_2_ ): the alignment of the input stream is tested once before the loop, and either the aligned movapd version or the unaligned movsd / movhpd version of the loop is executed.
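
The same dispatch is easy to express with SSE2 intrinsics. The sketch below is not dco's actual output; it only illustrates the technique, under the assumptions that n is even and b is 16-byte aligned. It tests the alignment of the other stream once, then runs either an aligned-load loop ( _mm_load_pd, compare movapd ) or a loop that assembles each vector from two scalar loads ( _mm_load_sd / _mm_loadh_pd, compare movsd / movhpd ):

#include <emmintrin.h>
#include <stdint.h>

double dot( const double *a, const double *b, int n )
{
    __m128d acc = _mm_setzero_pd();
    if ( ((uintptr_t)a & 15) == 0 ) {
        /* both streams 16-byte aligned: movapd-style loads */
        for ( int i = 0; i < n; i += 2 )
            acc = _mm_add_pd( acc,
                  _mm_mul_pd( _mm_load_pd( a + i ), _mm_load_pd( b + i ) ) );
    } else {
        /* a is misaligned: build each vector from two scalar
           loads, as movsd/movhpd do in the listings above */
        for ( int i = 0; i < n; i += 2 ) {
            __m128d va = _mm_loadh_pd( _mm_load_sd( a + i ), a + i + 1 );
            acc = _mm_add_pd( acc, _mm_mul_pd( va, _mm_load_pd( b + i ) ) );
        }
    }
    /* horizontal sum of the two lanes of acc */
    double lo = _mm_cvtsd_f64( acc );
    double hi = _mm_cvtsd_f64( _mm_unpackhi_pd( acc, acc ) );
    return lo + hi;
}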