In this article you will find samples of the code, generated by dco, that illustrate various optimization techniques supported by the optimizer.
Click here for an example of the code generated by the SIMDinator ( the part of dco responsible for converting scalar instructions into SIMD instructions ).

Click here for the example of the DSP code, optimized by dco.

loops reconstruction

Often gcc converts loops into a convoluted set of conditional jumps, which is difficult to deal with and simply runs slower than it could. dco is able to reconstruct some loops, which enables it to perform optimizations that would otherwise not be applicable.

Consider the following code ( part of the Livermore kernel benchmark, 16% improved by dco )

code to be optimized ( the inner loop )
 /* Inner loop over j = 1 .. jn-1 for a fixed row k.
  * Each iteration writes za[k][j] and zb[k][j]; both are a
  * (difference of zp+zq sums) * (sum of two zr elements) / (sum of two zm elements).
  * Only zp, zq, zr and zm are read; only za and zb are written. */
 for ( j=1 ; j<jn ; j++ )
{
/* za[k][j]: uses row k+1 and row k at column j-1, plus zr[k][j]. */
za[k][j] = ( zp[k+1][j-1] +zq[k+1][j-1] -zp[k][j-1] -zq[k][j-1] )*
( zr[k][j] +zr[k][j-1] ) / ( zm[k][j-1] +zm[k+1][j-1]);
/* zb[k][j]: uses row k at columns j-1 and j, plus zr[k-1][j]. */
zb[k][j] = ( zp[k][j-1] +zq[k][j-1] -zp[k][j] -zq[k][j] ) *
( zr[k][j] +zr[k-1][j] ) / ( zm[k][j] +zm[k][j-1]);
}
dco generated code ( with -no-packing )
# dco output for the loop nest (AT&T syntax, 32-bit x86, scalar SSE2).
# Register/stack roles visible in this listing:
#   %esi      inner-loop counter (starts at 1, +1 per inner iteration)
#   %ebp      outer-loop counter (starts at 2, +1 per outer step, loop runs while <= 5)
#   28(%esp)  inner-loop bound, copied from the word at symbol as2
#   20(%esp)  starts at 202 and grows by 101 per outer step (i.e. 101 * %ebp here)
#   %eax/%ebx running pointers used by the inner loop
.L1038:
# Setup. Four 32-bit immediates go to two pairs of adjacent addresses
# (as1+433160/433164 and as1+433128/433132) -- presumably the low/high halves
# of two 64-bit double constants; TODO confirm against the C source.
movl as2,%edx
movl $1,%esi
movl $1608035756,as1+433160
movl $1064193910,as1+433164
movl $2,%ebp
movl $1587419913,as1+433128
movl %edx,28(%esp)
# Flags set by this cmpl are consumed by the jl below; the movl instructions
# in between do not modify flags.
cmpl 28(%esp),%esi
movl $1064356649,as1+433132
movl $202,20(%esp)
# Enter the inner loop if %esi < bound (signed); otherwise test the outer counter.
jl .L1011
cmpl $5,%ebp
jg .L1031
___dcox86_do_3_:
# Outer-loop step: reset the inner counter, advance %ebp and 20(%esp) together.
movl $1,%esi
addl $1,%ebp
addl $101,20(%esp)
# Skip the inner loop entirely when %esi >= bound.
cmpl 28(%esp),%esi
jge .L1064
.L1011:
# Inner-loop preheader: the two base pointers are computed once per entry.
#   %eax = as1+265080 + 8 * 20(%esp)
#   %ebx = as1+264272 + 808 * %ebp
# In the code shown, 20(%esp) == 101 * %ebp, so %eax == %ebx + 808
# (808 = 101 * 8; presumably one row of 101 doubles -- TODO confirm array layout).
movl 20(%esp),%ecx
movl $as1+265080,%eax
movl %ebp,%ebx
imull $808,%ebx
imull $8,%ecx
leal as1+264272(,%ebx),%ebx
addl %ecx,%eax
___dcox86_wl_6_:
# Reconstructed inner loop. Each iteration advances %esi by 1 and both
# pointers by 8 bytes; every memory operand is a constant displacement from
# %eax or %ebx, so no index arithmetic is performed inside the loop.
# Two independent results are built in %xmm5 and %xmm3, with their
# instructions interleaved.
movsd (%eax),%xmm5
addl $1,%esi
movsd (%ebx),%xmm3
addl $8,%eax
addsd 5648(%eax),%xmm5
addl $8,%ebx
addsd 5648(%ebx),%xmm3
movsd 11312(%ebx),%xmm4
movsd 16960(%ebx),%xmm6
movsd 16968(%ebx),%xmm2
addsd 16960(%eax),%xmm6
addsd 16960(%ebx),%xmm2
# %xmm4 is copied to %xmm7 so the same loaded value feeds both products.
movapd %xmm4,%xmm7
subsd -8(%ebx),%xmm5
subsd (%ebx),%xmm3
addsd 10504(%ebx),%xmm4
addsd 11304(%ebx),%xmm7
# Loop test is issued early; the scalar SSE arithmetic and stores that follow
# do not write EFLAGS, so the jl at the bottom still sees this comparison.
cmpl 28(%esp),%esi
subsd 5648(%ebx),%xmm5
subsd 5656(%ebx),%xmm3
mulsd %xmm7,%xmm5
mulsd %xmm4,%xmm3
divsd %xmm6,%xmm5
divsd %xmm2,%xmm3
# Store the two results (presumably za[k][j] and zb[k][j] -- TODO confirm).
movsd %xmm5,-5656(%ebx)
movsd %xmm3,22624(%ebx)
# Repeat while %esi < bound (signed).
jl ___dcox86_wl_6_
.L1064:
# Outer-loop test: take another outer step while %ebp <= 5.
cmpl $5,%ebp
jle ___dcox86_do_3_
# Exit label of the loop nest.
.L1031:
compiler generated code
# Compiler output for the same loop nest (AT&T syntax, 32-bit x86, scalar SSE2).
# Register/stack roles visible in this listing:
#   %esi      inner-loop counter (starts at 1)
#   %edi      starts at 1, then takes the value of %ebp before each %ebp increment
#   %ebp      outer-loop counter (starts at 2, loop exits when > 5)
#   28(%esp)  inner-loop bound, copied from the word at symbol as2
#   20(%esp)  starts at 202 and grows by 101 per outer step
.L1038:
# Setup. Four 32-bit immediates go to two pairs of adjacent addresses --
# presumably the halves of two 64-bit double constants; TODO confirm.
movl $1608035756, as1+433160
movl $1064193910, as1+433164
movl $1587419913, as1+433128
movl $1064356649, as1+433132
# %ebp is used as a temporary for the bound here, then reloaded with 2 below.
movl as2, %ebp
movl %ebp, 28(%esp)
movl $1, %esi
movl $1, %edi
movl $2, %ebp
movl $202, 20(%esp)
.L1060:
# Inner-loop test at the top: run the body while %esi < bound (signed).
# The body returns here with an unconditional jmp, so every iteration
# executes both a jmp and this conditional branch.
cmpl 28(%esp), %esi
jl .L1011
.L1064:
# Outer-loop step: remember the current %ebp in %edi, exit if %ebp > 5,
# otherwise reset %esi and advance %ebp and 20(%esp).
movl %ebp, %edi
cmpl $5, %ebp
jg .L1031
movl $1, %esi
addl $1, %ebp
addl $101, 20(%esp)
# If the inner loop would not run at all, go straight to the next outer step.
cmpl 28(%esp), %esi
jge .L1064
.L1011:
# Inner-loop body. The element indices are recomputed on every iteration:
#   %ebx = %esi - 1
#   %eax = 101 * %edi   (2*edi, +edi = 3*edi, edi + 8*3*edi = 25*edi, edi + 4*25*edi)
#   %edx = 101 * %edi + %esi
#   %ecx = 20(%esp) + %esi - 1
#   %eax = 101 * %edi + %esi - 1   (after the addl further down)
leal -1(%esi), %ebx
leal (%edi,%edi), %eax
addl %edi, %eax
leal (%edi,%eax,8), %edx
leal (%edi,%edx,4), %eax
leal (%eax,%esi), %edx
movl 20(%esp), %ecx
addl %ebx, %ecx
# First result, built in %xmm5; all operands use scaled-index addressing
# off the absolute symbol as1.
movsd as1+265080(,%ecx,8), %xmm5
addsd as1+270736(,%ecx,8), %xmm5
addl %ebx, %eax
subsd as1+265080(,%eax,8), %xmm5
subsd as1+270736(,%eax,8), %xmm5
# %xmm4 keeps the value loaded here; it is reused for the second result.
movsd as1+276392(,%edx,8), %xmm4
movapd %xmm4, %xmm7
addsd as1+276392(,%eax,8), %xmm7
mulsd %xmm7, %xmm5
movsd as1+282048(,%eax,8), %xmm6
addsd as1+282048(,%ecx,8), %xmm6
divsd %xmm6, %xmm5
movsd %xmm5, as1+259424(,%edx,8)
# Second result, built in %xmm3. The first two operands are loaded again from
# the same addresses that were subtracted from %xmm5 above.
movsd as1+265080(,%eax,8), %xmm3
addsd as1+270736(,%eax,8), %xmm3
subsd as1+265080(,%edx,8), %xmm3
subsd as1+270736(,%edx,8), %xmm3
addsd as1+275584(,%edx,8), %xmm4
mulsd %xmm4, %xmm3
movsd as1+282048(,%edx,8), %xmm2
addsd as1+282048(,%eax,8), %xmm2
divsd %xmm2, %xmm3
movsd %xmm3, as1+287704(,%edx,8)
# Advance the inner counter and jump back to the test at .L1060.
addl $1, %esi
jmp .L1060
# Exit label of the loop nest.
.L1031:



Note the loop ___dcox86_wl_6_: - jl ___dcox86_wl_6_ generated by dco. Having such a loop in place enabled further optimizations ( e.g. moving the address calculation, shown in this color, out of the loop ).

memory optimizations

dco performs very powerful and comprehensive memory optimizations and is capable of determining and resolving memory dependencies. Significant code speed-ups are achieved by this functionality, and many optimizations are possible because of it.

In the future we hope to extend this article by showing various examples of memory optimizations achieved by dco. But for now, here is one.

code to be optimized ( the inner loop )
 /* Loop-carried recurrence over k = 1 .. n-1: each x[k] is the previous
  * element x[k-1] (written by the preceding iteration) plus y[k]. */
 for ( k=1 ; k<n ; k++ )
{
x[k] = x[k-1] + y[k];
}
dco generated code
# dco output for the recurrence loop, unrolled by 8 (AT&T syntax, 32-bit x86).
# %edx is the element index, advanced by 8 per iteration; the loop repeats
# until %edx equals %ecx.
.L1078:
___dcox86_wl_1_:
# The only load from the as1+24016 array in the whole iteration: the running
# value is then carried in %xmm7 across all eight steps.
movsd as1+24016(,%edx,8),%xmm7
# %ebx = old %edx + 7. It is not read inside this loop; presumably kept
# because the value is needed after the loop -- TODO confirm.
leal 7(%edx),%ebx
# %edx is bumped early, so the displacements below are 64 bytes lower than
# they would be relative to the old index (e.g. 31968 + 8*8 = 32032).
addl $8,%edx
addsd as1+31968(,%edx,8),%xmm7
# Loop test is issued early; the movsd/addsd instructions that follow do not
# write EFLAGS, so the jne at the bottom still sees this comparison.
cmpl %edx,%ecx
# Eight add/store steps: each store writes the running sum, and the next
# addsd continues from the register instead of reloading the stored element.
movsd %xmm7,as1+23960(,%edx,8)
addsd as1+31976(,%edx,8),%xmm7
movsd %xmm7,as1+23968(,%edx,8)
addsd as1+31984(,%edx,8),%xmm7
movsd %xmm7,as1+23976(,%edx,8)
addsd as1+31992(,%edx,8),%xmm7
movsd %xmm7,as1+23984(,%edx,8)
addsd as1+32000(,%edx,8),%xmm7
movsd %xmm7,as1+23992(,%edx,8)
addsd as1+32008(,%edx,8),%xmm7
movsd %xmm7,as1+24000(,%edx,8)
addsd as1+32016(,%edx,8),%xmm7
movsd %xmm7,as1+24008(,%edx,8)
addsd as1+32024(,%edx,8),%xmm7
# Last store lands at as1+24016 + 8*(new %edx), the address the next
# iteration loads from at the top of the loop.
movsd %xmm7,as1+24016(,%edx,8)
jne ___dcox86_wl_1_
compiler generated code
# Compiler output for the recurrence loop, unrolled by 8 (AT&T syntax, 32-bit x86).
# %edx is the element index, advanced by 8 per iteration; the loop repeats
# until %edx equals %ecx.
# Every step is a separate load / add / store group with its own index
# register (%edx, then %ebx and %eax alternating) and its own %xmm register.
# From the second group on, the load at as1+24016(,idx,8) reads the same
# address that the previous group has just stored to (as1+24024(,idx-1,8)).
.L1078:
# Step 0: index %edx.
movsd as1+24016(,%edx,8), %xmm7
addsd as1+32032(,%edx,8), %xmm7
movsd %xmm7, as1+24024(,%edx,8)
# Step 1: index %edx + 1.
leal 1(%edx), %ebx
movsd as1+24016(,%ebx,8), %xmm6
addsd as1+32032(,%ebx,8), %xmm6
movsd %xmm6, as1+24024(,%ebx,8)
# Step 2: index %edx + 2.
leal 2(%edx), %eax
movsd as1+24016(,%eax,8), %xmm5
addsd as1+32032(,%eax,8), %xmm5
movsd %xmm5, as1+24024(,%eax,8)
# Step 3: index %edx + 3.
leal 3(%edx), %ebx
movsd as1+24016(,%ebx,8), %xmm4
addsd as1+32032(,%ebx,8), %xmm4
movsd %xmm4, as1+24024(,%ebx,8)
# Step 4: index %edx + 4.
leal 4(%edx), %eax
movsd as1+24016(,%eax,8), %xmm3
addsd as1+32032(,%eax,8), %xmm3
movsd %xmm3, as1+24024(,%eax,8)
# Step 5: index %edx + 5.
leal 5(%edx), %ebx
movsd as1+24016(,%ebx,8), %xmm2
addsd as1+32032(,%ebx,8), %xmm2
movsd %xmm2, as1+24024(,%ebx,8)
# Step 6: index %edx + 6.
leal 6(%edx), %eax
movsd as1+24016(,%eax,8), %xmm1
addsd as1+32032(,%eax,8), %xmm1
movsd %xmm1, as1+24024(,%eax,8)
# Step 7: index %edx + 7.
leal 7(%edx), %ebx
movsd as1+24016(,%ebx,8), %xmm0
addsd as1+32032(,%ebx,8), %xmm0
movsd %xmm0, as1+24024(,%ebx,8)
# Advance the index by the unroll factor and loop until it reaches %ecx.
addl $8, %edx
cmpl %edx, %ecx
jne .L1078

Judge for yourself which code is better; just in case, the hint is here.