In
this article you will find samples of
the code, generated by dco,
that
illustrate various
optimization techniques supported by the optimizer. Click here
for an example of the code generated by the SIMDinator ( the part of dco
responsible for
converting scalar instructions into SIMD instructions ).
Click
here
for an example of DSP code optimized by dco.
Often gcc converts loops into a convoluted
set of conditional
jumps,
which is difficult to deal with and simply runs slower than it could. dco
is able to reconstruct
some loops, which enables it to perform optimizations that would otherwise not be
applicable.
Consider the following code ( part of the Livermore kernel benchmark,
improved by
16% with dco
)
for ( j=1 ; j<jn ; j++ ) { za[k][j] = ( zp[k+1][j-1] +zq[k+1][j-1] -zp[k][j-1] -zq[k][j-1] )* ( zr[k][j] +zr[k][j-1] ) / ( zm[k][j-1] +zm[k+1][j-1]); zb[k][j] = ( zp[k][j-1] +zq[k][j-1] -zp[k][j] -zq[k][j] ) * ( zr[k][j] +zr[k-1][j] ) / ( zm[k][j] +zm[k][j-1]); }
|
.L1038: movl as2,%edx movl $1,%esi movl $1608035756,as1+433160 movl $1064193910,as1+433164 movl $2,%ebp movl $1587419913,as1+433128 movl %edx,28(%esp) cmpl 28(%esp),%esi movl $1064356649,as1+433132 movl $202,20(%esp) jl .L1011 cmpl $5,%ebp jg .L1031 ___dcox86_do_3_: movl $1,%esi addl $1,%ebp addl $101,20(%esp) cmpl 28(%esp),%esi jge .L1064 .L1011: movl 20(%esp),%ecx movl $as1+265080,%eax movl %ebp,%ebx imull $808,%ebx imull $8,%ecx leal as1+264272(,%ebx),%ebx addl %ecx,%eax ___dcox86_wl_6_: movsd (%eax),%xmm5 addl $1,%esi movsd (%ebx),%xmm3 addl $8,%eax addsd 5648(%eax),%xmm5 addl $8,%ebx addsd 5648(%ebx),%xmm3 movsd 11312(%ebx),%xmm4 movsd 16960(%ebx),%xmm6 movsd 16968(%ebx),%xmm2 addsd 16960(%eax),%xmm6 addsd 16960(%ebx),%xmm2 movapd %xmm4,%xmm7 subsd -8(%ebx),%xmm5 subsd (%ebx),%xmm3 addsd 10504(%ebx),%xmm4 addsd 11304(%ebx),%xmm7 cmpl 28(%esp),%esi subsd 5648(%ebx),%xmm5 subsd 5656(%ebx),%xmm3 mulsd %xmm7,%xmm5 mulsd %xmm4,%xmm3 divsd %xmm6,%xmm5 divsd %xmm2,%xmm3 movsd %xmm5,-5656(%ebx) movsd %xmm3,22624(%ebx) jl ___dcox86_wl_6_ .L1064: cmpl $5,%ebp jle ___dcox86_do_3_ .L1031:
|
.L1038: movl $1608035756, as1+433160 movl $1064193910, as1+433164 movl $1587419913, as1+433128 movl $1064356649, as1+433132 movl as2, %ebp movl %ebp, 28(%esp) movl $1, %esi movl $1, %edi movl $2, %ebp movl $202, 20(%esp) .L1060: cmpl 28(%esp), %esi jl .L1011 .L1064: movl %ebp, %edi cmpl $5, %ebp jg .L1031 movl $1, %esi addl $1, %ebp addl $101, 20(%esp) cmpl 28(%esp), %esi jge .L1064 .L1011: leal -1(%esi), %ebx leal (%edi,%edi), %eax addl %edi, %eax leal (%edi,%eax,8), %edx leal (%edi,%edx,4), %eax leal (%eax,%esi), %edx movl 20(%esp), %ecx addl %ebx, %ecx movsd as1+265080(,%ecx,8), %xmm5 addsd as1+270736(,%ecx,8), %xmm5 addl %ebx, %eax subsd as1+265080(,%eax,8), %xmm5 subsd as1+270736(,%eax,8), %xmm5 movsd as1+276392(,%edx,8), %xmm4 movapd %xmm4, %xmm7 addsd as1+276392(,%eax,8), %xmm7 mulsd %xmm7, %xmm5 movsd as1+282048(,%eax,8), %xmm6 addsd as1+282048(,%ecx,8), %xmm6 divsd %xmm6, %xmm5 movsd %xmm5, as1+259424(,%edx,8) movsd as1+265080(,%eax,8), %xmm3 addsd as1+270736(,%eax,8), %xmm3 subsd as1+265080(,%edx,8), %xmm3 subsd as1+270736(,%edx,8), %xmm3 addsd as1+275584(,%edx,8), %xmm4 mulsd %xmm4, %xmm3 movsd as1+282048(,%edx,8), %xmm2 addsd as1+282048(,%eax,8), %xmm2 divsd %xmm2, %xmm3 movsd %xmm3, as1+287704(,%edx,8) addl $1, %esi jmp .L1060 .L1031:
| Note the loop ( from the label ___dcox86_wl_6_:
to the instruction jl
___dcox86_wl_6_ ) generated
by dco. Having such
a loop in place enables further optimizations ( e.g. moving the address
calculation, shown in this
color, out of the loop ). dco performs
very
powerful and
comprehensive memory
optimizations, capable of determining and resolving memory dependencies.
Significant code speed-ups are achieved by this functionality, and
many optimizations are possible because of it.
In the future we hope to extend this article by showing various
examples of memory optimizations achieved by
dco. But for now, here is one.
for ( k=1 ; k<n ; k++ ) { x[k] = x[k-1] + y[k]; }
|
.L1078: ___dcox86_wl_1_: movsd as1+24016(,%edx,8),%xmm7 leal 7(%edx),%ebx addl $8,%edx addsd as1+31968(,%edx,8),%xmm7 cmpl %edx,%ecx movsd %xmm7,as1+23960(,%edx,8) addsd as1+31976(,%edx,8),%xmm7 movsd %xmm7,as1+23968(,%edx,8) addsd as1+31984(,%edx,8),%xmm7 movsd %xmm7,as1+23976(,%edx,8) addsd as1+31992(,%edx,8),%xmm7 movsd %xmm7,as1+23984(,%edx,8) addsd as1+32000(,%edx,8),%xmm7 movsd %xmm7,as1+23992(,%edx,8) addsd as1+32008(,%edx,8),%xmm7 movsd %xmm7,as1+24000(,%edx,8) addsd as1+32016(,%edx,8),%xmm7 movsd %xmm7,as1+24008(,%edx,8) addsd as1+32024(,%edx,8),%xmm7 movsd %xmm7,as1+24016(,%edx,8) jne ___dcox86_wl_1_
|
.L1078: movsd as1+24016(,%edx,8), %xmm7 addsd as1+32032(,%edx,8), %xmm7 movsd %xmm7, as1+24024(,%edx,8) leal 1(%edx), %ebx movsd as1+24016(,%ebx,8), %xmm6 addsd as1+32032(,%ebx,8), %xmm6 movsd %xmm6, as1+24024(,%ebx,8) leal 2(%edx), %eax movsd as1+24016(,%eax,8), %xmm5 addsd as1+32032(,%eax,8), %xmm5 movsd %xmm5, as1+24024(,%eax,8) leal 3(%edx), %ebx movsd as1+24016(,%ebx,8), %xmm4 addsd as1+32032(,%ebx,8), %xmm4 movsd %xmm4, as1+24024(,%ebx,8) leal 4(%edx), %eax movsd as1+24016(,%eax,8), %xmm3 addsd as1+32032(,%eax,8), %xmm3 movsd %xmm3, as1+24024(,%eax,8) leal 5(%edx), %ebx movsd as1+24016(,%ebx,8), %xmm2 addsd as1+32032(,%ebx,8), %xmm2 movsd %xmm2, as1+24024(,%ebx,8) leal 6(%edx), %eax movsd as1+24016(,%eax,8), %xmm1 addsd as1+32032(,%eax,8), %xmm1 movsd %xmm1, as1+24024(,%eax,8) leal 7(%edx), %ebx movsd as1+24016(,%ebx,8), %xmm0 addsd as1+32032(,%ebx,8), %xmm0 movsd %xmm0, as1+24024(,%ebx,8) addl $8, %edx cmpl %edx, %ecx jne .L1078
| Judge for yourself which code is
better; just in
case,
a hint is here. |