Michael Abrash's Graphics Programming Black Book Special Edition: Speeding Up C with Assembly Language

Table of Contents

It’s hard to squeeze much more performance from this code by tweaking it, as exemplified by Listing 8.3, a fine-tuned assembly version of FindIDAverage that was produced by looking at the assembly output of MS C/C++ and tightening it. Listing 8.3 eliminates all stack frame access in the inner loop, but that’s about all the tightening there is to do. The result, as shown in Table 8.1, is that Listing 8.3 runs a modest 11 percent faster than Listing 8.1 on a 386. The results could vary considerably, depending on the nature of the data set searched through (average block size and frequency of matches). But, then, understanding the typical and worst case conditions is part of optimization, isn’t it?

LISTING 8.3 L8-3.ASM

; Typically optimized assembly language version of FindIDAverage.
SearchedForID   equ     4      ;Passed parameter offsets in the
BlockPointer    equ     6      ; stack frame (skip over pushed BP
; and the return address)
NextBlock       equ     0      ;Field offsets in struct BlockHeader
BlockCount      equ     2
BLOCK_HEADER_SIZE equ   4      ;Number of bytes in struct BlockHeader
ID              equ     0      ;struct DataElement field offsets
Value           equ     2
DATA_ELEMENT_SIZE equ   4      ;Number of bytes in struct DataElement
        .model  small
        .code
        public  _FindIDAverage

	On 20 MHz 386	On 10 MHz 286


Listing 8.1	294 microseconds	768 microseconds
(MSC with maximum optimization)
Listing 8.3	265	644
(Assembly)
Listing 8.4	212	486
(Optimized assembly)
Listing 8.6	100	207
(Optimized assembly with reorganized data)

Table 8.1 Execution Times of FindIDAverage.

_FindIDAverage  proc    near
        push    bp              ;Save caller’s stack frame
        mov     bp,sp           ;Point to our stack frame
        push    di              ;Preserve C register variables
        push    si
        sub     dx,dx           ;IDMatchSum = 0
        mov     bx,dx           ;IDMatchCount = 0
        mov     si,[bp+BlockPointer]    ;Pointer to first block
        mov     ax,[bp+SearchedForID]   ;ID we’re looking for
; Search through all the linked blocks until the last block
; (marked with a NULL pointer to the next block) has been searched.
BlockLoop:
; Point to the first DataElement entry within this block.
        lea     di,[si+BLOCK_HEADER_SIZE]
; Search through all the DataElement entries within this block
; and accumulate data from all that match the desired ID.
        mov     cx,[si+BlockCount]
        jcxz    DoNextBlock     ;No data in this block
IntraBlockLoop:
        cmp     [di+ID],ax      ;Do we have an ID match?
        jnz     NoMatch         ;No match
        inc     bx              ;We have a match; IDMatchCount++;
        add     dx,[di+Value]   ;IDMatchSum += DataPointer->Value;
NoMatch:
        add     di,DATA_ELEMENT_SIZE ;point to the next element
        loop    IntraBlockLoop
; Point to the next block and continue if that pointer isn’t NULL.
DoNextBlock:
        mov     si,[si+NextBlock] ;Get pointer to the next block
        and     si,si           ;Is it a NULL pointer?
        jnz     BlockLoop       ;No, continue
; Calculate the average of all matches.
        sub     ax,ax           ;Assume we found no matches
        and     bx,bx
        jz      Done            ;We didn’t find any matches, return 0
        xchg    ax,dx           ;Prepare for division
        div     bx              ;Return IDMatchSum / IDMatchCount
Done:   pop     si              ;Restore C register variables
        pop     di
        pop     bp              ;Restore caller’s stack frame
        ret
_FindIDAverage  ENDP
        end

Listing 8.4 tosses some sophisticated optimization techniques into the mix. The loop is unrolled eight times, eliminating a good deal of branching, and SCASW is used instead of CMP [DI],AX. (Note, however, that SCASW is in fact slower than CMP [DI],AX on the 386 and 486, and is sometimes faster on the 286 and 8088 only because it’s shorter and therefore may prefetch faster.) This advanced tweaking produces a 39 percent improvement over the original C code—substantial, but not a tremendous return for the optimization effort invested.

LISTING 8.4 L8-4.ASM

; Heavily optimized assembly language version of FindIDAverage.
; Features an unrolled loop and more efficient pointer use.
SearchedForID   equ     4       ;Passed parameter offsets in the
BlockPointer    equ     6       ; stack frame (skip over pushed BP
                                ; and the return address)
NextBlock       equ     0       ;Field offsets in struct BlockHeader
BlockCount      equ     2
BLOCK_HEADER_SIZE equ   4       ;Number of bytes in struct BlockHeader
ID              equ     0       ;struct DataElement field offsets
Value           equ     2
DATA_ELEMENT_SIZE equ   4       ;Number of bytes in struct DataElement
        .model  small
        .code
        public  _FindIDAverage
_FindIDAverage  proc    near
        push    bp              ;Save caller’s stack frame
        mov     bp,sp           ;Point to our stack frame
        push    di              ;Preserve C register variables
        push    si
        mov     di,ds           ;Prepare for SCASW
        mov     es,di
        cld
        sub     dx,dx           ;IDMatchSum = 0
        mov     bx,dx           ;IDMatchCount = 0
        mov     si,[bp+BlockPointer]    ;Pointer to first block
        mov     ax,[bp+SearchedForID]   ;ID we’re looking for
; Search through all of the linked blocks until the last block
; (marked with a NULL pointer to the next block) has been searched.
BlockLoop:
; Point to the first DataElement entry within this block.
        lea     di,[si+BLOCK_HEADER_SIZE]
; Search through all the DataElement entries within this block
; and accumulate data from all that match the desired ID.
        mov     cx,[si+BlockCount] ;Number of elements in this block
        jcxz    DoNextBlock     ;Skip this block if it’s empty
        mov     bp,cx           ;***stack frame no longer available***
        add     cx,7
        shr     cx,1            ;Number of repetitions of the unrolled
        shr     cx,1            ; loop = (BlockCount + 7) / 8
        shr     cx,1
        and     bp,7            ;Generate the entry point for the
        shl     bp,1            ; first, possibly partial pass through
        jmp     cs:[LoopEntryTable+bp] ; the unrolled loop and
                                ; vector to that entry point
        align   2
LoopEntryTable  label   word
        dw      LoopEntry8,LoopEntry1,LoopEntry2,LoopEntry3
        dw      LoopEntry4,LoopEntry5,LoopEntry6,LoopEntry7
M_IBL   macro   P1
        local   NoMatch
LoopEntry&P1&:
        scasw                   ;Do we have an ID match?
        jnz     NoMatch         ;No match
                                ;We have a match
        inc     bx              ;IDMatchCount++;
        add     dx,[di]         ;IDMatchSum += DataPointer->Value;
NoMatch:
        add     di,DATA_ELEMENT_SIZE-2 ;point to the next element
                                ; (SCASW advanced 2 bytes already)
        endm
        align   2
IntraBlockLoop:
        M_IBL   8
        M_IBL   7
        M_IBL   6
        M_IBL   5
        M_IBL   4
        M_IBL   3
        M_IBL   2
        M_IBL   1
        loop    IntraBlockLoop
; Point to the next block and continue if that pointer isn’t NULL.
DoNextBlock:
        mov     si,[si+NextBlock] ;Get pointer to the next block
        and     si,si           ;Is it a NULL pointer?
        jnz     BlockLoop       ;No, continue
; Calculate the average of all matches.
        sub     ax,ax           ;Assume we found no matches
        and     bx,bx
        jz      Done            ;We didn’t find any matches, return 0
        xchg    ax,dx           ;Prepare for division
        div     bx              ;Return IDMatchSum / IDMatchCount
Done:   pop     si              ;Restore C register variables
        pop     di
        pop     bp              ;Restore caller’s stack frame
        ret
_FindIDAverage  ENDP
        end

Table of Contents