| Previous | Table of Contents | Next |
Its hard to squeeze much more performance from this code by tweaking it, as exemplified by Listing 8.3, a fine-tuned assembly version of FindIDAverage that was produced by looking at the assembly output of MS C/C++ and tightening it. Listing 8.3 eliminates all stack frame access in the inner loop, but thats about all the tightening there is to do. The result, as shown in Table 8.1, is that Listing 8.3 runs a modest 11 percent faster than Listing 8.1 on a 386. The results could vary considerably, depending on the nature of the data set searched through (average block size and frequency of matches). But, then, understanding the typical and worst case conditions is part of optimization, isnt it?
LISTING 8.3 L8-3.ASM
; Typically optimized assembly language version of FindIDAverage.
SearchedForID equ 4 ;Passed parameter offsets in the
BlockPointer equ 6 ; stack frame (skip over pushed BP
; and the return address)
NextBlock equ 0 ;Field offsets in struct BlockHeader
BlockCount equ 2
BLOCK_HEADER_SIZE equ 4 ;Number of bytes in struct BlockHeader
ID equ 0 ;struct DataElement field offsets
Value equ 2
DATA_ELEMENT_SIZE equ 4 ;Number of bytes in struct DataElement
.model small
.code
public _FindIDAverage
| On 20 MHz 386 | On 10 MHz 286 | |
|---|---|---|
| Listing 8.1 | 294 microseconds | 768 microseconds |
| (MSC with maximum optimization) | ||
| Listing 8.3 | 265 | 644 |
| (Assembly) | ||
| Listing 8.4 | 212 | 486 |
| (Optimized assembly) | ||
| Listing 8.6 | 100 | 207 |
| (Optimized assembly with reorganized data) | ||
| Table 8.1 Execution Times of FindIDAverage. | ||
_FindIDAverage proc near
push bp ;Save callers stack frame
mov bp,sp ;Point to our stack frame
push di ;Preserve C register variables
push si
sub dx,dx ;IDMatchSum = 0
mov bx,dx ;IDMatchCount = 0
mov si,[bp+BlockPointer] ;Pointer to first block
mov ax,[bp+SearchedForID] ;ID were looking for
; Search through all the linked blocks until the last block
; (marked with a NULL pointer to the next block) has been searched.
BlockLoop:
; Point to the first DataElement entry within this block.
lea di,[si+BLOCK_HEADER_SIZE]
; Search through all the DataElement entries within this block
; and accumulate data from all that match the desired ID.
mov cx,[si+BlockCount]
jcxz DoNextBlock ;No data in this block
IntraBlockLoop:
cmp [di+ID],ax ;Do we have an ID match?
jnz NoMatch ;No match
inc bx ;We have a match; IDMatchCount++;
add dx,[di+Value] ;IDMatchSum += DataPointer->Value;
NoMatch:
add di,DATA_ELEMENT_SIZE ;point to the next element
loop IntraBlockLoop
; Point to the next block and continue if that pointer isnt NULL.
DoNextBlock:
mov si,[si+NextBlock] ;Get pointer to the next block
and si,si ;Is it a NULL pointer?
jnz BlockLoop ;No, continue
; Calculate the average of all matches.
sub ax,ax ;Assume we found no matches
and bx,bx
jz Done ;We didnt find any matches, return 0
xchg ax,dx ;Prepare for division
div bx ;Return IDMatchSum / IDMatchCount
Done: pop si ;Restore C register variables
pop di
pop bp ;Restore callers stack frame
ret
_FindIDAverage ENDP
end
Listing 8.4 tosses some sophisticated optimization techniques into the mix. The loop is unrolled eight times, eliminating a good deal of branching, and SCASW is used instead of CMP [DI],AX. (Note, however, that SCASW is in fact slower than CMP [DI],AX on the 386 and 486, and is sometimes faster on the 286 and 8088 only because its shorter and therefore may prefetch faster.) This advanced tweaking produces a 39 percent improvement over the original C codesubstantial, but not a tremendous return for the optimization effort invested.
LISTING 8.4 L8-4.ASM
; Heavily optimized assembly language version of FindIDAverage.
; Features an unrolled loop and more efficient pointer use.
SearchedForID equ 4 ;Passed parameter offsets in the
BlockPointer equ 6 ; stack frame (skip over pushed BP
; and the return address)
NextBlock equ 0 ;Field offsets in struct BlockHeader
BlockCount equ 2
BLOCK_HEADER_SIZE equ 4 ;Number of bytes in struct BlockHeader
ID equ 0 ;struct DataElement field offsets
Value equ 2
DATA_ELEMENT_SIZE equ 4 ;Number of bytes in struct DataElement
.model small
.code
public _FindIDAverage
_FindIDAverage proc near
push bp ;Save callers stack frame
mov bp,sp ;Point to our stack frame
push di ;Preserve C register variables
push si
mov di,ds ;Prepare for SCASW
mov es,di
cld
sub dx,dx ;IDMatchSum = 0
mov bx,dx ;IDMatchCount = 0
mov si,[bp+BlockPointer] ;Pointer to first block
mov ax,[bp+SearchedForID] ;ID were looking for
; Search through all of the linked blocks until the last block
; (marked with a NULL pointer to the next block) has been searched.
BlockLoop:
; Point to the first DataElement entry within this block.
lea di,[si+BLOCK_HEADER_SIZE]
; Search through all the DataElement entries within this block
; and accumulate data from all that match the desired ID.
mov cx,[si+BlockCount] ;Number of elements in this block
jcxz DoNextBlock ;Skip this block if its empty
mov bp,cx ;***stack frame no longer available***
add cx,7
shr cx,1 ;Number of repetitions of the unrolled
shr cx,1 ; loop = (BlockCount + 7) / 8
shr cx,1
and bp,7 ;Generate the entry point for the
shl bp,1 ; first, possibly partial pass through
jmp cs:[LoopEntryTable+bp] ; the unrolled loop and
; vector to that entry point
align 2
LoopEntryTable label word
dw LoopEntry8,LoopEntry1,LoopEntry2,LoopEntry3
dw LoopEntry4,LoopEntry5,LoopEntry6,LoopEntry7
M_IBL macro P1
local NoMatch
LoopEntry&P1&:
scasw ;Do we have an ID match?
jnz NoMatch ;No match
;We have a match
inc bx ;IDMatchCount++;
add dx,[di] ;IDMatchSum += DataPointer->Value;
NoMatch:
add di,DATA_ELEMENT_SIZE-2 ;point to the next element
; (SCASW advanced 2 bytes already)
endm
align 2
IntraBlockLoop:
M_IBL 8
M_IBL 7
M_IBL 6
M_IBL 5
M_IBL 4
M_IBL 3
M_IBL 2
M_IBL 1
loop IntraBlockLoop
; Point to the next block and continue if that pointer isnt NULL.
DoNextBlock:
mov si,[si+NextBlock] ;Get pointer to the next block
and si,si ;Is it a NULL pointer?
jnz BlockLoop ;No, continue
; Calculate the average of all matches.
sub ax,ax ;Assume we found no matches
and bx,bx
jz Done ;We didnt find any matches, return 0
xchg ax,dx ;Prepare for division
div bx ;Return IDMatchSum / IDMatchCount
Done: pop si ;Restore C register variables
pop di
pop bp ;Restore callers stack frame
ret
_FindIDAverage ENDP
end
| Previous | Table of Contents | Next |