• This is the x64 assembly-language program I am timing and charting
  • Use flatassembler.net to assemble.
  • taken from my code in https://github.com/tallpeak/euler92
  • See last cell for the source
In [2]:
open System
open System.Diagnostics
open System.Text // StringBuilder
In [3]:
// Run the benchmark executable with its standard streams redirected and
// capture everything it writes to stdout in `sout` (stderr ends up in `serr`).
let psi = new System.Diagnostics.ProcessStartInfo()
psi.FileName <- @"C:\Users\aawest\Downloads\FASM\time_fbcd.exe"
psi.RedirectStandardOutput <- true
psi.RedirectStandardError <- true
psi.RedirectStandardInput <- true
psi.UseShellExecute <- false   // stream redirection requires UseShellExecute = false
psi.CreateNoWindow <- false
let p = System.Diagnostics.Process.Start(psi)
// Nothing is ever sent to the child; closing stdin makes any read it attempts
// see end-of-file instead of blocking forever.
p.StandardInput.Close()
// Drain stderr concurrently. A redirected stream that nobody reads can fill
// its pipe buffer and stall the child, which would in turn stall us.
let stderrTask = p.StandardError.ReadToEndAsync()
let out = new StringBuilder()
// ReadToEnd blocks until the child closes stdout, so a single call collects
// the whole output; polling HasExited around it adds nothing.
out.Append(p.StandardOutput.ReadToEnd()) |> ignore
p.WaitForExit()
let serr = stderrTask.Result
let sout = out.ToString()
In [4]:
// Split the captured stdout on CRLF and lazily drop the first two entries
// (the benchmark prints a two-line header before its timing rows).
let lines =
    let allLines = sout.Split([|"\r\n"|], StringSplitOptions.None)
    Seq.skip 2 allLines
In [53]:
/// True for tokens of at most three characters. The timing columns are printed
/// with "%3d", while the converted numbers next to them are much longer, so
/// this separates the cycle counts from the converted output.
let stringIsShort (s:string) = s.Length <= 3

/// Extracts the timing values from one output row.
/// Splits the row on spaces, keeps only the short tokens and converts each one
/// to a double. Short tokens that are not numbers (stray text such as header
/// words) are skipped instead of raising FormatException, so a single odd line
/// cannot abort the whole pipeline.
let lineparser (line:string) : double array =
    line.Split([|' '|], StringSplitOptions.RemoveEmptyEntries)
    |> Array.filter stringIsShort
    |> Array.choose (fun (token: string) ->
        match Double.TryParse(token) with
        | true, value -> Some value
        | _ -> None)
In [79]:
/// Pairs a sample index with its value; used as the projection for Seq.mapi.
let makeTuple (i:int) (x:Double) = i, x
// Parsed timing rows: one double array per output line, keeping only the lines
// that yielded exactly four values. Lazy, so it is re-evaluated on each pass.
let datas =
    seq {
        for line in lines do
            let row = lineparser line
            if row.Length = 4 then
                yield row
    }
// movingAvg (after "Professional F# 2.0"): slides a window of `period`
// consecutive items over `data` and yields the mean of each window, so the
// result has (length - period + 1) items, or none if data is shorter.
let movingAvg period data =
    data
    |> Seq.windowed period
    |> Seq.map (fun window -> Array.average window)
/// Builds one chart series: takes column `i` of every row in `sqad`, smooths it
/// with a 10-sample moving average and returns (index, average) pairs.
let transform (sqad:double array seq) (i:int) =
    let column = Seq.map (fun (row: double array) -> row.[i]) sqad
    let smoothed = movingAvg 10 column
    smoothed
    |> Seq.mapi (fun idx avg -> makeTuple idx avg)
    |> Seq.toArray
// One smoothed series per timing column, in the order the columns appear in
// each parsed row.
let data0, data1, data2, data3 =
    transform datas 0, transform datas 1, transform datas 2, transform datas 3
In [80]:
// Overlay the four smoothed timing series as line charts in one 800x600 plot.
Chart.Combine [ for series in [data0; data1; data2; data3] -> Chart.Line(series) ]
|> Chart.WithSize(800,600)
|> Display

Done! (Source code is below)

;; time_fbcd.asm
;; By Aaron W. West 2013-12-20 tallpeak@hotmail.com

ITERATIONS = 10000

; Typical timings, in cycles as reported by RDTSC:
;   fbstp    : consistently around 216, regardless of the size of the number
;              (not including the printf %x conversion)
;   div loop : around 230 for 9 digits, 460+ for 18 digits
;   utoa_mul : around 42 to 76 for 9 to 18 digits
;   _itoa    : around 230+ for 9 digits, similar to the div loop
;
format PE64 console
entry start

include 'INCLUDE\WIN64a.INC' ; win64a?
;include 'C:\dev\fasmw\INCLUDE\API\KERNEL32.INC' ; win64a?
;SetConsoleWindowInfo
;SetConsoleScreenBufferSize
; timer_save timer
;   Reads the time-stamp counter and stores it as a single 64-bit value in the
;   qword variable named by `timer`. Overwrites rax and rdx.
macro timer_save timer {
    RDTSC               ; counter comes back split: high half in edx, low half in eax
    shl rdx,32          ; move the high half up into bits 32..63
    ;mov eax,eax     ; zero top 32 bits
    or  rax,rdx         ; rax = full 64-bit counter value
    mov qword[timer],rax ; kept in memory rather than on the stack, in case save/elapsed are not paired properly
}

; timer_elapsed timer
;   Reads the time-stamp counter again and leaves in rax the difference from
;   the value timer_save stored in the qword variable `timer`.
;   Overwrites rax and rdx.
macro timer_elapsed timer {
    RDTSC               ; counter comes back split: high half in edx, low half in eax
    shl rdx,32          ; move the high half up into bits 32..63
    ;mov eax,eax     ; this seems unnecessary
    or  rax,rdx         ; rax = full 64-bit counter value
    sub rax,qword[timer] ; rax = ticks elapsed since timer_save
}

UPPERLIMIT = 10000000        ; add up to two 0s; you can compute up to a billion

section '.text' code readable executable

  start:
    ; Entry point. Prints a two-line header, then runs ITERATIONS rounds; each
    ; round times four integer-to-decimal conversions with RDTSC (fbstp,
    ; utoa_mul, utoa_div and msvcrt _itoa) and prints one row with each timing
    ; followed by the converted text.
    ;
    ; The Win64 calling convention wants rsp 16-byte aligned at every call and
    ; 32 bytes of shadow space owned by the caller. The entry point is reached
    ; with rsp 8 bytes off alignment, so reserving 8*5 bytes both realigns the
    ; stack and provides the shadow space for the hand-written `call [...]`
    ; sites below (`invoke` reserves its own space but assumes rsp is aligned).
    sub     rsp,8*5     ; reserve stack for API use and make stack dqword aligned
    ;http://board.flatassembler.net/topic.php?t=1953
    invoke GetStdHandle, STD_OUTPUT_HANDLE
    mov [hStdOut], rax

    ;stdcall WINDOW, [hStdOut], 80,50
    ;mov rcx,[hStdOut]
    ;mov rdx,80
    ;mov r8,50
    ;call WINDOW

    ;invoke SetConsoleScreenBufferSize, [handle], dword [coord]
    ;test eax, eax
    ;jz .error


    mov [counter],ITERATIONS     ; iterations to run (one line per iteration)
    lea rcx,[headerline]     ; a header so that people can see what I'm timing
    call [printf]
loop1:

; first time floating point bcd store and pop instruction
    timer_save T0

    mov rax,123456789 ;012345678 ; 18 digit max
    mov qword[ildvalue],rax
    fild qword[ildvalue]        ; load the integer onto the x87 stack
    fbstp tword[ildvalue]       ; store it back as 10-byte packed BCD and pop

    timer_elapsed T0

    lea rcx,[formattime_hex]
    mov rdx,rax                 ; elapsed ticks
    movzx r8,word[ildvalue+8]   ; top two bytes of the packed-BCD result
    mov r9,qword[ildvalue]      ; low eight bytes of the packed-BCD result
    call [printf]

; now time my utoa with multiply by reciprocal method

    timer_save T0

    mov rax,123456789
    lea rdi, [utoabuf+23]       ; digits are written backwards from the buffer end
    ;call donothing ; 18 cycles
    call utoa_mul ; 45 cycles , so 27 really

    timer_elapsed T0

    ;lea rcx,[formattime_string]
    ;mov rdx,rax
    ;mov r8,rdi
    ;call [printf]
    invoke printf,formattime_string,rax,rdi   ; rdi = first digit written by utoa_mul

; now time utoa with division method
    timer_save T0

    mov rax,123456789
    lea rdi, [utoabuf+23]
    call utoa_div

    timer_elapsed T0

    lea rcx,[formattime_string]
    mov rdx,rax
    mov r8,rdi                  ; rdi = first digit written by utoa_div
    call [printf]

; now time msvcrt _itoa
    timer_save T0

    mov rcx,123456787
    lea rdx,[utoabuf]
    mov r8,10 ; number base
    call [_itoa]

    timer_elapsed T0

    ;lea rcx,[formattime_string]
    ;mov rdx,rax
    ;lea r8,[utoabuf]
    ;call [printf]
    invoke printf,formattime_string,rax,utoabuf

    lea rcx,[newline]
    call [printf]


    dec [counter]
    jnz loop1

;     invoke printf,press_enter
;     call [getchar]
    xor ecx,ecx         ; exit code 0 (rcx previously held whatever printf left in it)
    call [ExitProcess]
    add rsp,8*5         ; not reached: ExitProcess does not return
    retq

; utoa_div - unsigned integer to decimal ASCII, one `div` by 10 per digit.
;   In:   rax = value to convert
;         rdi = address that receives the terminating NUL; the digits are
;               written backwards into the bytes just below it
;   Out:  rdi = address of the first (most significant) digit
;   Overwrites rax, rbx, rcx and rdx. Always emits at least one digit, so a
;   value of 0 produces "0".
utoa_div:
    mov rbx,rax ; digits in rbx
    mov rcx, 10                 ; divisor
    mov [rdi],byte 0            ; string terminator
u2a1:
    dec rdi                     ; step back to the slot for the next digit
    mov rax, rbx
    xor rdx, rdx                ; dividend is rdx:rax, so clear the high half
    div rcx                     ; rax = quotient, rdx = remainder (0..9)
    mov rbx, rax                ; keep the quotient for the next round
    add rdx,'0'                 ; remainder -> ASCII digit
    mov [rdi], dl
    or  rbx,rbx                 ; sets ZF once the quotient reaches zero
    jnz u2a1
    retq

; Empty routine that returns immediately. The commented-out `call donothing`
; in the utoa_mul timing block uses it to measure bare call/return overhead.
donothing: retq ; for timing the overhead

; multiplication-by-inverse version
; divmod10 can be implemented in two multiplies, so let's try it
;
; utoa_mul / utoa - unsigned integer to decimal ASCII without `div`.
;   In:   rax = value to convert
;         rdi = address that receives the terminating NUL; the digits are
;               written backwards into the bytes just below it
;   Out:  rdi = address of the first (most significant) digit
;   Overwrites rax, rbx, rcx, rdx and rsi. Always emits at least one digit.
;
;   Each round multiplies by a rounded-up 2^64/10: the high half of the
;   product is the quotient and the low half is the fractional part, which a
;   second multiply by 10 turns back into the remainder.
;   NOTE(review): because the reciprocal is rounded up, the quotient is only
;   exact while the accumulated error stays below one unit. By my arithmetic
;   that holds for inputs below 2^62, which fits the "18 digit max" remark at
;   the fbstp call site; confirm before feeding it full 64-bit values.
utoa_mul:
utoa:
    mov rbx, rax
    mov rcx, 0x199999999999999a  ;Text.Printf.printf"%x"$(2^64+9)`div`10  (Haskell expression that produces this constant)
    mov rsi, 10
    mov [rdi],byte 0             ; string terminator
u2am1:
    mov rax, rbx
    mul rcx      ; rax contains modulus in high bits, rdx = div
    mov rbx, rdx ; save div back to rbx
    mul rsi      ; now rdx = modulus (last digit, 0 to 9)
    or  rdx, '0' ; digit -> ASCII; OR equals ADD here because the digit fits in the low nibble
    dec rdi
    mov [rdi], dl
    or  rbx, rbx ; sets ZF once the quotient reaches zero
    jnz u2am1
    retq

;crashes:
; WINDOW - unfinished helper meant to resize the console window/buffer.
; Nothing calls it at the moment: the call sequence in `start` is commented
; out, and the note above records that it crashes when used.
;
; As written the code reads its arguments from registers, not from the
; stdcall-style parameters described in the original header below:
;   rcx = console output handle (copied to [handle])
;   rdx = columns, r8 = rows (only read in the unreachable part)
; Returns eax = 0 on success, -1 on failure.
;
; NOTE(review): likely reasons for the crash, to be confirmed:
;   * `coord` and `rect` are declared with `struct ... ends` in the data
;     section. With FASM's struct macro that presumably defines a layout
;     (field offsets) rather than reserving storage, so [coord.X] and
;     `addr rect.Left` may resolve to addresses near 0.
;   * The rect fields are declared as qwords; the Windows SMALL_RECT that
;     SetConsoleWindowInfo expects appears to use 16-bit fields.
;   * The code stores into [rect.Right]/[rect.Bottom] after `sbb edx, edx`
;     has overwritten rdx, and later does `mov rax, rdx` expecting the
;     column count to still be there.
WINDOW:
;------------------------------------------------------------
; 
;proc WINDOW uses rcx rdx, handle, x, y
;   Usage: 
;    stdcall WINDOW, output_handle, cols, rows 
;   Returns: 
;    EAX = zero on success, else -1. 
; 
;------------------------------------------------------------ 
    ;local coord COORD
    ;local rect SMALL_RECT

    ;cmp [x], MIN_COLS 
    ;jb .error 
    ;cmp [y], MIN_ROWS 
    ;jb .error 
    ;cmp [x], MAX_COLS 
    ;ja .error 
    ;cmp [y], MAX_ROWS 
    ;ja .error 

    mov r11,rcx
    or r11, r11 ; [handle]
    jz .error                   ; a zero handle is rejected
    mov [handle],r11

    ; Unconditional jump: everything from here down to the first `retq`
    ; (just before the live .skipestimate label) is currently unreachable.
    jmp .skipestimate
    ; Get the largest size we can size the console window to. 
    invoke GetLargestConsoleWindowSize, [handle]
    mov dword[coord.X], eax

    mov [rect.Left], 0 
    mov [rect.Top], 0 
    mov [rect.Right], 1 
    mov [rect.Bottom], 1 

    ; Set window size to 1,1 in order to set any buffer size. 
    invoke SetConsoleWindowInfo, [handle], TRUE, addr rect.Left
    test eax, eax 
    jz .error 

    ; rect.Right = min(x, coord.X) - 1) 
    mov rax, rdx
    movzx ecx, word [coord.X] 
    sub ecx, eax                ; ecx = coord.X - x, carry set when coord.X < x
    sbb edx, edx                ; edx = all ones if carry was set, otherwise 0
    and ecx, edx                ; keep the (negative) difference only in that case
    add eax, ecx                ; eax = min(x, coord.X)
    dec eax 
    mov [rect.Right], rax

    ; rect.Bottom = min(y, coord.Y) - 1) 
    mov rax, r8
    movzx ecx, word [coord.Y] 
    sub ecx, eax                ; same branch-free min as above, for rows
    sbb edx, edx 
    and ecx, edx 
    add eax, ecx 
    dec eax 
    mov [rect.Bottom], rax

    ; Define the new console buffer size. 
    mov rax, rdx
    mov [coord.X], ax
    mov rax, r8
    mov [coord.Y], ax

;.skipestimate:
    mov rcx,80
    mov rdx,50
    mov [coord.X],cx
    mov [coord.Y],dx
    ;mov r12,80*65536+25
    ; Set console screen buffer size.
    mov r12,80*25
    invoke SetConsoleScreenBufferSize, [handle], r12 ;dword [coord.X] ; [handle]=[handle]
    test eax, eax 
    jz .error 
    retq


.skipestimate:
    ; Set console screen buffer's window size and position.
    invoke SetConsoleWindowInfo, [handle], TRUE, addr rect.Left  ; [handle]=[handle]
    test eax, eax 
    jz .error 

    xor eax, eax                ; success
    retq

.error:
    mov eax, -1 
    retq
;endp


section '.data' data readable writeable
;; 20 bytes per number because (length $ show $ 2^64) == 20
  T0 dq 0                   ; RDTSC value stored by timer_save
  counter dq 100            ; loop counter; `start` overwrites it with ITERATIONS
  utoabuf rb 24             ; shared text buffer for utoa_mul, utoa_div and _itoa
  ildvalue dt 0.0           ; 10 bytes: integer input for fild, packed-BCD output of fbstp
  formatdecimal db '%d',9,0
  formatstring db '%s',9,0
  formattime_string db '%3d %10s  ',0
  ; Elapsed ticks, then the packed-BCD result as hex: top word followed by the
  ; low qword. The low qword is zero-padded to 16 hex digits so its leading
  ; zero digits are not lost when the top word is non-zero (unpadded,
  ; 100000000000000001 would have printed as "101").
  formattime_hex db '%3d %llx%016llx  ',0
  headerline db 'RDTSC clock timings and output for integer to decimal routines',10
        db 'Fbstp/output   utoa_mul/output  utoa_div/output  itoa/output',10,0
  newline db 10,0
  press_enter db 'Press enter to continue.',0
  _caption db 'Win64 assembly',0
  align 8
  hStdOut dq 0              ; handle returned by GetStdHandle in `start`
 handle dq 0                ; console handle stored by WINDOW
 ; Used only by the (currently uncalled) WINDOW routine.
 struct coord
      X dw ?
      Y dw ?
 ends
 struct rect
    Left dq 0
    Top dq 0
    Right dq 79
    Bottom dq 49
 ends


section '.idata' import data readable
  library kernel32, 'kernel32.dll', \
      msvcrt,   'msvcrt.dll'
  import kernel32, ExitProcess, 'ExitProcess', GetStdHandle, 'GetStdHandle', SetConsoleScreenBufferSize, 'SetConsoleScreenBufferSize', \
     SetConsoleWindowInfo, 'SetConsoleWindowInfo', GetLargestConsoleWindowSize, 'GetLargestConsoleWindowSize'
  import msvcrt, printf, 'printf', getchar, 'getchar', _itoa, '_itoa'

  kernel_name db 'KERNEL32.DLL',0
  user_name db 'USER32.DLL',0
;;EOF
In []: