Paste: multithreading

Author: pruned
Mode: factor
Date: Sun, 31 Oct 2010 09:07:26
Plain Text |
USING: accessors alien alien.c-types alien.data alien.syntax
arrays assocs classes.struct combinators compiler.codegen.fixup
compiler.constants cpu.x86 cpu.x86.assembler
cpu.x86.assembler.operands destructors generalizations grouping
hashtables io.binary kernel libc literals locals
math.vectors.simd multiline namespaces prettyprint 
sequences specialized-arrays tools.time 
windows.kernel32 windows.errors alien.accessors utils 
cpu.architecture ;
EXCLUDE: math => float ;
SPECIALIZED-ARRAYS: float double void* uint ;
IN: mt

<PRIVATE

: CreateThread ( lpThreadAttributes dwStackSize lpStartAddress lpParameter dwCreationFlags lpThreadId -- hThread ) 
    [ GetCurrentProcess ] 6 ndip CreateRemoteThread ;

! PAGE_EXECUTE_READWRITE

:: create-thread ( lpStartAddress lpParameter -- hThread )
    f 0 lpStartAddress lpParameter 0 f CreateThread ;

:: wait-threads ( count handles -- )
    count handles t >c-bool INFINITE WaitForMultipleObjects WAIT_FAILED = [ win32-error ] when ;
 
: virtual-alloc ( size -- alien )
    [ GetCurrentProcess f ] dip
    flags{ MEM_COMMIT MEM_RESERVE }
    PAGE_EXECUTE_READWRITE
    VirtualAllocEx ;

: virtual-free ( alien -- )
    [ GetCurrentProcess ] dip
    0
    MEM_RELEASE
    VirtualFreeEx win32-error=0/f ;

PRIVATE>

! %bug?: doesn't seem to work if you yield around !

: par-exec ( data/fptr -- )
!     [ first2 create-thread dup <win32-handle> &dispose ] map
    [ first2 swap create-thread ] map
    [ length ] [ >void*-array wait-threads ] 
    [ [ CloseHandle win32-error=0/f ] each ] tri ;

: par-each ( data fptr -- )
    [ 2array ] curry map par-exec ;

<PRIVATE

TUPLE: code-repository
    { memory-block }
    { ptr }
    { size }
    { table } ;

: <code-repository> ( -- cr )
    f 0 0 H{ } clone code-repository boa ;

: alloc-block ( size -- alien ) virtual-alloc ;
: free-block ( alien -- ) virtual-free ;

:: increase-capacity ( cr -- )
    cr size>> 0 = 
    [
        1024 alloc-block cr memory-block<<
        1024 cr size<<
    ]
    [
        cr size>> 2 * cr size<<
        cr memory-block>> free-block
        cr size>> alloc-block cr memory-block<<
    ] if ;

:: allocate ( code cr -- alien )
    [ cr ptr>> code length + cr size>> > ] [ cr increase-capacity ] while
    cr ptr>> cr memory-block>> <displaced-alien>
    [ code code length memcpy ] keep
    cr ptr>> code length + cr ptr<< ;

: flush-code-repository ( -- )
    code-repository get-global
    {
        [ memory-block>> dup dup expired? not and [ free-block ] [ drop ] if ]
        [ f swap memory-block<< ]
        [ 0 swap ptr<< ]
        [ 0 swap size<< ]
        [ table>> values [ f swap set-second ] each ]
    }
    cleave ;

: revive ( cr -- cr )
    dup memory-block>> expired? [ flush-code-repository ] when ;

PRIVATE>

: get-func ( name -- fptr )
    code-repository get-global revive
    table>> at
    dup second [ second ] 
    [ [ first call( -- fptr ) code-repository get-global allocate dup ] keep set-second ] if ;

<PRIVATE

! cleanup
code-repository get-global [ flush-code-repository ] when
<code-repository> code-repository set-global

: register ( quot name -- ) [ f 2array ] dip code-repository get-global table>> set-at ;

:: link-jumps ( code -- binary )
    3 code nth 3 group :> labels
    code last :> binary
    labels 
    [| l |
        l first rc-relative assert= ! rc-relative
        l third l second - <int>
        l second 4 -
        binary copy
    ] each 
    binary ;    

PRIVATE>

: gen-foo ( -- fptr )
    [ 
        EAX 666 MOV 
        0 RET
    ] with-fixup link-jumps ;
[ gen-foo ] "foo" register

:: gen-rmw ( -- fptr )
    { "ptr" }
    [ 4 * 4 + 2array ] map-index >hashtable :> args
    [
        EDX "ptr" args at stack@ MOV
        EAX EDX [] MOV
        EAX 1 ADD
        EDX [] EAX MOV
!        EAX EDX MOV
        0 RET
    ] with-fixup link-jumps ;
[ gen-rmw ] "rmw" register

:: gen-barrier ( -- fptr )
    { "ptr" }
    [ 4 * 4 + 2array ] map-index >hashtable :> args
    [
        EDX "ptr" args at stack@ MOV
        
        MFENCE
        LOCK EDX [] DEC
        
        <label> dup resolve-label :> loop
        
            PAUSE
            EAX EDX [] MOV
            
        EAX 0 CMP
        loop JNE
                
        0 RET
    ] with-fixup link-jumps ;
[ gen-barrier ] "barrier" register


:: gen-rdtsc ( -- fptr )
    { "ptr" }
    [ 4 * 4 + 2array ] map-index >hashtable :> args
    [
        ECX "ptr" args at stack@ MOV
        RDTSC
        ECX 0 [+] EAX MOV
        ECX 4 [+] EDX MOV        
        
        0 RET
    ] with-fixup link-jumps ;
[ gen-rdtsc ] "rdtsc" register
: rdtsc-test ( struct -- ) "rdtsc" get-func void { pointer: longlong } cdecl alien-indirect ;

STRUCT: scheduler-data
    { buffer int* }
    { start int }
    { end int }
    ;
   
:: gen-scheduler ( -- fptr )
    { "buffer" "start" "end" }
    [ 4 * EAX swap [+] 2array ] map-index >hashtable :> struct
    [
        EBP PUSH
        EBP ESP MOV
        ESP 16 SUB

        0 stack@ EBX MOV
        4 stack@ ESI MOV
        8 stack@ EDI MOV

        EAX EBP 2 4 * [+] MOV
        ESI "buffer" struct at MOV
        EBX "start" struct at MOV
        EDI "end" struct at MOV
        
        <label> dup resolve-label :> loop
        
            EAX ESI EBX 0 [+*8+] MOV
            ECX ESI EBX 4 [+*8+] MOV
            EAX PUSH
            ECX CALL        ! cdecl or stdcall
            ESP EBP MOV
            ESP 16 SUB      ! restore stack regardless
            
        EBX 1 ADD
        EBX EDI CMP
        loop JL
        
        EBX 0 stack@ MOV
        ESI 4 stack@ MOV
        EDI 8 stack@ MOV
        
        ESP EBP MOV
        EBP POP
        0 RET
    ] with-fixup link-jumps ;
[ gen-scheduler ] "scheduler" register
: scheduler ( struct -- ) "scheduler" get-func void { pointer: scheduler-data } cdecl alien-indirect ;

:: sched ( work -- )
    [
        work
        [
            [ [ alien-address ] map ] map
            [ concat >uint-array malloc-copy &free 0 ]
            [ length ] bi
            scheduler-data <struct-boa> malloc-copy &free
        ] map "scheduler" get-func par-each
    ] with-destructors ;
    
: sched-test ( seq -- seq' )
    [
        [ >uint-array malloc-copy &free alien-address ] 
        [ length iota [ 4 * + ] with map ] bi
        [
            [ "rmw" get-func 2array ] map
            2 group sched
        ] keep [ <alien> *int ] map
    ] with-destructors ;
    
:: barrier-test ( k th -- time )
    [
        k 4 * 256 + malloc &free alien-address 256 align <alien>
        k iota [ 4 * swap <displaced-alien> ] with map :> barriers
        barriers [ th swap 0 set-alien-unsigned-cell ] each
        th [ barriers [ "barrier" get-func 2array ] map ] replicate
        [ sched ] benchmark
    ] with-destructors ;
        
:: par-each-bench ( data k fptr -- time )
    [
        data length :> th
        k 4 * 256 + malloc &free alien-address 256 align <alien>
        k iota [ 4 * swap <displaced-alien> ] with map :> barriers
        barriers [ th swap 0 set-alien-unsigned-cell ] each

        k th * 2 * 8 * 256 + malloc &free alien-address 256 align <alien>
        k th * 2 * iota [ 8 * swap <displaced-alien> ] with map 
        2 group k group :> counters
        
        data 
        counters
        [| param thread-counters |
            barriers
            thread-counters
            [| barrier counter-ab |
                counter-ab first "rdtsc" get-func 2array
                barrier "barrier" get-func 2array
                param fptr 2array
                counter-ab second "rdtsc" get-func 2array
                4array
                ! second 1array ! no barriers
            ] 2map concat
        ] 2map
        [ sched ] benchmark k / :> time!
        counters [ [ [ *longlong ] map ] map ] map :> times

        times flip [ [ first2 swap - ] map ] map
        [ supremum ] map but-last ! remove last time, as there is no barrier
        infimum 2.673335 / 
        drop
        ! time!
        
        time
    ] with-destructors ;
                
STRUCT: memtest-data
    { base float* }
    { start int }
    { length int }
    { vec4 float* }
    ;

:: gen-memtest ( -- fptr )
    { "struct" }
    [ 4 * 16 + 2array ] map-index >hashtable :> args
    [
        EBX PUSH
        ESI PUSH
        EDI PUSH

        EAX "struct" args at stack@ MOV
        ESI EAX 0 [+] MOV ! base
        ESI EAX 4 [+] ADD ! start
        EDI EAX 12 [+] MOV ! start
        
        ECX 0 MOV
        EDX EAX 8 [+] MOV ! length
        
        XMM1 EDI 0 [+] MOVDQA ! vec4 load
        XMM1 XMM1 XORPS
        
        <label> dup resolve-label :> Xloop
        XMM0 ESI ECX [+] MOVDQA
        XMM1 XMM0 PADDD

        XMM0 ESI ECX 16 [++] MOVDQA
        XMM1 XMM0 PADDD

        XMM0 ESI ECX 32 [++] MOVDQA
        XMM1 XMM0 PADDD

        XMM0 ESI ECX 48 [++] MOVDQA
        XMM1 XMM0 PADDD
        
        ECX 64 ADD
        ECX EDX CMP
        Xloop JL


        EDI 0 [+] XMM1 MOVDQA ! vec4 store

        EDI POP
        ESI POP
        EBX POP
        0 RET
    ] with-fixup link-jumps ;
[ gen-memtest ] "memtest" register
: memtest ( struct -- ) "memtest" get-func void { pointer: memtest-data } cdecl alien-indirect ;

:: bw-test ( mb loops threads -- time )
    [
        threads 
        [ 
            mb 1024 * aligned-malloc&free
            0
            mb 1024 *
            16 aligned-malloc&free
            memtest-data <struct-boa> malloc-copy &free
        ] replicate

        loops "memtest" get-func par-each-bench
        1000 1000 * /f
    ] with-destructors ;


! 64 256 8 [ bw-test recip ] 3keep * * * .

New Annotation

Summary:
Author:
Mode:
Body: