Paste: loop

Author: pruned
Mode: javascript
Date: Sat, 29 Oct 2011 19:58:22
Plain Text |
// fft_tw_2xN_alt: in-place pairwise update of y.
//
// For every k in range(n/2) and every j in range(m) the pair
//     t = y[k + j*n]          s = y[k + n/2 + j*n]
// is replaced by
//     y[k + j*n]       <- t + w*s
//     y[k + n/2 + j*n] <- t - w*s
// with w = _table_w[k]. This is the shape of a radix-2 butterfly with w as
// the twiddle factor.
//
// Parameters:
//   _y       - indexable data; read through y[i], written through y.st[i]
//   n        - pair partners are n/2 elements apart; consecutive j are n apart
//   m        - number of j iterations per k
//   _table_w - indexable table of multipliers, one entry per k
//
// No value is returned explicitly.
// NOTE(review): element types of _y and _table_w are not visible here
// (presumably complex values) - confirm against the caller.
fft_tw_2xN_alt(_y, n, m, _table_w)
{
    var y = _y;
    // NOTE(review): table_w is assigned but never read in this version;
    // the loop below indexes _table_w directly.
    var table_w = begin(_table_w);
    // The "*1i64" factor presumably forces the loop bound to a 64-bit integer - TODO confirm.
    for (k in range(n/2*1i64))
    {
        for (j in range(m*1i64))
        {
            //breakpoint();
            var t = y[k+j*n];        // first element of the pair
            var s = y[k+n/2+j*n];    // its partner, n/2 further on
            var w_k = _table_w[k];   // depends only on k, yet is fetched inside the j loop
            var sw = w_k * s;        // multiplier on the left, data on the right
            // Both results are written through y.st using "<=".
            // NOTE(review): the exact semantics of "y.st[...] <= v" are not visible here; it appears to act as a store.
            y.st[k+j*n] <= t+sw;
            y.st[k+n/2+j*n] <= t-sw;
        }
    }
}

Annotation: more loop

Author: pruned
Mode: javascript
Date: Sat, 29 Oct 2011 20:08:23
Plain Text |
// fft_tw_2xN_alt: in-place pairwise update of y.
//
// For every k in range(n/2) and every j in range(m):
//     t = y[k + j*n]
//     s = table_w[k] * y[k + n/2 + j*n]
//     y[k + j*n]       <- t + s
//     y[k + n/2 + j*n] <- t - s
// This is the shape of a radix-2 butterfly with table_w[k] as the twiddle
// factor.
//
// Parameters:
//   _y       - indexable data; read through y[i], written through y.st[i]
//   n        - pair partners are n/2 elements apart; consecutive j are n apart
//   m        - number of j iterations per k
//   _table_w - source of the multiplier table; accessed via begin(_table_w)
//
// No value is returned explicitly.
// NOTE(review): element types of _y and _table_w are not visible here
// (presumably complex values) - confirm against the caller.
fft_tw_2xN_alt(_y, n, m, _table_w)
{
    var y = _y;
    // table_w is what the loop indexes; begin() presumably yields a
    // pointer/iterator to the first table entry - TODO confirm.
    var table_w = begin(_table_w);
    // The "*1i64" factor presumably forces the loop bound to a 64-bit integer - TODO confirm.
    for (k in range(n/2*1i64))
    {
        for (j in range(m*1i64))
        {
            //breakpoint();
            var t = y[k+j*n];   // first element of the pair
            // Operand order of the product matters for speed: the author measured
            // the data-times-table form (kept below for reference) as 8% slower
            // than the table-times-data form that is actually used.
            //var s = y[k+n/2+j*n] * table_w[k]; // 8% slower
            var s = table_w[k] * y[k+n/2+j*n];
            // Both results are written through y.st using "<=".
            // NOTE(review): the exact semantics of "y.st[...] <= v" are not visible here; it appears to act as a store.
            y.st[k+j*n] <= t + s;
            y.st[k+n/2+j*n] <= t - s;
        }
    }
}

Annotation: assembly loops

Author: pruned
Mode: assembly-x86
Date: Sat, 29 Oct 2011 20:26:19
Plain Text |
// Disassembly of a scalar single-precision loop body (movss/mulss/addss/subss).
// In the notes below a bare digit N means register xmmN.
// Per iteration it loads four floats at [rdx+rbp], [r9+rbp], [rbx+rbp], [r11+rbp]
// and two at [r15-4], [r15], then stores four results back to the same
// rbp-indexed addresses.
// NOTE(review): presumably this is the inner j loop of fft_tw_2xN_alt, with
// (1, 2) the two components of the multiplier and (4, 0) those of the data
// element it multiplies - TODO confirm.
// The branch target 00402760 lies before the first instruction listed here,
// so the head of the loop is not shown.
00000000`00402762 f3410f10042b    movss   xmm0,dword ptr [r11+rbp] // 0 = load [r11+rbp]
00000000`00402768 f3410f104ffc    movss   xmm1,dword ptr [r15-4] // 1 = load [r15-4]
00000000`0040276e f3410f1017      movss   xmm2,dword ptr [r15] // 2 = load [r15]
00000000`00402773 0f28d8          movaps  xmm3,xmm0 // 3 = 0 (copy)
00000000`00402776 f30f59da        mulss   xmm3,xmm2 // 3 = 0*2
00000000`0040277a f30f10242b      movss   xmm4,dword ptr [rbx+rbp] // 4 = load [rbx+rbp]
00000000`0040277f 0f28ec          movaps  xmm5,xmm4 // 5 = 4 (copy)
00000000`00402782 f30f59e9        mulss   xmm5,xmm1 // 5 = 4*1
00000000`00402786 f30f5ceb        subss   xmm5,xmm3 // 5 = 4*1 - 0*2
00000000`0040278a f30f101c2a      movss   xmm3,dword ptr [rdx+rbp] // 3 = load [rdx+rbp]
00000000`0040278f 0f28f3          movaps  xmm6,xmm3 // 6 = 3 (copy)
00000000`00402792 f30f58f5        addss   xmm6,xmm5 // 6 = 3+5
00000000`00402796 f3410f103c29    movss   xmm7,dword ptr [r9+rbp] // 7 = load [r9+rbp]
00000000`0040279c f30f11342a      movss   dword ptr [rdx+rbp],xmm6 // store 6 to [rdx+rbp]
00000000`004027a1 f30f59c1        mulss   xmm0,xmm1 // 0 = 0*1
00000000`004027a5 f30f59e2        mulss   xmm4,xmm2 // 4 = 4*2
00000000`004027a9 f30f58e0        addss   xmm4,xmm0 // 4 = 4*2 + 0*1
00000000`004027ad 0f28c7          movaps  xmm0,xmm7 // 0 = 7 (copy)
00000000`004027b0 f30f58c4        addss   xmm0,xmm4 // 0 = 7+4
00000000`004027b4 f3410f110429    movss   dword ptr [r9+rbp],xmm0 // store 0 to [r9+rbp]
00000000`004027ba f30f5cdd        subss   xmm3,xmm5 // 3 = 3-5
00000000`004027be f30f111c2b      movss   dword ptr [rbx+rbp],xmm3 // store 3 to [rbx+rbp]
00000000`004027c3 f30f5cfc        subss   xmm7,xmm4 // 7 = 7-4
00000000`004027c7 f3410f113c2b    movss   dword ptr [r11+rbp],xmm7 // store 7 to [r11+rbp]
00000000`004027cd 4c01c5          add     rbp,r8 // advance the shared offset by r8
00000000`004027d0 49ffcd          dec     r13 // r13 counts remaining iterations
00000000`004027d3 758b            jne     main+0x2760 (00000000`00402760) // repeat while r13 != 0

With the multiply's operands swapped to table_w[k] * y[k+n/2+j*n], the same loop is now 8% faster:



// Disassembly of a scalar single-precision loop body (movss/mulss/addss/subss).
// In the notes below a bare digit N means register xmmN.
// Per iteration it loads four floats at [rdx+rbp], [r9+rbp], [rbx+rbp], [r11+rbp]
// and two at [r15-4], [r15], then stores four results back to the same
// rbp-indexed addresses.
// In this listing the products accumulate into the registers loaded from
// [r15-4] and [r15] (1, 2, and their copies 3, 5); the values loaded from the
// rbp-indexed addresses (0, 4) are only ever used as the second multiplicand.
// NOTE(review): presumably this is the inner j loop of fft_tw_2xN_alt, with
// (1, 2) the two components of the multiplier and (4, 0) those of the data
// element it multiplies - TODO confirm.
// The branch target 00402760 lies before the first instruction listed here,
// so the head of the loop is not shown.
00000000`00402762 f3410f10042b    movss   xmm0,dword ptr [r11+rbp] // 0 = load [r11+rbp]
00000000`00402768 f3410f104ffc    movss   xmm1,dword ptr [r15-4] // 1 = load [r15-4]
00000000`0040276e f3410f1017      movss   xmm2,dword ptr [r15] // 2 = load [r15]
00000000`00402773 0f28da          movaps  xmm3,xmm2 // 3 = 2 (copy)
00000000`00402776 f30f59d8        mulss   xmm3,xmm0 // 3 = 2*0
00000000`0040277a f30f10242b      movss   xmm4,dword ptr [rbx+rbp] // 4 = load [rbx+rbp]
00000000`0040277f 0f28e9          movaps  xmm5,xmm1 // 5 = 1 (copy)
00000000`00402782 f30f59ec        mulss   xmm5,xmm4 // 5 = 1*4
00000000`00402786 f30f5ceb        subss   xmm5,xmm3 // 5 = 1*4 - 2*0
00000000`0040278a f30f101c2a      movss   xmm3,dword ptr [rdx+rbp] // 3 = load [rdx+rbp]
00000000`0040278f 0f28f3          movaps  xmm6,xmm3   // 6 = 3 (copy)
00000000`00402792 f30f58f5        addss   xmm6,xmm5 // 6 = 3+5
00000000`00402796 f3410f103c29    movss   xmm7,dword ptr [r9+rbp] // 7 = load [r9+rbp]
00000000`0040279c f30f11342a      movss   dword ptr [rdx+rbp],xmm6 // store 6 to [rdx+rbp]
00000000`004027a1 f30f59d4        mulss   xmm2,xmm4 // 2 = 2*4
00000000`004027a5 f30f59c8        mulss   xmm1,xmm0 // 1 = 1*0
00000000`004027a9 f30f58ca        addss   xmm1,xmm2 // 1 = 1*0 + 2*4
00000000`004027ad 0f28c7          movaps  xmm0,xmm7 // 0 = 7 (copy)
00000000`004027b0 f30f58c1        addss   xmm0,xmm1 // 0 = 7+1
00000000`004027b4 f3410f110429    movss   dword ptr [r9+rbp],xmm0 // store 0 to [r9+rbp]
00000000`004027ba f30f5cdd        subss   xmm3,xmm5 // 3 = 3-5
00000000`004027be f30f111c2b      movss   dword ptr [rbx+rbp],xmm3 // store 3 to [rbx+rbp]
00000000`004027c3 f30f5cf9        subss   xmm7,xmm1 // 7 = 7-1
00000000`004027c7 f3410f113c2b    movss   dword ptr [r11+rbp],xmm7 // store 7 to [r11+rbp]
00000000`004027cd 4c01c5          add     rbp,r8 // advance the shared offset by r8
00000000`004027d0 49ffcd          dec     r13 // r13 counts remaining iterations
00000000`004027d3 758b            jne     main+0x2760 (00000000`00402760) // repeat while r13 != 0

New Annotation

Summary:
Author:
Mode:
Body: