diff options
author | Andy Polyakov <appro@openssl.org> | 2013-12-09 21:02:24 +0100 |
---|---|---|
committer | Andy Polyakov <appro@openssl.org> | 2013-12-09 21:02:24 +0100 |
commit | ec9cc70f72454b8d4a84247c86159613cee83b81 (patch) | |
tree | 504ec4eeaf5d13670389711f7bc01915f6a4a595 /crypto/bn/asm/x86_64-mont5.pl | |
parent | d1671f4f1a39d938499c67efe5d4a14c34c09b31 (diff) |
bn/asm/x86_64-mont5.pl: add MULX/AD*X code path.
This also eliminates code duplication between x86_64-mont and x86_64-mont
and optimizes even original non-MULX code.
Diffstat (limited to 'crypto/bn/asm/x86_64-mont5.pl')
-rwxr-xr-x | crypto/bn/asm/x86_64-mont5.pl | 2863 |
1 files changed, 2440 insertions, 423 deletions
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl index 93257376ac..265ee7d13a 100755 --- a/crypto/bn/asm/x86_64-mont5.pl +++ b/crypto/bn/asm/x86_64-mont5.pl @@ -17,6 +17,13 @@ # is implemented, so that scatter-/gathering can be tuned without # bn_exp.c modifications. +# August 2013. +# +# Add MULX/AD*X code paths and additional interfaces to optimize for +# branch prediction unit. For input lengths that are multiples of 8 +# the np argument is not just modulus value, but one interleaved +# with 0. This is to optimize post-condition... + $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } @@ -74,10 +81,8 @@ $code=<<___; .type bn_mul_mont_gather5,\@function,6 .align 64 bn_mul_mont_gather5: - test \$3,${num}d + test \$7,${num}d jnz .Lmul_enter - cmp \$8,${num}d - jb .Lmul_enter ___ $code.=<<___ if ($addx); mov OPENSSL_ia32cap_P+8(%rip),%r11d @@ -88,6 +93,7 @@ $code.=<<___; .align 16 .Lmul_enter: mov ${num}d,${num}d + mov %rsp,%rax mov `($win64?56:8)`(%rsp),%r10d # load 7th argument push %rbx push %rbp @@ -100,10 +106,8 @@ $code.=<<___ if ($win64); lea -0x28(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) -.Lmul_alloca: ___ $code.=<<___; - mov %rsp,%rax lea 2($num),%r11 neg %r11 lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)) @@ -309,7 +313,7 @@ $code.=<<___; lea 1($i),$i # i++ cmp $num,$i - jl .Louter + jb .Louter xor $i,$i # i=0 and clear CF! mov (%rsp),%rax # tp[0] @@ -345,18 +349,17 @@ $code.=<<___; mov \$1,%rax ___ $code.=<<___ if ($win64); - movaps (%rsi),%xmm6 - movaps 0x10(%rsi),%xmm7 - lea 0x28(%rsi),%rsi + movaps -88(%rsi),%xmm6 + movaps -72(%rsi),%xmm7 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lmul_epilogue: ret .size bn_mul_mont_gather5,.-bn_mul_mont_gather5 @@ -366,7 +369,7 @@ my @A=("%r10","%r11"); my @N=("%r13","%rdi"); $code.=<<___; .type bn_mul4x_mont_gather5,\@function,6 -.align 16 +.align 32 bn_mul4x_mont_gather5: .Lmul4x_enter: ___ @@ -376,8 +379,8 @@ $code.=<<___ if ($addx); je .Lmulx4x_enter ___ $code.=<<___; - mov ${num}d,${num}d - mov `($win64?56:8)`(%rsp),%r10d # load 7th argument + .byte 0x67 + mov %rsp,%rax push %rbx push %rbp push %r12 @@ -389,23 +392,78 @@ $code.=<<___ if ($win64); lea -0x28(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) -.Lmul4x_alloca: ___ $code.=<<___; - mov %rsp,%rax - lea 4($num),%r11 - neg %r11 - lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)) - and \$-1024,%rsp # minimize TLB usage + .byte 0x67 + mov ${num}d,%r10d + shl \$3,${num}d + shl \$3+2,%r10d # 4*$num + neg $num # -$num - mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp + ############################################################## + # ensure that stack frame doesn't alias with $aptr+4*$num + # modulo 4096, which covers ret[num], am[num] and n[2*num] + # (see bn_exp.c). this is done to allow memory disambiguation + # logic do its magic. [excessive frame is allocated in order + # to allow bn_from_mont8x to clear it.] + # + lea -64(%rsp,$num,2),%r11 + sub $ap,%r11 + and \$4095,%r11 + cmp %r11,%r10 + jb .Lmul4xsp_alt + sub %r11,%rsp # align with $ap + lea -64(%rsp,$num,2),%rsp # alloca(128+num*8) + jmp .Lmul4xsp_done + +.align 32 +.Lmul4xsp_alt: + lea 4096-64(,$num,2),%r10 + lea -64(%rsp,$num,2),%rsp # alloca(128+num*8) + sub %r10,%r11 + mov \$0,%r10 + cmovc %r10,%r11 + sub %r11,%rsp +.Lmul4xsp_done: + and \$-64,%rsp + neg $num + + mov %rax,40(%rsp) .Lmul4x_body: - mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp - mov %rdx,%r12 # reassign $bp + + call mul4x_internal + + mov 40(%rsp),%rsi # restore %rsp + mov \$1,%rax +___ +$code.=<<___ if ($win64); + movaps -88(%rsi),%xmm6 + movaps -72(%rsi),%xmm7 +___ +$code.=<<___; + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp +.Lmul4x_epilogue: + ret +.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 + +.type mul4x_internal,\@abi-omnipotent +.align 32 +mul4x_internal: + shl \$5,$num + mov `($win64?56:8)`(%rax),%r10d # load 7th argument + lea 256(%rdx,$num),%r13 + shr \$5,$num # restore $num ___ $bp="%r12"; $STRIDE=2**5*8; # 5 is "window size" $N=$STRIDE/4; # should match cache line size + $tp=$i; $code.=<<___; mov %r10,%r11 shr \$`log($N/8)/log(2)`,%r10 @@ -413,458 +471,1561 @@ $code.=<<___; not %r10 lea .Lmagic_masks(%rip),%rax and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" - lea 96($bp,%r11,8),$bp # pointer within 1st cache line + lea 96(%rdx,%r11,8),$bp # pointer within 1st cache line movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which movq 8(%rax,%r10,8),%xmm5 # cache line contains element + add \$7,%r11 movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument movq 24(%rax,%r10,8),%xmm7 + and \$7,%r11 movq `0*$STRIDE/4-96`($bp),%xmm0 + lea $STRIDE($bp),$tp # borrow $tp movq `1*$STRIDE/4-96`($bp),%xmm1 pand %xmm4,%xmm0 movq `2*$STRIDE/4-96`($bp),%xmm2 pand %xmm5,%xmm1 movq `3*$STRIDE/4-96`($bp),%xmm3 pand %xmm6,%xmm2 + .byte 0x67 por %xmm1,%xmm0 + movq `0*$STRIDE/4-96`($tp),%xmm1 + .byte 0x67 pand %xmm7,%xmm3 + .byte 0x67 por %xmm2,%xmm0 - lea $STRIDE($bp),$bp + movq `1*$STRIDE/4-96`($tp),%xmm2 + .byte 0x67 + pand %xmm4,%xmm1 + .byte 0x67 por %xmm3,%xmm0 + movq `2*$STRIDE/4-96`($tp),%xmm3 movq %xmm0,$m0 # m0=bp[0] + movq `3*$STRIDE/4-96`($tp),%xmm0 + mov %r13,16+8(%rsp) # save end of b[num] + mov $rp, 56+8(%rsp) # save $rp + mov ($n0),$n0 # pull n0[0] value mov ($ap),%rax - - xor $i,$i # i=0 - xor $j,$j # j=0 - - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 + lea ($ap,$num),$ap # end of a[num] + neg $num mov $n0,$m1 mulq $m0 # ap[0]*bp[0] mov %rax,$A[0] mov ($np),%rax - movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 + pand %xmm5,%xmm2 + pand %xmm6,%xmm3 + por %xmm2,%xmm1 imulq $A[0],$m1 # "tp[0]"*n0 + ############################################################## + # $tp is chosen so that writing to top-most element of the + # vector occurs just "above" references to powers table, + # "above" modulo cache-line size, which effectively precludes + # possibility of memory disambiguation logic failure when + # accessing the table. + # + lea 64+8(%rsp,%r11,8),$tp mov %rdx,$A[1] - por %xmm2,%xmm0 - lea $STRIDE($bp),$bp - por %xmm3,%xmm0 + pand %xmm7,%xmm0 + por %xmm3,%xmm1 + lea 2*$STRIDE($bp),$bp + por %xmm1,%xmm0 mulq $m1 # np[0]*m1 add %rax,$A[0] # discarded - mov 8($ap),%rax + mov 8($ap,$num),%rax adc \$0,%rdx mov %rdx,$N[1] mulq $m0 add %rax,$A[1] - mov 8($np),%rax + mov 16*1($np),%rax # interleaved with 0, therefore 16*n adc \$0,%rdx mov %rdx,$A[0] mulq $m1 add %rax,$N[1] - mov 16($ap),%rax + mov 16($ap,$num),%rax adc \$0,%rdx add $A[1],$N[1] - lea 4($j),$j # j++ + lea 4*8($num),$j # j=4 + lea 16*4($np),$np adc \$0,%rdx - mov $N[1],(%rsp) + mov $N[1],($tp) mov %rdx,$N[0] jmp .L1st4x -.align 16 + +.align 32 .L1st4x: mulq $m0 # ap[j]*bp[0] add %rax,$A[0] - mov -16($np,$j,8),%rax + mov -16*2($np),%rax + lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] - mov -8($ap,$j,8),%rax + mov -8($ap,$j),%rax adc \$0,%rdx add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx - mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov $N[0],-24($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[0] add %rax,$A[1] - mov -8($np,$j,8),%rax + mov -16*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] - mov ($ap,$j,8),%rax + mov ($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx - mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov $N[1],-16($tp) # tp[j-1] mov %rdx,$N[0] mulq $m0 # ap[j]*bp[0] add %rax,$A[0] - mov ($np,$j,8),%rax + mov 16*0($np),%rax adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] - mov 8($ap,$j,8),%rax + mov 8($ap,$j),%rax adc \$0,%rdx add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx - mov $N[0],-8(%rsp,$j,8) # tp[j-1] + mov $N[0],-8($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[0] add %rax,$A[1] - mov 8($np,$j,8),%rax + mov 16*1($np),%rax adc \$0,%rdx - lea 4($j),$j # j++ mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] - mov -16($ap,$j,8),%rax + mov 16($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + lea 16*4($np),$np adc \$0,%rdx - mov $N[1],-32(%rsp,$j,8) # tp[j-1] + mov $N[1],($tp) # tp[j-1] mov %rdx,$N[0] - cmp $num,$j - jl .L1st4x + + add \$32,$j # j+=4 + jnz .L1st4x mulq $m0 # ap[j]*bp[0] add %rax,$A[0] - mov -16($np,$j,8),%rax + mov -16*2($np),%rax + lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] - mov -8($ap,$j,8),%rax + mov -8($ap),%rax adc \$0,%rdx add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx - mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov $N[0],-24($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[0] add %rax,$A[1] - mov -8($np,$j,8),%rax + mov -16*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] - mov ($ap),%rax # ap[0] + mov ($ap,$num),%rax # ap[0] adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx - mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov $N[1],-16($tp) # tp[j-1] mov %rdx,$N[0] movq %xmm0,$m0 # bp[1] + lea ($np,$num,2),$np # rewind $np xor $N[1],$N[1] add $A[0],$N[0] adc \$0,$N[1] - mov $N[0],-8(%rsp,$j,8) - mov $N[1],(%rsp,$j,8) # store upmost overflow bit + mov $N[0],-8($tp) - lea 1($i),$i # i++ -.align 4 -.Louter4x: - xor $j,$j # j=0 - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 + jmp .Louter4x - mov (%rsp),$A[0] +.align 32 +.Louter4x: + mov ($tp,$num),$A[0] mov $n0,$m1 mulq $m0 # ap[0]*bp[i] add %rax,$A[0] # ap[0]*bp[i]+tp[0] mov ($np),%rax adc \$0,%rdx + movq `0*$STRIDE/4-96`($bp),%xmm0 + movq `1*$STRIDE/4-96`($bp),%xmm1 + pand %xmm4,%xmm0 + movq `2*$STRIDE/4-96`($bp),%xmm2 + pand %xmm5,%xmm1 movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 imulq $A[0],$m1 # tp[0]*n0 + .byte 0x67 mov %rdx,$A[1] + mov $N[1],($tp) # store upmost overflow bit + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 por %xmm2,%xmm0 + lea ($tp,$num),$tp # rewind $tp lea $STRIDE($bp),$bp por %xmm3,%xmm0 mulq $m1 # np[0]*m1 add %rax,$A[0] # "$N[0]", discarded - mov 8($ap),%rax + mov 8($ap,$num),%rax adc \$0,%rdx mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] - mov 8($np),%rax + mov 16*1($np),%rax # interleaved with 0, therefore 16*n adc \$0,%rdx - add 8(%rsp),$A[1] # +tp[1] + add 8($tp),$A[1] # +tp[1] adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] - mov 16($ap),%rax + mov 16($ap,$num),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] - lea 4($j),$j # j+=2 + lea 4*8($num),$j # j=4 + lea 16*4($np),$np adc \$0,%rdx mov %rdx,$N[0] jmp .Linner4x -.align 16 + +.align 32 .Linner4x: mulq $m0 # ap[j]*bp[i] add %rax,$A[0] - mov -16($np,$j,8),%rax + mov -16*2($np),%rax adc \$0,%rdx - add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] + add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] + lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] - mov -8($ap,$j,8),%rax + mov -8($ap,$j),%rax adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx - mov $N[1],-32(%rsp,$j,8) # tp[j-1] + mov $N[1],-32($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] - mov -8($np,$j,8),%rax + mov -16*1($np),%rax adc \$0,%rdx - add -8(%rsp,$j,8),$A[1] + add -8($tp),$A[1] adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] - mov ($ap,$j,8),%rax + mov ($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] adc \$0,%rdx - mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov $N[0],-24($tp) # tp[j-1] mov %rdx,$N[0] mulq $m0 # ap[j]*bp[i] add %rax,$A[0] - mov ($np,$j,8),%rax + mov 16*0($np),%rax adc \$0,%rdx - add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] + add ($tp),$A[0] # ap[j]*bp[i]+tp[j] adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] - mov 8($ap,$j,8),%rax + mov 8($ap,$j),%rax adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx - mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov $N[1],-16($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] - mov 8($np,$j,8),%rax + mov 16*1($np),%rax adc \$0,%rdx - add 8(%rsp,$j,8),$A[1] + add 8($tp),$A[1] adc \$0,%rdx - lea 4($j),$j # j++ mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] - mov -16($ap,$j,8),%rax + mov 16($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] + lea 16*4($np),$np adc \$0,%rdx - mov $N[0],-40(%rsp,$j,8) # tp[j-1] + mov $N[0],-8($tp) # tp[j-1] mov %rdx,$N[0] - cmp $num,$j - jl .Linner4x + + add \$32,$j # j+=4 + jnz .Linner4x mulq $m0 # ap[j]*bp[i] add %rax,$A[0] - mov -16($np,$j,8),%rax + mov -16*2($np),%rax adc \$0,%rdx - add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] + add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] + lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] - mov -8($ap,$j,8),%rax + mov -8($ap),%rax adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx - mov $N[1],-32(%rsp,$j,8) # tp[j-1] + mov $N[1],-32($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] - mov -8($np,$j,8),%rax + mov $m1,%rax + mov -16*1($np),$m1 adc \$0,%rdx - add -8(%rsp,$j,8),$A[1] + add -8($tp),$A[1] adc \$0,%rdx - lea 1($i),$i # i++ mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] - mov ($ap),%rax # ap[0] + mov ($ap,$num),%rax # ap[0] adc \$0,%rdx add $A[1],$N[1] adc \$0,%rdx - mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov $N[0],-24($tp) # tp[j-1] mov %rdx,$N[0] movq %xmm0,$m0 # bp[i+1] - mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov $N[1],-16($tp) # tp[j-1] + lea ($np,$num,2),$np # rewind $np xor $N[1],$N[1] add $A[0],$N[0] adc \$0,$N[1] - add (%rsp,$num,8),$N[0] # pull upmost overflow bit - adc \$0,$N[1] - mov $N[0],-8(%rsp,$j,8) - mov $N[1],(%rsp,$j,8) # store upmost overflow bit + add ($tp),$N[0] # pull upmost overflow bit + adc \$0,$N[1] # upmost overflow bit + mov $N[0],-8($tp) - cmp $num,$i - jl .Louter4x + cmp 16+8(%rsp),$bp + jb .Louter4x ___ -{ -my @ri=("%rax","%rdx",$m0,$m1); +if (1) { $code.=<<___; - mov 16(%rsp,$num,8),$rp # restore $rp - mov 0(%rsp),@ri[0] # tp[0] - pxor %xmm0,%xmm0 - mov 8(%rsp),@ri[1] # tp[1] - shr \$2,$num # num/=4 - lea (%rsp),$ap # borrow ap for tp - xor $i,$i # i=0 and clear CF! - - sub 0($np),@ri[0] - mov 16($ap),@ri[2] # tp[2] - mov 24($ap),@ri[3] # tp[3] - sbb 8($np),@ri[1] - lea -1($num),$j # j=num/4-1 + sub $N[0],$m1 # compare top-most words + adc $j,$j # $j is zero + or $j,$N[1] + xor \$1,$N[1] + lea ($tp,$num),%rbx # tptr in .sqr4x_sub + lea ($np,$N[1],8),%rbp # nptr in .sqr4x_sub + mov %r9,%rcx + sar \$3+2,%rcx # cf=0 + mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub + jmp .Lsqr4x_sub +___ +} else { +my @ri=("%rax",$bp,$m0,$m1); +my $rp="%rdx"; +$code.=<<___ + xor \$1,$N[1] + lea ($tp,$num),$tp # rewind $tp + sar \$5,$num # cf=0 + lea ($np,$N[1],8),$np + mov 56+8(%rsp),$rp # restore $rp jmp .Lsub4x -.align 16 + +.align 32 .Lsub4x: - mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] - mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] - sbb 16($np,$i,8),@ri[2] - mov 32($ap,$i,8),@ri[0] # tp[i+1] - mov 40($ap,$i,8),@ri[1] - sbb 24($np,$i,8),@ri[3] - mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] - mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] - sbb 32($np,$i,8),@ri[0] - mov 48($ap,$i,8),@ri[2] - mov 56($ap,$i,8),@ri[3] - sbb 40($np,$i,8),@ri[1] - lea 4($i),$i # i++ - dec $j # doesnn't affect CF! + .byte 0x66 + mov 8*0($tp),@ri[0] + mov 8*1($tp),@ri[1] + .byte 0x66 + sbb 16*0($np),@ri[0] + mov 8*2($tp),@ri[2] + sbb 16*1($np),@ri[1] + mov 3*8($tp),@ri[3] + lea 4*8($tp),$tp + sbb 16*2($np),@ri[2] + mov @ri[0],8*0($rp) + sbb 16*3($np),@ri[3] + lea 16*4($np),$np + mov @ri[1],8*1($rp) + mov @ri[2],8*2($rp) + mov @ri[3],8*3($rp) + lea 8*4($rp),$rp + + inc $num jnz .Lsub4x - mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] - mov 32($ap,$i,8),@ri[0] # load overflow bit - sbb 16($np,$i,8),@ri[2] - mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] - sbb 24($np,$i,8),@ri[3] - mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] + ret +___ +} +$code.=<<___; +.size mul4x_internal,.-mul4x_internal +___ +}}} +{{{ +###################################################################### +# void bn_power5( +my $rptr="%rdi"; # BN_ULONG *rptr, +my $aptr="%rsi"; # const BN_ULONG *aptr, +my $bptr="%rdx"; # const void *table, +my $nptr="%rcx"; # const BN_ULONG *nptr, +my $n0 ="%r8"; # const BN_ULONG *n0); +my $num ="%r9"; # int num, has to be divisible by 8 + # int pwr + +my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); +my @A0=("%r10","%r11"); +my @A1=("%r12","%r13"); +my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); - sbb \$0,@ri[0] # handle upmost overflow bit - mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] - xor $i,$i # i=0 - and @ri[0],$ap - not @ri[0] - mov $rp,$np - and @ri[0],$np - lea -1($num),$j - or $np,$ap # ap=borrow?tp:rp +$code.=<<___; +.globl bn_power5 +.type bn_power5,\@function,6 +.align 32 +bn_power5: +___ +$code.=<<___ if ($addx); + mov OPENSSL_ia32cap_P+8(%rip),%r11d + and \$0x80100,%r11d + cmp \$0x80100,%r11d + je .Lpowerx5_enter +___ +$code.=<<___; + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0x28(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) +___ +$code.=<<___; + mov ${num}d,%r10d + shl \$3,${num}d # convert $num to bytes + shl \$3+2,%r10d # 4*$num + neg $num + mov ($n0),$n0 # *n0 - movdqu ($ap),%xmm1 - movdqa %xmm0,(%rsp) - movdqu %xmm1,($rp) - jmp .Lcopy4x -.align 16 -.Lcopy4x: # copy or in-place refresh - movdqu 16($ap,$i),%xmm2 - movdqu 32($ap,$i),%xmm1 - movdqa %xmm0,16(%rsp,$i) - movdqu %xmm2,16($rp,$i) - movdqa %xmm0,32(%rsp,$i) - movdqu %xmm1,32($rp,$i) - lea 32($i),$i - dec $j - jnz .Lcopy4x - - shl \$2,$num - movdqu 16($ap,$i),%xmm2 - movdqa %xmm0,16(%rsp,$i) - movdqu %xmm2,16($rp,$i) + ############################################################## + # ensure that stack frame doesn't alias with $aptr+4*$num + # modulo 4096, which covers ret[num], am[num] and n[2*num] + # (see bn_exp.c). this is done to allow memory disambiguation + # logic do its magic. + # + lea -64(%rsp,$num,2),%r11 + sub $aptr,%r11 + and \$4095,%r11 + cmp %r11,%r10 + jb .Lpwr_sp_alt + sub %r11,%rsp # align with $aptr + lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + jmp .Lpwr_sp_done + +.align 32 +.Lpwr_sp_alt: + lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num + lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + sub %r10,%r11 + mov \$0,%r10 + cmovc %r10,%r11 + sub %r11,%rsp +.Lpwr_sp_done: + and \$-64,%rsp + mov $num,%r10 + neg $num + + ############################################################## + # Stack layout + # + # +0 saved $num, used in reduction section + # +8 &t[2*$num], used in reduction section + # +32 saved *n0 + # +40 saved %rsp + # +48 t[2*$num] + # + mov $n0, 32(%rsp) + mov %rax, 40(%rsp) # save original %rsp +.Lpower5_body: + movq $rptr,%xmm1 # save $rptr + movq $nptr,%xmm2 # save $nptr + movq %r10, %xmm3 # -$num + movq $bptr,%xmm4 + + call __bn_sqr8x_internal + call __bn_sqr8x_internal + call __bn_sqr8x_internal + call __bn_sqr8x_internal + call __bn_sqr8x_internal + + mov %xmm2,$nptr + movq %xmm4,$bptr + mov $aptr,$rptr + mov 40(%rsp),%rax + lea 32(%rsp),$n0 + + call mul4x_internal + + mov 40(%rsp),%rsi # restore %rsp + mov \$1,%rax + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp +.Lpower5_epilogue: + ret +.size bn_power5,.-bn_power5 + +.globl bn_sqr8x_internal +.hidden bn_sqr8x_internal +.type bn_sqr8x_internal,\@abi-omnipotent +.align 32 +bn_sqr8x_internal: +__bn_sqr8x_internal: + ############################################################## + # Squaring part: + # + # a) multiply-n-add everything but a[i]*a[i]; + # b) shift result of a) by 1 to the left and accumulate + # a[i]*a[i] products; + # + ############################################################## + # a[1]a[0] + # a[2]a[0] + # a[3]a[0] + # a[2]a[1] + # a[4]a[0] + # a[3]a[1] + # a[5]a[0] + # a[4]a[1] + # a[3]a[2] + # a[6]a[0] + # a[5]a[1] + # a[4]a[2] + # a[7]a[0] + # a[6]a[1] + # a[5]a[2] + # a[4]a[3] + # a[7]a[1] + # a[6]a[2] + # a[5]a[3] + # a[7]a[2] + # a[6]a[3] + # a[5]a[4] + # a[7]a[3] + # a[6]a[4] + # a[7]a[4] + # a[6]a[5] + # a[7]a[5] + # a[7]a[6] + # a[1]a[0] + # a[2]a[0] + # a[3]a[0] + # a[4]a[0] + # a[5]a[0] + # a[6]a[0] + # a[7]a[0] + # a[2]a[1] + # a[3]a[1] + # a[4]a[1] + # a[5]a[1] + # a[6]a[1] + # a[7]a[1] + # a[3]a[2] + # a[4]a[2] + # a[5]a[2] + # a[6]a[2] + # a[7]a[2] + # a[4]a[3] + # a[5]a[3] + # a[6]a[3] + # a[7]a[3] + # a[5]a[4] + # a[6]a[4] + # a[7]a[4] + # a[6]a[5] + # a[7]a[5] + # a[7]a[6] + # a[0]a[0] + # a[1]a[1] + # a[2]a[2] + # a[3]a[3] + # a[4]a[4] + # a[5]a[5] + # a[6]a[6] + # a[7]a[7] + + lea 32(%r10),$i # $i=-($num-32) + lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] + + mov $num,$j # $j=$num + + # comments apply to $num==8 case + mov -32($aptr,$i),$a0 # a[0] + lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] + mov -24($aptr,$i),%rax # a[1] + lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] + mov -16($aptr,$i),$ai # a[2] + mov %rax,$a1 + + mul $a0 # a[1]*a[0] + mov %rax,$A0[0] # a[1]*a[0] + mov $ai,%rax # a[2] + mov %rdx,$A0[1] + mov $A0[0],-24($tptr,$i) # t[1] + + mul $a0 # a[2]*a[0] + add %rax,$A0[1] + mov $ai,%rax + adc \$0,%rdx + mov $A0[1],-16($tptr,$i) # t[2] + mov %rdx,$A0[0] + + + mov -8($aptr,$i),$ai # a[3] + mul $a1 # a[2]*a[1] + mov %rax,$A1[0] # a[2]*a[1]+t[3] + mov $ai,%rax + mov %rdx,$A1[1] + + lea ($i),$j + mul $a0 # a[3]*a[0] + add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] + mov $ai,%rax + mov %rdx,$A0[1] + adc \$0,$A0[1] + add $A1[0],$A0[0] + adc \$0,$A0[1] + mov $A0[0],-8($tptr,$j) # t[3] + jmp .Lsqr4x_1st + +.align 32 +.Lsqr4x_1st: + mov ($aptr,$j),$ai # a[4] + mul $a1 # a[3]*a[1] + add %rax,$A1[1] # a[3]*a[1]+t[4] + mov $ai,%rax + mov %rdx,$A1[0] + adc \$0,$A1[0] + + mul $a0 # a[4]*a[0] + add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] + mov $ai,%rax # a[3] + mov 8($aptr,$j),$ai # a[5] + mov %rdx,$A0[0] + adc \$0,$A0[0] + add $A1[1],$A0[1] + adc \$0,$A0[0] + + + mul $a1 # a[4]*a[3] + add %rax,$A1[0] # a[4]*a[3]+t[5] + mov $ai,%rax + mov $A0[1],($tptr,$j) # t[4] + mov %rdx,$A1[1] + adc \$0,$A1[1] + + mul $a0 # a[5]*a[2] + add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] + mov $ai,%rax + mov 16($aptr,$j),$ai # a[6] + mov %rdx,$A0[1] + adc \$0,$A0[1] + add $A1[0],$A0[0] + adc \$0,$A0[1] + + mul $a1 # a[5]*a[3] + add %rax,$A1[1] # a[5]*a[3]+t[6] + mov $ai,%rax + mov $A0[0],8($tptr,$j) # t[5] + mov %rdx,$A1[0] + adc \$0,$A1[0] + + mul $a0 # a[6]*a[2] + add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] + mov $ai,%rax # a[3] + mov 24($aptr,$j),$ai # a[7] + mov %rdx,$A0[0] + adc \$0,$A0[0] + add $A1[1],$A0[1] + adc \$0,$A0[0] + + + mul $a1 # a[6]*a[5] + add %rax,$A1[0] # a[6]*a[5]+t[7] + mov $ai,%rax + mov $A0[1],16($tptr,$j) # t[6] + mov %rdx,$A1[1] + adc \$0,$A1[1] + lea 32($j),$j + + mul $a0 # a[7]*a[4] + add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] + mov $ai,%rax + mov %rdx,$A0[1] + adc \$0,$A0[1] + add $A1[0],$A0[0] + adc \$0,$A0[1] + mov $A0[0],-8($tptr,$j) # t[7] + + cmp \$0,$j + jne .Lsqr4x_1st + + mul $a1 # a[7]*a[5] + add %rax,$A1[1] + lea 16($i),$i + adc \$0,%rdx + add $A0[1],$A1[1] + adc \$0,%rdx + + mov $A1[1],($tptr) # t[8] + mov %rdx,$A1[0] + mov %rdx,8($tptr) # t[9] + jmp .Lsqr4x_outer + +.align 32 +.Lsqr4x_outer: # comments apply to $num==6 case + mov -32($aptr,$i),$a0 # a[0] + lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] + mov -24($aptr,$i),%rax # a[1] + lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] + mov -16($aptr,$i),$ai # a[2] + mov %rax,$a1 + + mul $a0 # a[1]*a[0] + mov -24($tptr,$i),$A0[0] # t[1] + add %rax,$A0[0] # a[1]*a[0]+t[1] + mov $ai,%rax # a[2] + adc \$0,%rdx + mov $A0[0],-24($tptr,$i) # t[1] + mov %rdx,$A0[1] + + mul $a0 # a[2]*a[0] + add %rax,$A0[1] + mov $ai,%rax + adc \$0,%rdx + add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] + mov %rdx,$A0[0] + adc \$0,$A0[0] + mov $A0[1],-16($tptr,$i) # t[2] + + xor $A1[0],$A1[0] + + mov -8($aptr,$i),$ai # a[3] + mul $a1 # a[2]*a[1] + add %rax,$A1[0] # a[2]*a[1]+t[3] + mov $ai,%rax + adc \$0,%rdx + add -8($tptr,$i),$A1[0] + mov %rdx,$A1[1] + adc \$0,$A1[1] + + mul $a0 # a[3]*a[0] + add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] + mov $ai,%rax + adc \$0,%rdx + add $A1[0],$A0[0] + mov %rdx,$A0[1] + adc \$0,$A0[1] + mov $A0[0],-8($tptr,$i) # t[3] + + lea ($i),$j + jmp .Lsqr4x_inner + +.align 32 +.Lsqr4x_inner: + mov ($aptr,$j),$ai # a[4] + mul $a1 # a[3]*a[1] + add %rax,$A1[1] # a[3]*a[1]+t[4] + mov $ai,%rax + mov %rdx,$A1[0] + adc \$0,$A1[0] + add ($tptr,$j),$A1[1] + adc \$0,$A1[0] + + .byte 0x67 + mul $a0 # a[4]*a[0] + add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] + mov $ai,%rax # a[3] + mov 8($aptr,$j),$ai # a[5] + mov %rdx,$A0[0] + adc \$0,$A0[0] + add $A1[1],$A0[1] + adc \$0,$A0[0] + + mul $a1 # a[4]*a[3] + add %rax,$A1[0] # a[4]*a[3]+t[5] + mov $A0[1],($tptr,$j) # t[4] + mov $ai,%rax + mov %rdx,$A1[1] + adc \$0,$A1[1] + add 8($tptr,$j),$A1[0] + lea 16($j),$j # j++ + adc \$0,$A1[1] + + mul $a0 # a[5]*a[2] + add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] + mov $ai,%rax + adc \$0,%rdx + add $A1[0],$A0[0] + mov %rdx,$A0[1] + adc \$0,$A0[1] + mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below + + cmp \$0,$j + jne .Lsqr4x_inner + + .byte 0x67 + mul $a1 # a[5]*a[3] + add %rax,$A1[1] + adc \$0,%rdx + add $A0[1],$A1[1] + adc \$0,%rdx + + mov $A1[1],($tptr) # t[6], "preloaded t[2]" below + mov %rdx,$A1[0] + mov %rdx,8($tptr) # t[7], "preloaded t[3]" below + + add \$16,$i + jnz .Lsqr4x_outer + + # comments apply to $num==4 case + mov -32($aptr),$a0 # a[0] + lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] + mov -24($aptr),%rax # a[1] + lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] + mov -16($aptr),$ai # a[2] + mov %rax,$a1 + + mul $a0 # a[1]*a[0] + add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] + mov $ai,%rax # a[2] + mov %rdx,$A0[1] + adc \$0,$A0[1] + |