author    Andy Polyakov <appro@openssl.org>  2013-12-09 21:02:24 +0100
committer Andy Polyakov <appro@openssl.org>  2013-12-09 21:02:24 +0100
commit    ec9cc70f72454b8d4a84247c86159613cee83b81 (patch)
tree      504ec4eeaf5d13670389711f7bc01915f6a4a595 /crypto/bn/asm/x86_64-mont5.pl
parent    d1671f4f1a39d938499c67efe5d4a14c34c09b31 (diff)
bn/asm/x86_64-mont5.pl: add MULX/AD*X code path.
This also eliminates code duplication between x86_64-mont and x86_64-mont5 and optimizes even original non-MULX code.
Diffstat (limited to 'crypto/bn/asm/x86_64-mont5.pl')
-rwxr-xr-x  crypto/bn/asm/x86_64-mont5.pl | 2863
1 file changed, 2440 insertions(+), 423 deletions(-)
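Background for the diff that follows: MULX (BMI2) multiplies without touching the flags, and ADCX/ADOX (ADX) maintain two independent carry chains in CF and OF, which is what lets the new code path keep two multiply-accumulate chains in flight. A minimal single-chain C sketch of the idea, using intrinsics rather than the hand-scheduled assembly this patch emits:

    #include <immintrin.h>  /* _mulx_u64 (-mbmi2), _addcarry_u64 */

    /* Sketch only: acc[0..n] += a[0..n-1] * b.  _mulx_u64 compiles to MULX,
     * which leaves the flags untouched, so the addition carry chain can run
     * uninterrupted across the loop; the hand-written code goes further and
     * interleaves two chains with ADCX/ADOX.  Not OpenSSL code. */
    static void muladd_row(unsigned long long *acc,
                           const unsigned long long *a,
                           unsigned long long b, int n)
    {
        unsigned long long hi, lo, carry = 0;
        for (int i = 0; i < n; i++) {
            unsigned char c;
            lo = _mulx_u64(a[i], b, &hi);
            c = _addcarry_u64(0, lo, carry, &lo); /* fold in carried high word */
            carry = hi + c;                       /* hi <= 2^64-2, cannot wrap */
            c = _addcarry_u64(0, acc[i], lo, &acc[i]);
            carry += c;                           /* bounded, cannot wrap      */
        }
        acc[n] += carry;
    }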
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
index 93257376ac..265ee7d13a 100755
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -17,6 +17,13 @@
# is implemented, so that scatter-/gathering can be tuned without
# bn_exp.c modifications.
+# August 2013.
+#
+# Add MULX/AD*X code paths and additional interfaces to optimize for
+# branch prediction unit. For input lengths that are multiples of 8
+# the np argument is not just modulus value, but one interleaved
+# with 0. This is to optimize post-condition...
+
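The "interleaved with 0" layout mentioned here shows up later in the diff as "16*n" displacements on $np. One plausible illustration of the layout (hypothetical helper, not part of the patch):

    /* Hypothetical illustration of the interleaved modulus layout: each
     * limb of n is followed by a zero qword, so the assembly can step
     * through np in 16-byte strides ("therefore 16*n" below). */
    static void interleave_modulus(unsigned long long *np_out,    /* 2*num */
                                   const unsigned long long *n, int num)
    {
        for (int i = 0; i < num; i++) {
            np_out[2 * i]     = n[i];
            np_out[2 * i + 1] = 0;
        }
    }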
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -74,10 +81,8 @@ $code=<<___;
.type bn_mul_mont_gather5,\@function,6
.align 64
bn_mul_mont_gather5:
- test \$3,${num}d
+ test \$7,${num}d
jnz .Lmul_enter
- cmp \$8,${num}d
- jb .Lmul_enter
___
$code.=<<___ if ($addx);
mov OPENSSL_ia32cap_P+8(%rip),%r11d
@@ -88,6 +93,7 @@ $code.=<<___;
.align 16
.Lmul_enter:
mov ${num}d,${num}d
+ mov %rsp,%rax
mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
push %rbx
push %rbp
@@ -100,10 +106,8 @@ $code.=<<___ if ($win64);
lea -0x28(%rsp),%rsp
movaps %xmm6,(%rsp)
movaps %xmm7,0x10(%rsp)
-.Lmul_alloca:
___
$code.=<<___;
- mov %rsp,%rax
lea 2($num),%r11
neg %r11
lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2))
@@ -309,7 +313,7 @@ $code.=<<___;
lea 1($i),$i # i++
cmp $num,$i
- jl .Louter
+ jb .Louter
xor $i,$i # i=0 and clear CF!
mov (%rsp),%rax # tp[0]
@@ -345,18 +349,17 @@ $code.=<<___;
mov \$1,%rax
___
$code.=<<___ if ($win64);
- movaps (%rsi),%xmm6
- movaps 0x10(%rsi),%xmm7
- lea 0x28(%rsi),%rsi
+ movaps -88(%rsi),%xmm6
+ movaps -72(%rsi),%xmm7
___
$code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lmul_epilogue:
ret
.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
@@ -366,7 +369,7 @@ my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type bn_mul4x_mont_gather5,\@function,6
-.align 16
+.align 32
bn_mul4x_mont_gather5:
.Lmul4x_enter:
___
@@ -376,8 +379,8 @@ $code.=<<___ if ($addx);
je .Lmulx4x_enter
___
$code.=<<___;
- mov ${num}d,${num}d
- mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
+ .byte 0x67
+ mov %rsp,%rax
push %rbx
push %rbp
push %r12
@@ -389,23 +392,78 @@ $code.=<<___ if ($win64);
lea -0x28(%rsp),%rsp
movaps %xmm6,(%rsp)
movaps %xmm7,0x10(%rsp)
-.Lmul4x_alloca:
___
$code.=<<___;
- mov %rsp,%rax
- lea 4($num),%r11
- neg %r11
- lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4))
- and \$-1024,%rsp # minimize TLB usage
+ .byte 0x67
+ mov ${num}d,%r10d
+ shl \$3,${num}d
+ shl \$3+2,%r10d # 4*$num
+ neg $num # -$num
- mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+ ##############################################################
+ # ensure that stack frame doesn't alias with $aptr+4*$num
+ # modulo 4096, which covers ret[num], am[num] and n[2*num]
+ # (see bn_exp.c). this is done to allow memory disambiguation
+ # logic to do its magic. [excessive frame is allocated in order
+ # to allow bn_from_mont8x to clear it.]
+ #
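A hedged C restatement of the allocation strategy implemented just below (intent, not a line-by-line translation; all names are illustrative):

    #include <stddef.h>
    #include <stdint.h>

    /* Pick the new stack pointer so that the distance from the t[] frame
     * to ap, taken modulo the 4096-byte page, keeps the frame clear of
     * ret[num], am[num] and n[2*num].  num_bytes is 8*num. */
    static unsigned char *place_frame(unsigned char *sp,
                                      const unsigned char *ap,
                                      size_t num_bytes)
    {
        size_t frame = 64 + 2 * num_bytes;       /* t[2*num] plus slack    */
        unsigned char *cand = sp - frame;        /* prospective frame base */
        size_t delta = (size_t)(cand - ap) & 4095;
        if (delta <= 4 * num_bytes)
            sp -= delta;                          /* align frame with ap   */
        else if (delta > 4096 - frame)
            sp -= delta - (4096 - frame);         /* .Lmul4xsp_alt path    */
        sp -= frame;
        return (unsigned char *)((uintptr_t)sp & ~(uintptr_t)63); /* 64-byte align */
    }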
+ lea -64(%rsp,$num,2),%r11
+ sub $ap,%r11
+ and \$4095,%r11
+ cmp %r11,%r10
+ jb .Lmul4xsp_alt
+ sub %r11,%rsp # align with $ap
+ lea -64(%rsp,$num,2),%rsp # alloca(128+num*8)
+ jmp .Lmul4xsp_done
+
+.align 32
+.Lmul4xsp_alt:
+ lea 4096-64(,$num,2),%r10
+ lea -64(%rsp,$num,2),%rsp # alloca(128+num*8)
+ sub %r10,%r11
+ mov \$0,%r10
+ cmovc %r10,%r11
+ sub %r11,%rsp
+.Lmul4xsp_done:
+ and \$-64,%rsp
+ neg $num
+
+ mov %rax,40(%rsp)
.Lmul4x_body:
- mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
- mov %rdx,%r12 # reassign $bp
+
+ call mul4x_internal
+
+ mov 40(%rsp),%rsi # restore %rsp
+ mov \$1,%rax
+___
+$code.=<<___ if ($win64);
+ movaps -88(%rsi),%xmm6
+ movaps -72(%rsi),%xmm7
+___
+$code.=<<___;
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
+.Lmul4x_epilogue:
+ ret
+.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+
+.type mul4x_internal,\@abi-omnipotent
+.align 32
+mul4x_internal:
+ shl \$5,$num
+ mov `($win64?56:8)`(%rax),%r10d # load 7th argument
+ lea 256(%rdx,$num),%r13
+ shr \$5,$num # restore $num
___
$bp="%r12";
$STRIDE=2**5*8; # 5 is "window size"
$N=$STRIDE/4; # should match cache line size
+ $tp=$i;
$code.=<<___;
mov %r10,%r11
shr \$`log($N/8)/log(2)`,%r10
@@ -413,458 +471,1561 @@ $code.=<<___;
not %r10
lea .Lmagic_masks(%rip),%rax
and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
- lea 96($bp,%r11,8),$bp # pointer within 1st cache line
+ lea 96(%rdx,%r11,8),$bp # pointer within 1st cache line
movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
movq 8(%rax,%r10,8),%xmm5 # cache line contains element
+ add \$7,%r11
movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
movq 24(%rax,%r10,8),%xmm7
+ and \$7,%r11
movq `0*$STRIDE/4-96`($bp),%xmm0
+ lea $STRIDE($bp),$tp # borrow $tp
movq `1*$STRIDE/4-96`($bp),%xmm1
pand %xmm4,%xmm0
movq `2*$STRIDE/4-96`($bp),%xmm2
pand %xmm5,%xmm1
movq `3*$STRIDE/4-96`($bp),%xmm3
pand %xmm6,%xmm2
+ .byte 0x67
por %xmm1,%xmm0
+ movq `0*$STRIDE/4-96`($tp),%xmm1
+ .byte 0x67
pand %xmm7,%xmm3
+ .byte 0x67
por %xmm2,%xmm0
- lea $STRIDE($bp),$bp
+ movq `1*$STRIDE/4-96`($tp),%xmm2
+ .byte 0x67
+ pand %xmm4,%xmm1
+ .byte 0x67
por %xmm3,%xmm0
+ movq `2*$STRIDE/4-96`($tp),%xmm3
movq %xmm0,$m0 # m0=bp[0]
+ movq `3*$STRIDE/4-96`($tp),%xmm0
+ mov %r13,16+8(%rsp) # save end of b[num]
+ mov $rp, 56+8(%rsp) # save $rp
+
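The movq/pand/por sequence above is a constant-time gather: every candidate table entry in the window is read and masked, so the memory access pattern does not depend on the secret selector. A scalar C sketch of the same idea (illustrative only):

    /* Read every candidate and select with AND/OR masks instead of an
     * indexed load, so cache/timing behaviour is independent of the
     * secret index.  Exactly one mask[i] is all-ones (cf. .Lmagic_masks). */
    static unsigned long long gather_ct(const unsigned long long tbl[4],
                                        const unsigned long long mask[4])
    {
        unsigned long long r = 0;
        for (int i = 0; i < 4; i++)
            r |= tbl[i] & mask[i];
        return r;
    }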
mov ($n0),$n0 # pull n0[0] value
mov ($ap),%rax
-
- xor $i,$i # i=0
- xor $j,$j # j=0
-
- movq `0*$STRIDE/4-96`($bp),%xmm0
- movq `1*$STRIDE/4-96`($bp),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bp),%xmm2
- pand %xmm5,%xmm1
+ lea ($ap,$num),$ap # end of a[num]
+ neg $num
mov $n0,$m1
mulq $m0 # ap[0]*bp[0]
mov %rax,$A[0]
mov ($np),%rax
- movq `3*$STRIDE/4-96`($bp),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
+ pand %xmm5,%xmm2
+ pand %xmm6,%xmm3
+ por %xmm2,%xmm1
imulq $A[0],$m1 # "tp[0]"*n0
+ ##############################################################
+ # $tp is chosen so that writing to top-most element of the
+ # vector occurs just "above" references to powers table,
+ # "above" modulo cache-line size, which effectively precludes
+ # possibility of memory disambiguation logic failure when
+ # accessing the table.
+ #
+ lea 64+8(%rsp,%r11,8),$tp
mov %rdx,$A[1]
- por %xmm2,%xmm0
- lea $STRIDE($bp),$bp
- por %xmm3,%xmm0
+ pand %xmm7,%xmm0
+ por %xmm3,%xmm1
+ lea 2*$STRIDE($bp),$bp
+ por %xmm1,%xmm0
mulq $m1 # np[0]*m1
add %rax,$A[0] # discarded
- mov 8($ap),%rax
+ mov 8($ap,$num),%rax
adc \$0,%rdx
mov %rdx,$N[1]
mulq $m0
add %rax,$A[1]
- mov 8($np),%rax
+ mov 16*1($np),%rax # interleaved with 0, therefore 16*n
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1
add %rax,$N[1]
- mov 16($ap),%rax
+ mov 16($ap,$num),%rax
adc \$0,%rdx
add $A[1],$N[1]
- lea 4($j),$j # j++
+ lea 4*8($num),$j # j=4
+ lea 16*4($np),$np
adc \$0,%rdx
- mov $N[1],(%rsp)
+ mov $N[1],($tp)
mov %rdx,$N[0]
jmp .L1st4x
-.align 16
+
+.align 32
.L1st4x:
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
- mov -16($np,$j,8),%rax
+ mov -16*2($np),%rax
+ lea 32($tp),$tp
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
- mov -8($ap,$j,8),%rax
+ mov -8($ap,$j),%rax
adc \$0,%rdx
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
- mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov $N[0],-24($tp) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
- mov -8($np,$j,8),%rax
+ mov -16*1($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
- mov ($ap,$j,8),%rax
+ mov ($ap,$j),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
- mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov $N[1],-16($tp) # tp[j-1]
mov %rdx,$N[0]
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
- mov ($np,$j,8),%rax
+ mov 16*0($np),%rax
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
- mov 8($ap,$j,8),%rax
+ mov 8($ap,$j),%rax
adc \$0,%rdx
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
- mov $N[0],-8(%rsp,$j,8) # tp[j-1]
+ mov $N[0],-8($tp) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
- mov 8($np,$j,8),%rax
+ mov 16*1($np),%rax
adc \$0,%rdx
- lea 4($j),$j # j++
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
- mov -16($ap,$j,8),%rax
+ mov 16($ap,$j),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ lea 16*4($np),$np
adc \$0,%rdx
- mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov $N[1],($tp) # tp[j-1]
mov %rdx,$N[0]
- cmp $num,$j
- jl .L1st4x
+
+ add \$32,$j # j+=4
+ jnz .L1st4x
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
- mov -16($np,$j,8),%rax
+ mov -16*2($np),%rax
+ lea 32($tp),$tp
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
- mov -8($ap,$j,8),%rax
+ mov -8($ap),%rax
adc \$0,%rdx
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
- mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov $N[0],-24($tp) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
- mov -8($np,$j,8),%rax
+ mov -16*1($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
- mov ($ap),%rax # ap[0]
+ mov ($ap,$num),%rax # ap[0]
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
- mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov $N[1],-16($tp) # tp[j-1]
mov %rdx,$N[0]
movq %xmm0,$m0 # bp[1]
+ lea ($np,$num,2),$np # rewind $np
xor $N[1],$N[1]
add $A[0],$N[0]
adc \$0,$N[1]
- mov $N[0],-8(%rsp,$j,8)
- mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+ mov $N[0],-8($tp)
- lea 1($i),$i # i++
-.align 4
-.Louter4x:
- xor $j,$j # j=0
- movq `0*$STRIDE/4-96`($bp),%xmm0
- movq `1*$STRIDE/4-96`($bp),%xmm1
- pand %xmm4,%xmm0
- movq `2*$STRIDE/4-96`($bp),%xmm2
- pand %xmm5,%xmm1
+ jmp .Louter4x
- mov (%rsp),$A[0]
+.align 32
+.Louter4x:
+ mov ($tp,$num),$A[0]
mov $n0,$m1
mulq $m0 # ap[0]*bp[i]
add %rax,$A[0] # ap[0]*bp[i]+tp[0]
mov ($np),%rax
adc \$0,%rdx
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
movq `3*$STRIDE/4-96`($bp),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
imulq $A[0],$m1 # tp[0]*n0
+ .byte 0x67
mov %rdx,$A[1]
+ mov $N[1],($tp) # store upmost overflow bit
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
por %xmm2,%xmm0
+ lea ($tp,$num),$tp # rewind $tp
lea $STRIDE($bp),$bp
por %xmm3,%xmm0
mulq $m1 # np[0]*m1
add %rax,$A[0] # "$N[0]", discarded
- mov 8($ap),%rax
+ mov 8($ap,$num),%rax
adc \$0,%rdx
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
- mov 8($np),%rax
+ mov 16*1($np),%rax # interleaved with 0, therefore 16*n
adc \$0,%rdx
- add 8(%rsp),$A[1] # +tp[1]
+ add 8($tp),$A[1] # +tp[1]
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
- mov 16($ap),%rax
+ mov 16($ap,$num),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
- lea 4($j),$j # j+=2
+ lea 4*8($num),$j # j=4
+ lea 16*4($np),$np
adc \$0,%rdx
mov %rdx,$N[0]
jmp .Linner4x
-.align 16
+
+.align 32
.Linner4x:
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
- mov -16($np,$j,8),%rax
+ mov -16*2($np),%rax
adc \$0,%rdx
- add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
+ lea 32($tp),$tp
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
- mov -8($ap,$j,8),%rax
+ mov -8($ap,$j),%rax
adc \$0,%rdx
add $A[0],$N[0]
adc \$0,%rdx
- mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov $N[1],-32($tp) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
- mov -8($np,$j,8),%rax
+ mov -16*1($np),%rax
adc \$0,%rdx
- add -8(%rsp,$j,8),$A[1]
+ add -8($tp),$A[1]
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
- mov ($ap,$j,8),%rax
+ mov ($ap,$j),%rax
adc \$0,%rdx
add $A[1],$N[1]
adc \$0,%rdx
- mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov $N[0],-24($tp) # tp[j-1]
mov %rdx,$N[0]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
- mov ($np,$j,8),%rax
+ mov 16*0($np),%rax
adc \$0,%rdx
- add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ add ($tp),$A[0] # ap[j]*bp[i]+tp[j]
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
- mov 8($ap,$j,8),%rax
+ mov 8($ap,$j),%rax
adc \$0,%rdx
add $A[0],$N[0]
adc \$0,%rdx
- mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov $N[1],-16($tp) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
- mov 8($np,$j,8),%rax
+ mov 16*1($np),%rax
adc \$0,%rdx
- add 8(%rsp,$j,8),$A[1]
+ add 8($tp),$A[1]
adc \$0,%rdx
- lea 4($j),$j # j++
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
- mov -16($ap,$j,8),%rax
+ mov 16($ap,$j),%rax
adc \$0,%rdx
add $A[1],$N[1]
+ lea 16*4($np),$np
adc \$0,%rdx
- mov $N[0],-40(%rsp,$j,8) # tp[j-1]
+ mov $N[0],-8($tp) # tp[j-1]
mov %rdx,$N[0]
- cmp $num,$j
- jl .Linner4x
+
+ add \$32,$j # j+=4
+ jnz .Linner4x
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
- mov -16($np,$j,8),%rax
+ mov -16*2($np),%rax
adc \$0,%rdx
- add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
+ lea 32($tp),$tp
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
- mov -8($ap,$j,8),%rax
+ mov -8($ap),%rax
adc \$0,%rdx
add $A[0],$N[0]
adc \$0,%rdx
- mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov $N[1],-32($tp) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
- mov -8($np,$j,8),%rax
+ mov $m1,%rax
+ mov -16*1($np),$m1
adc \$0,%rdx
- add -8(%rsp,$j,8),$A[1]
+ add -8($tp),$A[1]
adc \$0,%rdx
- lea 1($i),$i # i++
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
- mov ($ap),%rax # ap[0]
+ mov ($ap,$num),%rax # ap[0]
adc \$0,%rdx
add $A[1],$N[1]
adc \$0,%rdx
- mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov $N[0],-24($tp) # tp[j-1]
mov %rdx,$N[0]
movq %xmm0,$m0 # bp[i+1]
- mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov $N[1],-16($tp) # tp[j-1]
+ lea ($np,$num,2),$np # rewind $np
xor $N[1],$N[1]
add $A[0],$N[0]
adc \$0,$N[1]
- add (%rsp,$num,8),$N[0] # pull upmost overflow bit
- adc \$0,$N[1]
- mov $N[0],-8(%rsp,$j,8)
- mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+ add ($tp),$N[0] # pull upmost overflow bit
+ adc \$0,$N[1] # upmost overflow bit
+ mov $N[0],-8($tp)
- cmp $num,$i
- jl .Louter4x
+ cmp 16+8(%rsp),$bp
+ jb .Louter4x
___
-{
-my @ri=("%rax","%rdx",$m0,$m1);
+if (1) {
$code.=<<___;
- mov 16(%rsp,$num,8),$rp # restore $rp
- mov 0(%rsp),@ri[0] # tp[0]
- pxor %xmm0,%xmm0
- mov 8(%rsp),@ri[1] # tp[1]
- shr \$2,$num # num/=4
- lea (%rsp),$ap # borrow ap for tp
- xor $i,$i # i=0 and clear CF!
-
- sub 0($np),@ri[0]
- mov 16($ap),@ri[2] # tp[2]
- mov 24($ap),@ri[3] # tp[3]
- sbb 8($np),@ri[1]
- lea -1($num),$j # j=num/4-1
+ sub $N[0],$m1 # compare top-most words
+ adc $j,$j # $j is zero
+ or $j,$N[1]
+ xor \$1,$N[1]
+ lea ($tp,$num),%rbx # tptr in .sqr4x_sub
+ lea ($np,$N[1],8),%rbp # nptr in .sqr4x_sub
+ mov %r9,%rcx
+ sar \$3+2,%rcx # cf=0
+ mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub
+ jmp .Lsqr4x_sub
+___
+} else {
+my @ri=("%rax",$bp,$m0,$m1);
+my $rp="%rdx";
+$code.=<<___
+ xor \$1,$N[1]
+ lea ($tp,$num),$tp # rewind $tp
+ sar \$5,$num # cf=0
+ lea ($np,$N[1],8),$np
+ mov 56+8(%rsp),$rp # restore $rp
jmp .Lsub4x
-.align 16
+
+.align 32
.Lsub4x:
- mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
- mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
- sbb 16($np,$i,8),@ri[2]
- mov 32($ap,$i,8),@ri[0] # tp[i+1]
- mov 40($ap,$i,8),@ri[1]
- sbb 24($np,$i,8),@ri[3]
- mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
- mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
- sbb 32($np,$i,8),@ri[0]
- mov 48($ap,$i,8),@ri[2]
- mov 56($ap,$i,8),@ri[3]
- sbb 40($np,$i,8),@ri[1]
- lea 4($i),$i # i++
- dec $j # doesn't affect CF!
+ .byte 0x66
+ mov 8*0($tp),@ri[0]
+ mov 8*1($tp),@ri[1]
+ .byte 0x66
+ sbb 16*0($np),@ri[0]
+ mov 8*2($tp),@ri[2]
+ sbb 16*1($np),@ri[1]
+ mov 3*8($tp),@ri[3]
+ lea 4*8($tp),$tp
+ sbb 16*2($np),@ri[2]
+ mov @ri[0],8*0($rp)
+ sbb 16*3($np),@ri[3]
+ lea 16*4($np),$np
+ mov @ri[1],8*1($rp)
+ mov @ri[2],8*2($rp)
+ mov @ri[3],8*3($rp)
+ lea 8*4($rp),$rp
+
+ inc $num
jnz .Lsub4x
- mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
- mov 32($ap,$i,8),@ri[0] # load overflow bit
- sbb 16($np,$i,8),@ri[2]
- mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
- sbb 24($np,$i,8),@ri[3]
- mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+ ret
+___
+}
+$code.=<<___;
+.size mul4x_internal,.-mul4x_internal
+___
+}}}
+ {{{
+######################################################################
+# void bn_power5(
+my $rptr="%rdi"; # BN_ULONG *rptr,
+my $aptr="%rsi"; # const BN_ULONG *aptr,
+my $bptr="%rdx"; # const void *table,
+my $nptr="%rcx"; # const BN_ULONG *nptr,
+my $n0 ="%r8"; # const BN_ULONG *n0);
+my $num ="%r9"; # int num, has to be divisible by 8
+ # int pwr
+
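Reading the commented fragments above together, the implied C prototype is roughly the following (a hedged reconstruction; see bn_exp.c for the actual caller):

    typedef unsigned long long BN_ULONG;  /* assumption: 64-bit limbs */

    void bn_power5(BN_ULONG *rptr, const BN_ULONG *aptr, const void *table,
                   const BN_ULONG *nptr, const BN_ULONG *n0,
                   int num,   /* has to be divisible by 8 */
                   int pwr);  /* 7th argument, passed on the stack */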
+my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
+my @A0=("%r10","%r11");
+my @A1=("%r12","%r13");
+my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
- sbb \$0,@ri[0] # handle upmost overflow bit
- mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
- xor $i,$i # i=0
- and @ri[0],$ap
- not @ri[0]
- mov $rp,$np
- and @ri[0],$np
- lea -1($num),$j
- or $np,$ap # ap=borrow?tp:rp
+$code.=<<___;
+.globl bn_power5
+.type bn_power5,\@function,6
+.align 32
+bn_power5:
+___
+$code.=<<___ if ($addx);
+ mov OPENSSL_ia32cap_P+8(%rip),%r11d
+ and \$0x80100,%r11d
+ cmp \$0x80100,%r11d
+ je .Lpowerx5_enter
+___
+$code.=<<___;
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+___
+$code.=<<___ if ($win64);
+ lea -0x28(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+___
+$code.=<<___;
+ mov ${num}d,%r10d
+ shl \$3,${num}d # convert $num to bytes
+ shl \$3+2,%r10d # 4*$num
+ neg $num
+ mov ($n0),$n0 # *n0
- movdqu ($ap),%xmm1
- movdqa %xmm0,(%rsp)
- movdqu %xmm1,($rp)
- jmp .Lcopy4x
-.align 16
-.Lcopy4x: # copy or in-place refresh
- movdqu 16($ap,$i),%xmm2
- movdqu 32($ap,$i),%xmm1
- movdqa %xmm0,16(%rsp,$i)
- movdqu %xmm2,16($rp,$i)
- movdqa %xmm0,32(%rsp,$i)
- movdqu %xmm1,32($rp,$i)
- lea 32($i),$i
- dec $j
- jnz .Lcopy4x
-
- shl \$2,$num
- movdqu 16($ap,$i),%xmm2
- movdqa %xmm0,16(%rsp,$i)
- movdqu %xmm2,16($rp,$i)
+ ##############################################################
+ # ensure that stack frame doesn't alias with $aptr+4*$num
+ # modulo 4096, which covers ret[num], am[num] and n[2*num]
+ # (see bn_exp.c). this is done to allow memory disambiguation
+ # logic to do its magic.
+ #
+ lea -64(%rsp,$num,2),%r11
+ sub $aptr,%r11
+ and \$4095,%r11
+ cmp %r11,%r10
+ jb .Lpwr_sp_alt
+ sub %r11,%rsp # align with $aptr
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ jmp .Lpwr_sp_done
+
+.align 32
+.Lpwr_sp_alt:
+ lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ sub %r10,%r11
+ mov \$0,%r10
+ cmovc %r10,%r11
+ sub %r11,%rsp
+.Lpwr_sp_done:
+ and \$-64,%rsp
+ mov $num,%r10
+ neg $num
+
+ ##############################################################
+ # Stack layout
+ #
+ # +0 saved $num, used in reduction section
+ # +8 &t[2*$num], used in reduction section
+ # +32 saved *n0
+ # +40 saved %rsp
+ # +48 t[2*$num]
+ #
+ mov $n0, 32(%rsp)
+ mov %rax, 40(%rsp) # save original %rsp
+.Lpower5_body:
+ movq $rptr,%xmm1 # save $rptr
+ movq $nptr,%xmm2 # save $nptr
+ movq %r10, %xmm3 # -$num
+ movq $bptr,%xmm4
+
+ call __bn_sqr8x_internal
+ call __bn_sqr8x_internal
+ call __bn_sqr8x_internal
+ call __bn_sqr8x_internal
+ call __bn_sqr8x_internal
+
+ mov %xmm2,$nptr
+ movq %xmm4,$bptr
+ mov $aptr,$rptr
+ mov 40(%rsp),%rax
+ lea 32(%rsp),$n0
+
+ call mul4x_internal
+
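The call sequence above is the whole point of bn_power5: five modular squarings followed by one Montgomery multiplication by a value gathered from the power table, i.e. one fixed-window step. A hedged sketch of what it computes (mont_sqr, mont_mul and limb_t are assumed helpers, not OpenSSL API):

    typedef unsigned long long limb_t;
    void mont_sqr(limb_t *r, const limb_t *a,
                  const limb_t *n, limb_t n0, int num);
    void mont_mul(limb_t *r, const limb_t *a, const limb_t *b,
                  const limb_t *n, limb_t n0, int num);

    /* One window step, acc = acc^32 * tbl_pwr, in the Montgomery domain;
     * this mirrors the five __bn_sqr8x_internal calls plus mul4x_internal. */
    static void power5_step(limb_t *acc, const limb_t *tbl_pwr,
                            const limb_t *n, limb_t n0, int num)
    {
        for (int k = 0; k < 5; k++)
            mont_sqr(acc, acc, n, n0, num);      /* acc = acc^2 mod n       */
        mont_mul(acc, acc, tbl_pwr, n, n0, num); /* acc = acc * a^pwr mod n */
    }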
+ mov 40(%rsp),%rsi # restore %rsp
+ mov \$1,%rax
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
+.Lpower5_epilogue:
+ ret
+.size bn_power5,.-bn_power5
+
+.globl bn_sqr8x_internal
+.hidden bn_sqr8x_internal
+.type bn_sqr8x_internal,\@abi-omnipotent
+.align 32
+bn_sqr8x_internal:
+__bn_sqr8x_internal:
+ ##############################################################
+ # Squaring part:
+ #
+ # a) multiply-n-add everything but a[i]*a[i];
+ # b) shift result of a) by 1 to the left and accumulate
+ # a[i]*a[i] products;
+ #
+ ##############################################################
+ # a[1]a[0]
+ # a[2]a[0]
+ # a[3]a[0]
+ # a[2]a[1]
+ # a[4]a[0]
+ # a[3]a[1]
+ # a[5]a[0]
+ # a[4]a[1]
+ # a[3]a[2]
+ # a[6]a[0]
+ # a[5]a[1]
+ # a[4]a[2]
+ # a[7]a[0]
+ # a[6]a[1]
+ # a[5]a[2]
+ # a[4]a[3]
+ # a[7]a[1]
+ # a[6]a[2]
+ # a[5]a[3]
+ # a[7]a[2]
+ # a[6]a[3]
+ # a[5]a[4]
+ # a[7]a[3]
+ # a[6]a[4]
+ # a[7]a[4]
+ # a[6]a[5]
+ # a[7]a[5]
+ # a[7]a[6]
+ # a[1]a[0]
+ # a[2]a[0]
+ # a[3]a[0]
+ # a[4]a[0]
+ # a[5]a[0]
+ # a[6]a[0]
+ # a[7]a[0]
+ # a[2]a[1]
+ # a[3]a[1]
+ # a[4]a[1]
+ # a[5]a[1]
+ # a[6]a[1]
+ # a[7]a[1]
+ # a[3]a[2]
+ # a[4]a[2]
+ # a[5]a[2]
+ # a[6]a[2]
+ # a[7]a[2]
+ # a[4]a[3]
+ # a[5]a[3]
+ # a[6]a[3]
+ # a[7]a[3]
+ # a[5]a[4]
+ # a[6]a[4]
+ # a[7]a[4]
+ # a[6]a[5]
+ # a[7]a[5]
+ # a[7]a[6]
+ # a[0]a[0]
+ # a[1]a[1]
+ # a[2]a[2]
+ # a[3]a[3]
+ # a[4]a[4]
+ # a[5]a[5]
+ # a[6]a[6]
+ # a[7]a[7]
+
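To make the schedule above concrete: step (a) accumulates every cross product a[i]*a[j], i > j, once, and step (b) doubles the whole vector and adds the diagonal a[i]^2 terms. A plain C stand-in using unsigned __int128 for the 64x64->128 products (illustrative only, carry handling far simpler than the assembly's):

    #include <stddef.h>

    static void sqr_schoolbook(unsigned long long *t,       /* t[2*num] */
                               const unsigned long long *a, size_t num)
    {
        unsigned __int128 acc;
        for (size_t k = 0; k < 2 * num; k++) t[k] = 0;

        for (size_t j = 0; j < num; j++)            /* (a) cross products */
            for (size_t i = j + 1; i < num; i++) {
                acc = (unsigned __int128)a[i] * a[j];
                for (size_t k = i + j; acc; k++) {  /* ripple carries up  */
                    acc += t[k];
                    t[k] = (unsigned long long)acc;
                    acc >>= 64;
                }
            }

        unsigned long long cf = 0;                  /* (b) t = 2*t ...    */
        for (size_t k = 0; k < 2 * num; k++) {
            unsigned long long top = t[k] >> 63;
            t[k] = (t[k] << 1) | cf;
            cf = top;
        }
        acc = 0;                                    /* ... + a[i]^2       */
        for (size_t i = 0; i < num; i++) {
            acc += (unsigned __int128)a[i] * a[i] + t[2 * i];
            t[2 * i] = (unsigned long long)acc;
            acc >>= 64;
            acc += t[2 * i + 1];
            t[2 * i + 1] = (unsigned long long)acc;
            acc >>= 64;
        }
    }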
+ lea 32(%r10),$i # $i=-($num-32)
+ lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2]
+
+ mov $num,$j # $j=$num
+
+ # comments apply to $num==8 case
+ mov -32($aptr,$i),$a0 # a[0]
+ lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr,$i),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr,$i),$ai # a[2]
+ mov %rax,$a1
+
+ mul $a0 # a[1]*a[0]
+ mov %rax,$A0[0] # a[1]*a[0]
+ mov $ai,%rax # a[2]
+ mov %rdx,$A0[1]
+ mov $A0[0],-24($tptr,$i) # t[1]
+
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ adc \$0,%rdx
+ mov $A0[1],-16($tptr,$i) # t[2]
+ mov %rdx,$A0[0]
+
+
+ mov -8($aptr,$i),$ai # a[3]
+ mul $a1 # a[2]*a[1]
+ mov %rax,$A1[0] # a[2]*a[1]+t[3]
+ mov $ai,%rax
+ mov %rdx,$A1[1]
+
+ lea ($i),$j
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[3]
+ jmp .Lsqr4x_1st
+
+.align 32
+.Lsqr4x_1st:
+ mov ($aptr,$j),$ai # a[4]
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1] # a[3]*a[1]+t[4]
+ mov $ai,%rax
+ mov %rdx,$A1[0]
+ adc \$0,$A1[0]
+
+ mul $a0 # a[4]*a[0]
+ add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
+ mov $ai,%rax # a[3]
+ mov 8($aptr,$j),$ai # a[5]
+ mov %rdx,$A0[0]
+ adc \$0,$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+
+
+ mul $a1 # a[4]*a[3]
+ add %rax,$A1[0] # a[4]*a[3]+t[5]
+ mov $ai,%rax
+ mov $A0[1],($tptr,$j) # t[4]
+ mov %rdx,$A1[1]
+ adc \$0,$A1[1]
+
+ mul $a0 # a[5]*a[2]
+ add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
+ mov $ai,%rax
+ mov 16($aptr,$j),$ai # a[6]
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+
+ mul $a1 # a[5]*a[3]
+ add %rax,$A1[1] # a[5]*a[3]+t[6]
+ mov $ai,%rax
+ mov $A0[0],8($tptr,$j) # t[5]
+ mov %rdx,$A1[0]
+ adc \$0,$A1[0]
+
+ mul $a0 # a[6]*a[2]
+ add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6]
+ mov $ai,%rax # a[3]
+ mov 24($aptr,$j),$ai # a[7]
+ mov %rdx,$A0[0]
+ adc \$0,$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+
+
+ mul $a1 # a[6]*a[5]
+ add %rax,$A1[0] # a[6]*a[5]+t[7]
+ mov $ai,%rax
+ mov $A0[1],16($tptr,$j) # t[6]
+ mov %rdx,$A1[1]
+ adc \$0,$A1[1]
+ lea 32($j),$j
+
+ mul $a0 # a[7]*a[4]
+ add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6]
+ mov $ai,%rax
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[7]
+
+ cmp \$0,$j
+ jne .Lsqr4x_1st
+
+ mul $a1 # a[7]*a[5]
+ add %rax,$A1[1]
+ lea 16($i),$i
+ adc \$0,%rdx
+ add $A0[1],$A1[1]
+ adc \$0,%rdx
+
+ mov $A1[1],($tptr) # t[8]
+ mov %rdx,$A1[0]
+ mov %rdx,8($tptr) # t[9]
+ jmp .Lsqr4x_outer
+
+.align 32
+.Lsqr4x_outer: # comments apply to $num==6 case
+ mov -32($aptr,$i),$a0 # a[0]
+ lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr,$i),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr,$i),$ai # a[2]
+ mov %rax,$a1
+
+ mul $a0 # a[1]*a[0]
+ mov -24($tptr,$i),$A0[0] # t[1]
+ add %rax,$A0[0] # a[1]*a[0]+t[1]
+ mov $ai,%rax # a[2]
+ adc \$0,%rdx
+ mov $A0[0],-24($tptr,$i) # t[1]
+ mov %rdx,$A0[1]
+
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ adc \$0,%rdx
+ add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2]
+ mov %rdx,$A0[0]
+ adc \$0,$A0[0]
+ mov $A0[1],-16($tptr,$i) # t[2]
+
+ xor $A1[0],$A1[0]
+
+ mov -8($aptr,$i),$ai # a[3]
+ mul $a1 # a[2]*a[1]
+ add %rax,$A1[0] # a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc \$0,%rdx
+ add -8($tptr,$i),$A1[0]
+ mov %rdx,$A1[1]
+ adc \$0,$A1[1]
+
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc \$0,%rdx
+ add $A1[0],$A0[0]
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+ mov $A0[0],-8($tptr,$i) # t[3]
+
+ lea ($i),$j
+ jmp .Lsqr4x_inner
+
+.align 32
+.Lsqr4x_inner:
+ mov ($aptr,$j),$ai # a[4]
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1] # a[3]*a[1]+t[4]
+ mov $ai,%rax
+ mov %rdx,$A1[0]
+ adc \$0,$A1[0]
+ add ($tptr,$j),$A1[1]
+ adc \$0,$A1[0]
+
+ .byte 0x67
+ mul $a0 # a[4]*a[0]
+ add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
+ mov $ai,%rax # a[3]
+ mov 8($aptr,$j),$ai # a[5]
+ mov %rdx,$A0[0]
+ adc \$0,$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+
+ mul $a1 # a[4]*a[3]
+ add %rax,$A1[0] # a[4]*a[3]+t[5]
+ mov $A0[1],($tptr,$j) # t[4]
+ mov $ai,%rax
+ mov %rdx,$A1[1]
+ adc \$0,$A1[1]
+ add 8($tptr,$j),$A1[0]
+ lea 16($j),$j # j++
+ adc \$0,$A1[1]
+
+ mul $a0 # a[5]*a[2]
+ add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
+ mov $ai,%rax
+ adc \$0,%rdx
+ add $A1[0],$A0[0]
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below
+
+ cmp \$0,$j
+ jne .Lsqr4x_inner
+
+ .byte 0x67
+ mul $a1 # a[5]*a[3]
+ add %rax,$A1[1]
+ adc \$0,%rdx
+ add $A0[1],$A1[1]
+ adc \$0,%rdx
+
+ mov $A1[1],($tptr) # t[6], "preloaded t[2]" below
+ mov %rdx,$A1[0]
+ mov %rdx,8($tptr) # t[7], "preloaded t[3]" below
+
+ add \$16,$i
+ jnz .Lsqr4x_outer
+
+ # comments apply to $num==4 case
+ mov -32($aptr),$a0 # a[0]
+ lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr),$ai # a[2]
+ mov %rax,$a1
+
+ mul $a0 # a[1]*a[0]
+ add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1]
+ mov $ai,%rax # a[2]
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ mov $A0[0],-24($tptr) # t[1]
+ mov %rdx,$A0[0]
+ adc \$0,$A0[0]
+ add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2]
+ mov -8($aptr),$ai # a[3]
+ adc \$0,$A0[0]
+
+ mul $a1 # a[2]*a[1]
+ add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3]
+ mov $ai,%rax