From 8329e2e776176ce6ff9d7c48c7182943875065bd Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Mon, 17 Oct 2011 17:41:49 +0000 Subject: bn_exp.c: further optimizations using more ideas from http://eprint.iacr.org/2011/239. --- crypto/bn/asm/x86_64-mont5.pl | 83 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 76 insertions(+), 7 deletions(-) (limited to 'crypto/bn/asm/x86_64-mont5.pl') diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl index d6e4c63df1..057cda28aa 100755 --- a/crypto/bn/asm/x86_64-mont5.pl +++ b/crypto/bn/asm/x86_64-mont5.pl @@ -607,7 +607,6 @@ $code.=<<___; add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] lea 4($j),$j # j+=2 adc \$0,%rdx - mov $N[1],(%rsp) # tp[j-1] mov %rdx,$N[0] jmp .Linner4x .align 16 @@ -626,7 +625,7 @@ $code.=<<___; adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx - mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov $N[1],-32(%rsp,$j,8) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] @@ -643,7 +642,7 @@ $code.=<<___; adc \$0,%rdx add $A[1],$N[1] adc \$0,%rdx - mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov $N[0],-24(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] mulq $m0 # ap[j]*bp[i] @@ -660,7 +659,7 @@ $code.=<<___; adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx - mov $N[0],-8(%rsp,$j,8) # tp[j-1] + mov $N[1],-16(%rsp,$j,8) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] @@ -678,7 +677,7 @@ $code.=<<___; adc \$0,%rdx add $A[1],$N[1] adc \$0,%rdx - mov $N[1],-32(%rsp,$j,8) # tp[j-1] + mov $N[0],-40(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] cmp $num,$j jl .Linner4x @@ -697,7 +696,7 @@ $code.=<<___; adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx - mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov $N[1],-32(%rsp,$j,8) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] @@ -715,10 +714,11 @@ $code.=<<___; adc \$0,%rdx add $A[1],$N[1] adc \$0,%rdx - mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov $N[0],-24(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] movq %xmm0,$m0 # bp[i+1] + mov $N[1],-16(%rsp,$j,8) # tp[j-1] xor $N[1],$N[1] add $A[0],$N[0] @@ -831,6 +831,10 @@ ___ { my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order ("%rdi","%rsi","%rdx","%rcx"); # Unix order +my $out=$inp; +my $STRIDE=2**5*8; +my $N=$STRIDE/4; + $code.=<<___; .globl bn_scatter5 .type bn_scatter5,\@abi-omnipotent @@ -849,6 +853,61 @@ bn_scatter5: .Lscatter_epilogue: ret .size bn_scatter5,.-bn_scatter5 + +.globl bn_gather5 +.type bn_gather5,\@abi-omnipotent +.align 16 +bn_gather5: +___ +$code.=<<___ if ($win64); +.LSEH_begin_bn_gather5: + # I can't trust assembler to use specific encoding:-( + .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp + .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) + .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) +___ +$code.=<<___; + mov $idx,%r11 + shr \$`log($N/8)/log(2)`,$idx + and \$`$N/8-1`,%r11 + not $idx + lea .Lmagic_masks(%rip),%rax + and \$`2**5/($N/8)-1`,$idx # 5 is "window size" + lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line + movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which + movq 8(%rax,$idx,8),%xmm5 # cache line contains element + movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument + movq 24(%rax,$idx,8),%xmm7 + jmp .Lgather +.align 16 +.Lgather: + movq `0*$STRIDE/4-96`($tbl),%xmm0 + movq `1*$STRIDE/4-96`($tbl),%xmm1 + pand %xmm4,%xmm0 + movq `2*$STRIDE/4-96`($tbl),%xmm2 + pand %xmm5,%xmm1 + movq `3*$STRIDE/4-96`($tbl),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + por %xmm2,%xmm0 + lea $STRIDE($tbl),$tbl + por %xmm3,%xmm0 + + movq %xmm0,($out) # m0=bp[0] + lea 8($out),$out + sub \$1,$num + jnz .Lgather +___ +$code.=<<___ if ($win64); + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + lea 0x28(%rsp),%rsp +___ +$code.=<<___; + ret +.LSEH_end_bn_gather5: +.size bn_gather5,.-bn_gather5 ___ } $code.=<<___; @@ -980,6 +1039,10 @@ mul_handler: .rva .LSEH_end_bn_mul4x_mont_gather5 .rva .LSEH_info_bn_mul4x_mont_gather5 + .rva .LSEH_begin_bn_gather5 + .rva .LSEH_end_bn_gather5 + .rva .LSEH_info_bn_gather5 + .section .xdata .align 8 .LSEH_info_bn_mul_mont_gather5: @@ -992,6 +1055,12 @@ mul_handler: .rva mul_handler .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] .align 8 +.LSEH_info_bn_gather5: + .byte 0x01,0x0d,0x05,0x00 + .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 + .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 + .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28 +.align 8 ___ } -- cgit v1.2.3