From 317be63875e59efa34be0075eaff3c033ef6969f Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Tue, 26 Jan 2016 16:50:10 +0100 Subject: bn/asm/x86_64-mont5.pl: unify gather procedure in hardly used path and reorganize/harmonize post-conditions. Additional hardening following on from CVE-2016-0702 Reviewed-by: Richard Levitte Reviewed-by: Rich Salz --- crypto/bn/asm/x86_64-mont.pl | 188 +++++++++++++++++++++++++++++-------------- 1 file changed, 126 insertions(+), 62 deletions(-) (limited to 'crypto/bn/asm/x86_64-mont.pl') diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl index 69232cbeb6..29ba1224e3 100755 --- a/crypto/bn/asm/x86_64-mont.pl +++ b/crypto/bn/asm/x86_64-mont.pl @@ -795,7 +795,7 @@ bn_sqr8x_mont: sub %r11,%rsp .Lsqr8x_sp_done: and \$-64,%rsp - mov $num,%r10 + mov $num,%r10 neg $num mov $n0, 32(%rsp) @@ -814,34 +814,87 @@ $code.=<<___ if ($addx); jne .Lsqr8x_nox call bn_sqrx8x_internal # see x86_64-mont5 module - - pxor %xmm0,%xmm0 - lea 48(%rsp),%rax - shr \$3+2,$num - mov 40(%rsp),%rsi # restore %rsp - jmp .Lsqr8x_zero + # %rax top-most carry + # %rbp nptr + # %rcx -8*num + # %r8 end of tp[2*num] + lea (%r8,%rcx),%rbx + mov %rcx,$num + mov %rcx,%rdx + movq %xmm1,$rptr + sar \$3+2,%rcx # %cf=0 + jmp .Lsqr8x_sub .align 32 .Lsqr8x_nox: ___ $code.=<<___; call bn_sqr8x_internal # see x86_64-mont5 module + # %rax top-most carry + # %rbp nptr + # %r8 -8*num + # %rdi end of tp[2*num] + lea (%rdi,$num),%rbx + mov $num,%rcx + mov $num,%rdx + movq %xmm1,$rptr + sar \$3+2,%rcx # %cf=0 + jmp .Lsqr8x_sub +.align 32 +.Lsqr8x_sub: + mov 8*0(%rbx),%r12 + mov 8*1(%rbx),%r13 + mov 8*2(%rbx),%r14 + mov 8*3(%rbx),%r15 + lea 8*4(%rbx),%rbx + sbb 8*0(%rbp),%r12 + sbb 8*1(%rbp),%r13 + sbb 8*2(%rbp),%r14 + sbb 8*3(%rbp),%r15 + lea 8*4(%rbp),%rbp + mov %r12,8*0($rptr) + mov %r13,8*1($rptr) + mov %r14,8*2($rptr) + mov %r15,8*3($rptr) + lea 8*4($rptr),$rptr + inc %rcx # preserves %cf + jnz .Lsqr8x_sub + + sbb \$0,%rax # top-most carry + lea (%rbx,$num),%rbx # rewind + lea ($rptr,$num),$rptr # rewind + + movq %rax,%xmm1 pxor %xmm0,%xmm0 - lea 48(%rsp),%rax - shr \$3+2,$num + pshufd \$0,%xmm1,%xmm1 mov 40(%rsp),%rsi # restore %rsp - jmp .Lsqr8x_zero + jmp .Lsqr8x_cond_copy .align 32 -.Lsqr8x_zero: - movdqa %xmm0,16*0(%rax) # wipe t - movdqa %xmm0,16*1(%rax) - movdqa %xmm0,16*2(%rax) - movdqa %xmm0,16*3(%rax) - lea 16*4(%rax),%rax - dec $num - jnz .Lsqr8x_zero +.Lsqr8x_cond_copy: + movdqa 16*0(%rbx),%xmm2 + movdqa 16*1(%rbx),%xmm3 + lea 16*2(%rbx),%rbx + movdqu 16*0($rptr),%xmm4 + movdqu 16*1($rptr),%xmm5 + lea 16*2($rptr),$rptr + movdqa %xmm0,-16*2(%rbx) # zero tp + movdqa %xmm0,-16*1(%rbx) + movdqa %xmm0,-16*2(%rbx,%rdx) + movdqa %xmm0,-16*1(%rbx,%rdx) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-16*2($rptr) + movdqu %xmm5,-16*1($rptr) + add \$32,$num + jnz .Lsqr8x_cond_copy mov \$1,%rax mov -48(%rsi),%r15 @@ -1108,64 +1161,75 @@ $code.=<<___; adc $zero,%r15 # modulo-scheduled sub 0*8($tptr),$zero # pull top-most carry adc %r15,%r14 - mov -8($nptr),$mi sbb %r15,%r15 # top-most carry mov %r14,-1*8($tptr) cmp 16(%rsp),$bptr jne .Lmulx4x_outer - sub %r14,$mi # compare top-most words - sbb $mi,$mi - or $mi,%r15 - - neg $num - xor %rdx,%rdx + lea 64(%rsp),$tptr + sub $num,$nptr # rewind $nptr + neg %r15 + mov $num,%rdx + shr \$3+2,$num # %cf=0 mov 32(%rsp),$rptr # restore rp + jmp .Lmulx4x_sub + +.align 32 +.Lmulx4x_sub: + mov 8*0($tptr),%r11 + mov 8*1($tptr),%r12 + mov 8*2($tptr),%r13 + mov 8*3($tptr),%r14 + lea 8*4($tptr),$tptr + sbb 8*0($nptr),%r11 + sbb 8*1($nptr),%r12 + sbb 8*2($nptr),%r13 + sbb 8*3($nptr),%r14 + lea 8*4($nptr),$nptr + mov %r11,8*0($rptr) + mov %r12,8*1($rptr) + mov %r13,8*2($rptr) + mov %r14,8*3($rptr) + lea 8*4($rptr),$rptr + dec $num # preserves %cf + jnz .Lmulx4x_sub + + sbb \$0,%r15 # top-most carry lea 64(%rsp),$tptr + sub %rdx,$rptr # rewind + movq %r15,%xmm1 pxor %xmm0,%xmm0 - mov 0*8($nptr,$num),%r8 - mov 1*8($nptr,$num),%r9 - neg %r8 - jmp .Lmulx4x_sub_entry + pshufd \$0,%xmm1,%xmm1 + mov 40(%rsp),%rsi # restore %rsp + jmp .Lmulx4x_cond_copy .align 32 -.Lmulx4x_sub: - mov 0*8($nptr,$num),%r8 - mov 1*8($nptr,$num),%r9 - not %r8 -.Lmulx4x_sub_entry: - mov 2*8($nptr,$num),%r10 - not %r9 - and %r15,%r8 - mov 3*8($nptr,$num),%r11 - not %r10 - and %r15,%r9 - not %r11 - and %r15,%r10 - and %r15,%r11 - - neg %rdx # mov %rdx,%cf - adc 0*8($tptr),%r8 - adc 1*8($tptr),%r9 - movdqa %xmm0,($tptr) - adc 2*8($tptr),%r10 - adc 3*8($tptr),%r11 - movdqa %xmm0,16($tptr) - lea 4*8($tptr),$tptr - sbb %rdx,%rdx # mov %cf,%rdx - - mov %r8,0*8($rptr) - mov %r9,1*8($rptr) - mov %r10,2*8($rptr) - mov %r11,3*8($rptr) - lea 4*8($rptr),$rptr +.Lmulx4x_cond_copy: + movdqa 16*0($tptr),%xmm2 + movdqa 16*1($tptr),%xmm3 + lea 16*2($tptr),$tptr + movdqu 16*0($rptr),%xmm4 + movdqu 16*1($rptr),%xmm5 + lea 16*2($rptr),$rptr + movdqa %xmm0,-16*2($tptr) # zero tp + movdqa %xmm0,-16*1($tptr) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-16*2($rptr) + movdqu %xmm5,-16*1($rptr) + sub \$32,%rdx + jnz .Lmulx4x_cond_copy - add \$32,$num - jnz .Lmulx4x_sub + mov %rdx,($tptr) - mov 40(%rsp),%rsi # restore %rsp mov \$1,%rax mov -48(%rsi),%r15 mov -40(%rsi),%r14 -- cgit v1.2.3