diff options
Diffstat (limited to 'crypto/bn')
-rwxr-xr-x | crypto/bn/asm/x86_64-mont5.pl | 196 | ||||
-rw-r--r-- | crypto/bn/bn_exp.c | 44 |
2 files changed, 26 insertions, 214 deletions
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl index c03473f13a..1faea0bcf8 100755 --- a/crypto/bn/asm/x86_64-mont5.pl +++ b/crypto/bn/asm/x86_64-mont5.pl @@ -2103,193 +2103,6 @@ __bn_post4x_internal: .size __bn_post4x_internal,.-__bn_post4x_internal ___ } -{ -$code.=<<___; -.globl bn_from_montgomery -.type bn_from_montgomery,\@abi-omnipotent -.align 32 -bn_from_montgomery: -.cfi_startproc - testl \$7,`($win64?"48(%rsp)":"%r9d")` - jz bn_from_mont8x - xor %eax,%eax - ret -.cfi_endproc -.size bn_from_montgomery,.-bn_from_montgomery - -.type bn_from_mont8x,\@function,6 -.align 32 -bn_from_mont8x: -.cfi_startproc - .byte 0x67 - mov %rsp,%rax -.cfi_def_cfa_register %rax - push %rbx -.cfi_push %rbx - push %rbp -.cfi_push %rbp - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 -.Lfrom_prologue: - - shl \$3,${num}d # convert $num to bytes - lea ($num,$num,2),%r10 # 3*$num in bytes - neg $num - mov ($n0),$n0 # *n0 - - ############################################################## - # Ensure that stack frame doesn't alias with $rptr+3*$num - # modulo 4096, which covers ret[num], am[num] and n[num] - # (see bn_exp.c). The stack is allocated to aligned with - # bn_power5's frame, and as bn_from_montgomery happens to be - # last operation, we use the opportunity to cleanse it. - # - lea -320(%rsp,$num,2),%r11 - mov %rsp,%rbp - sub $rptr,%r11 - and \$4095,%r11 - cmp %r11,%r10 - jb .Lfrom_sp_alt - sub %r11,%rbp # align with $aptr - lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) - jmp .Lfrom_sp_done - -.align 32 -.Lfrom_sp_alt: - lea 4096-320(,$num,2),%r10 - lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) - sub %r10,%r11 - mov \$0,%r10 - cmovc %r10,%r11 - sub %r11,%rbp -.Lfrom_sp_done: - and \$-64,%rbp - mov %rsp,%r11 - sub %rbp,%r11 - and \$-4096,%r11 - lea (%rbp,%r11),%rsp - mov (%rsp),%r10 - cmp %rbp,%rsp - ja .Lfrom_page_walk - jmp .Lfrom_page_walk_done - -.Lfrom_page_walk: - lea -4096(%rsp),%rsp - mov (%rsp),%r10 - cmp %rbp,%rsp - ja .Lfrom_page_walk -.Lfrom_page_walk_done: - - mov $num,%r10 - neg $num - - ############################################################## - # Stack layout - # - # +0 saved $num, used in reduction section - # +8 &t[2*$num], used in reduction section - # +32 saved *n0 - # +40 saved %rsp - # +48 t[2*$num] - # - mov $n0, 32(%rsp) - mov %rax, 40(%rsp) # save original %rsp -.cfi_cfa_expression %rsp+40,deref,+8 -.Lfrom_body: - mov $num,%r11 - lea 48(%rsp),%rax - pxor %xmm0,%xmm0 - jmp .Lmul_by_1 - -.align 32 -.Lmul_by_1: - movdqu ($aptr),%xmm1 - movdqu 16($aptr),%xmm2 - movdqu 32($aptr),%xmm3 - movdqa %xmm0,(%rax,$num) - movdqu 48($aptr),%xmm4 - movdqa %xmm0,16(%rax,$num) - .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 # lea 64($aptr),$aptr - movdqa %xmm1,(%rax) - movdqa %xmm0,32(%rax,$num) - movdqa %xmm2,16(%rax) - movdqa %xmm0,48(%rax,$num) - movdqa %xmm3,32(%rax) - movdqa %xmm4,48(%rax) - lea 64(%rax),%rax - sub \$64,%r11 - jnz .Lmul_by_1 - - movq $rptr,%xmm1 - movq $nptr,%xmm2 - .byte 0x67 - mov $nptr,%rbp - movq %r10, %xmm3 # -num -___ -$code.=<<___ if ($addx); - mov OPENSSL_ia32cap_P+8(%rip),%r11d - and \$0x80108,%r11d - cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 - jne .Lfrom_mont_nox - - lea (%rax,$num),$rptr - call __bn_sqrx8x_reduction - call __bn_postx4x_internal - - pxor %xmm0,%xmm0 - lea 48(%rsp),%rax - jmp .Lfrom_mont_zero - -.align 32 -.Lfrom_mont_nox: -___ -$code.=<<___; - call __bn_sqr8x_reduction - call __bn_post4x_internal - - pxor %xmm0,%xmm0 - lea 48(%rsp),%rax - jmp .Lfrom_mont_zero - -.align 32 -.Lfrom_mont_zero: - mov 40(%rsp),%rsi # restore %rsp -.cfi_def_cfa %rsi,8 - movdqa %xmm0,16*0(%rax) - movdqa %xmm0,16*1(%rax) - movdqa %xmm0,16*2(%rax) - movdqa %xmm0,16*3(%rax) - lea 16*4(%rax),%rax - sub \$32,$num - jnz .Lfrom_mont_zero - - mov \$1,%rax - mov -48(%rsi),%r15 -.cfi_restore %r15 - mov -40(%rsi),%r14 -.cfi_restore %r14 - mov -32(%rsi),%r13 -.cfi_restore %r13 - mov -24(%rsi),%r12 -.cfi_restore %r12 - mov -16(%rsi),%rbp -.cfi_restore %rbp - mov -8(%rsi),%rbx -.cfi_restore %rbx - lea (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lfrom_epilogue: - ret -.cfi_endproc -.size bn_from_mont8x,.-bn_from_mont8x -___ -} }}} if ($addx) {{{ @@ -3896,10 +3709,6 @@ mul_handler: .rva .LSEH_begin_bn_power5 .rva .LSEH_end_bn_power5 .rva .LSEH_info_bn_power5 - - .rva .LSEH_begin_bn_from_mont8x - .rva .LSEH_end_bn_from_mont8x - .rva .LSEH_info_bn_from_mont8x ___ $code.=<<___ if ($addx); .rva .LSEH_begin_bn_mulx4x_mont_gather5 @@ -3931,11 +3740,6 @@ $code.=<<___; .byte 9,0,0,0 .rva mul_handler .rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[] -.align 8 -.LSEH_info_bn_from_mont8x: - .byte 9,0,0,0 - .rva mul_handler - .rva .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue # HandlerData[] ___ $code.=<<___ if ($addx); .align 8 diff --git a/crypto/bn/bn_exp.c b/crypto/bn/bn_exp.c index 89ce77fbb7..c7b62232f3 100644 --- a/crypto/bn/bn_exp.c +++ b/crypto/bn/bn_exp.c @@ -897,14 +897,21 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, #if defined(OPENSSL_BN_ASM_MONT5) if (window == 5 && top > 1) { /* - * This optimization uses ideas from http://eprint.iacr.org/2011/239, - * specifically optimization of cache-timing attack countermeasures - * and pre-computation optimization. - */ - - /* - * Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as - * 512-bit RSA is hardly relevant, we omit it to spare size... + * This optimization uses ideas from https://eprint.iacr.org/2011/239, + * specifically optimization of cache-timing attack countermeasures, + * pre-computation optimization, and Almost Montgomery Multiplication. + * + * The paper discusses a 4-bit window to optimize 512-bit modular + * exponentiation, used in RSA-1024 with CRT, but RSA-1024 is no longer + * important. + * + * |bn_mul_mont_gather5| and |bn_power5| implement the "almost" + * reduction variant, so the values here may not be fully reduced. + * They are bounded by R (i.e. they fit in |top| words), not |m|. + * Additionally, we pass these "almost" reduced inputs into + * |bn_mul_mont|, which implements the normal reduction variant. + * Given those inputs, |bn_mul_mont| may not give reduced + * output, but it will still produce "almost" reduced output. */ void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap, const void *table, const BN_ULONG *np, @@ -916,9 +923,6 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, const void *table, const BN_ULONG *np, const BN_ULONG *n0, int num, int power); int bn_get_bits5(const BN_ULONG *ap, int off); - int bn_from_montgomery(BN_ULONG *rp, const BN_ULONG *ap, - const BN_ULONG *not_used, const BN_ULONG *np, - const BN_ULONG *n0, int num); BN_ULONG *n0 = mont->n0, *np; @@ -1007,14 +1011,18 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, } } - ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np, n0, top); tmp.top = top; - bn_correct_top(&tmp); - if (ret) { - if (!BN_copy(rr, &tmp)) - ret = 0; - goto err; /* non-zero ret means it's not error */ - } + /* + * The result is now in |tmp| in Montgomery form, but it may not be + * fully reduced. This is within bounds for |BN_from_montgomery| + * (tmp < R <= m*R) so it will, when converting from Montgomery form, + * produce a fully reduced result. + * + * This differs from Figure 2 of the paper, which uses AMM(h, 1) to + * convert from Montgomery form with unreduced output, followed by an + * extra reduction step. In the paper's terminology, we replace + * steps 9 and 10 with MM(h, 1). + */ } else #endif { |