author    Tomas Mraz <tomas@openssl.org>  2022-06-09 12:34:55 +0200
committer Tomas Mraz <tomas@openssl.org>  2022-06-16 15:22:35 +0200
commit    0ae365e1f80648f4c52aa3ac9bbc279b6192b23e
tree      3c64a03999de0210c8b171207d79a5d119c3022b /crypto/bn
parent    b2feb9f0e394da6570346598837f1b01eb58c028
Always end BN_mod_exp_mont_consttime with normal Montgomery reduction.
This partially fixes a bug where, on x86_64, BN_mod_exp_mont_consttime
would sometimes return m, the modulus, when it should have returned
zero. Thanks to Guido Vranken for reporting it. It is only a partial
fix because the same bug also exists in the "rsaz" codepath.

The bug only affects zero outputs (with non-zero inputs), so we believe
it has no security impact on our cryptographic functions.

The fix is to delete lowercase bn_from_montgomery altogether, and have
the mont5 path use the same BN_from_montgomery ending as the non-mont5
path. This only impacts the final step of the whole exponentiation and
has no measurable perf impact.

See the original BoringSSL commit
https://boringssl.googlesource.com/boringssl/+/13c9d5c69d04485a7a8840c12185c832026c8315
for further analysis.

Original-author: David Benjamin <davidben@google.com>
Reviewed-by: Matt Caswell <matt@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/18510)
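The failure mode is easiest to exercise with a modulus that divides a
power of the base: for odd q and m = q*q, q^e mod m is exactly zero for
any e >= 2, yet a buggy final reduction could return m instead. The
regression-style sketch below illustrates this; it is not the project's
actual test, and the operand sizes are assumptions chosen so that builds
with the mont5 assembly take the window == 5, top > 1 path. (A sketch of
the underlying reduction-bound argument follows the diff below.)

#include <assert.h>
#include <openssl/bn.h>

/*
 * Regression-style sketch (not the project's actual test).  With an odd
 * q and m = q*q, q^e mod m is exactly 0 for any e >= 2; a buggy final
 * Montgomery conversion could return m itself instead of 0.  The sizes
 * are illustrative assumptions: a ~512-bit q makes top > 1, and a
 * ~400-bit exponent selects the window == 5 mont5 path on x86_64.
 * Error handling is collapsed into assert() for brevity.
 */
int main(void)
{
    BN_CTX *ctx = BN_CTX_new();
    BIGNUM *q = BN_new(), *m = BN_new(), *e = BN_new(), *r = BN_new();

    assert(ctx != NULL && q != NULL && m != NULL && e != NULL && r != NULL);

    /* Random odd q with the top bit set; m = q^2 is odd, as Montgomery
     * arithmetic requires. */
    assert(BN_rand(q, 512, BN_RAND_TOP_ONE, BN_RAND_BOTTOM_ODD));
    assert(BN_sqr(m, q, ctx));
    assert(BN_rand(e, 400, BN_RAND_TOP_ONE, BN_RAND_BOTTOM_ANY));

    /* q^e == 0 (mod q^2) since e >= 2; the result must be a true zero,
     * not the modulus. */
    assert(BN_mod_exp_mont_consttime(r, q, e, m, ctx, NULL));
    assert(BN_is_zero(r));

    BN_free(r);
    BN_free(e);
    BN_free(m);
    BN_free(q);
    BN_CTX_free(ctx);
    return 0;
}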
Diffstat (limited to 'crypto/bn')
-rwxr-xr-x  crypto/bn/asm/x86_64-mont5.pl  196
-rw-r--r--  crypto/bn/bn_exp.c              44
2 files changed, 26 insertions, 214 deletions
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
index c03473f13a..1faea0bcf8 100755
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -2103,193 +2103,6 @@ __bn_post4x_internal:
.size __bn_post4x_internal,.-__bn_post4x_internal
___
}
-{
-$code.=<<___;
-.globl bn_from_montgomery
-.type bn_from_montgomery,\@abi-omnipotent
-.align 32
-bn_from_montgomery:
-.cfi_startproc
- testl \$7,`($win64?"48(%rsp)":"%r9d")`
- jz bn_from_mont8x
- xor %eax,%eax
- ret
-.cfi_endproc
-.size bn_from_montgomery,.-bn_from_montgomery
-
-.type bn_from_mont8x,\@function,6
-.align 32
-bn_from_mont8x:
-.cfi_startproc
- .byte 0x67
- mov %rsp,%rax
-.cfi_def_cfa_register %rax
- push %rbx
-.cfi_push %rbx
- push %rbp
-.cfi_push %rbp
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
-.Lfrom_prologue:
-
- shl \$3,${num}d # convert $num to bytes
- lea ($num,$num,2),%r10 # 3*$num in bytes
- neg $num
- mov ($n0),$n0 # *n0
-
- ##############################################################
- # Ensure that stack frame doesn't alias with $rptr+3*$num
- # modulo 4096, which covers ret[num], am[num] and n[num]
- # (see bn_exp.c). The stack is allocated to aligned with
- # bn_power5's frame, and as bn_from_montgomery happens to be
- # last operation, we use the opportunity to cleanse it.
- #
- lea -320(%rsp,$num,2),%r11
- mov %rsp,%rbp
- sub $rptr,%r11
- and \$4095,%r11
- cmp %r11,%r10
- jb .Lfrom_sp_alt
- sub %r11,%rbp # align with $aptr
- lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
- jmp .Lfrom_sp_done
-
-.align 32
-.Lfrom_sp_alt:
- lea 4096-320(,$num,2),%r10
- lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
- sub %r10,%r11
- mov \$0,%r10
- cmovc %r10,%r11
- sub %r11,%rbp
-.Lfrom_sp_done:
- and \$-64,%rbp
- mov %rsp,%r11
- sub %rbp,%r11
- and \$-4096,%r11
- lea (%rbp,%r11),%rsp
- mov (%rsp),%r10
- cmp %rbp,%rsp
- ja .Lfrom_page_walk
- jmp .Lfrom_page_walk_done
-
-.Lfrom_page_walk:
- lea -4096(%rsp),%rsp
- mov (%rsp),%r10
- cmp %rbp,%rsp
- ja .Lfrom_page_walk
-.Lfrom_page_walk_done:
-
- mov $num,%r10
- neg $num
-
- ##############################################################
- # Stack layout
- #
- # +0 saved $num, used in reduction section
- # +8 &t[2*$num], used in reduction section
- # +32 saved *n0
- # +40 saved %rsp
- # +48 t[2*$num]
- #
- mov $n0, 32(%rsp)
- mov %rax, 40(%rsp) # save original %rsp
-.cfi_cfa_expression %rsp+40,deref,+8
-.Lfrom_body:
- mov $num,%r11
- lea 48(%rsp),%rax
- pxor %xmm0,%xmm0
- jmp .Lmul_by_1
-
-.align 32
-.Lmul_by_1:
- movdqu ($aptr),%xmm1
- movdqu 16($aptr),%xmm2
- movdqu 32($aptr),%xmm3
- movdqa %xmm0,(%rax,$num)
- movdqu 48($aptr),%xmm4
- movdqa %xmm0,16(%rax,$num)
- .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 # lea 64($aptr),$aptr
- movdqa %xmm1,(%rax)
- movdqa %xmm0,32(%rax,$num)
- movdqa %xmm2,16(%rax)
- movdqa %xmm0,48(%rax,$num)
- movdqa %xmm3,32(%rax)
- movdqa %xmm4,48(%rax)
- lea 64(%rax),%rax
- sub \$64,%r11
- jnz .Lmul_by_1
-
- movq $rptr,%xmm1
- movq $nptr,%xmm2
- .byte 0x67
- mov $nptr,%rbp
- movq %r10, %xmm3 # -num
-___
-$code.=<<___ if ($addx);
- mov OPENSSL_ia32cap_P+8(%rip),%r11d
- and \$0x80108,%r11d
- cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
- jne .Lfrom_mont_nox
-
- lea (%rax,$num),$rptr
- call __bn_sqrx8x_reduction
- call __bn_postx4x_internal
-
- pxor %xmm0,%xmm0
- lea 48(%rsp),%rax
- jmp .Lfrom_mont_zero
-
-.align 32
-.Lfrom_mont_nox:
-___
-$code.=<<___;
- call __bn_sqr8x_reduction
- call __bn_post4x_internal
-
- pxor %xmm0,%xmm0
- lea 48(%rsp),%rax
- jmp .Lfrom_mont_zero
-
-.align 32
-.Lfrom_mont_zero:
- mov 40(%rsp),%rsi # restore %rsp
-.cfi_def_cfa %rsi,8
- movdqa %xmm0,16*0(%rax)
- movdqa %xmm0,16*1(%rax)
- movdqa %xmm0,16*2(%rax)
- movdqa %xmm0,16*3(%rax)
- lea 16*4(%rax),%rax
- sub \$32,$num
- jnz .Lfrom_mont_zero
-
- mov \$1,%rax
- mov -48(%rsi),%r15
-.cfi_restore %r15
- mov -40(%rsi),%r14
-.cfi_restore %r14
- mov -32(%rsi),%r13
-.cfi_restore %r13
- mov -24(%rsi),%r12
-.cfi_restore %r12
- mov -16(%rsi),%rbp
-.cfi_restore %rbp
- mov -8(%rsi),%rbx
-.cfi_restore %rbx
- lea (%rsi),%rsp
-.cfi_def_cfa_register %rsp
-.Lfrom_epilogue:
- ret
-.cfi_endproc
-.size bn_from_mont8x,.-bn_from_mont8x
-___
-}
}}}
if ($addx) {{{
@@ -3896,10 +3709,6 @@ mul_handler:
.rva .LSEH_begin_bn_power5
.rva .LSEH_end_bn_power5
.rva .LSEH_info_bn_power5
-
- .rva .LSEH_begin_bn_from_mont8x
- .rva .LSEH_end_bn_from_mont8x
- .rva .LSEH_info_bn_from_mont8x
___
$code.=<<___ if ($addx);
.rva .LSEH_begin_bn_mulx4x_mont_gather5
@@ -3931,11 +3740,6 @@ $code.=<<___;
.byte 9,0,0,0
.rva mul_handler
.rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[]
-.align 8
-.LSEH_info_bn_from_mont8x:
- .byte 9,0,0,0
- .rva mul_handler
- .rva .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue # HandlerData[]
___
$code.=<<___ if ($addx);
.align 8
diff --git a/crypto/bn/bn_exp.c b/crypto/bn/bn_exp.c
index 89ce77fbb7..c7b62232f3 100644
--- a/crypto/bn/bn_exp.c
+++ b/crypto/bn/bn_exp.c
@@ -897,14 +897,21 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
#if defined(OPENSSL_BN_ASM_MONT5)
if (window == 5 && top > 1) {
/*
- * This optimization uses ideas from http://eprint.iacr.org/2011/239,
- * specifically optimization of cache-timing attack countermeasures
- * and pre-computation optimization.
- */
-
- /*
- * Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as
- * 512-bit RSA is hardly relevant, we omit it to spare size...
+ * This optimization uses ideas from https://eprint.iacr.org/2011/239,
+ * specifically optimization of cache-timing attack countermeasures,
+ * pre-computation optimization, and Almost Montgomery Multiplication.
+ *
+ * The paper discusses a 4-bit window to optimize 512-bit modular
+ * exponentiation, used in RSA-1024 with CRT, but RSA-1024 is no longer
+ * important.
+ *
+ * |bn_mul_mont_gather5| and |bn_power5| implement the "almost"
+ * reduction variant, so the values here may not be fully reduced.
+ * They are bounded by R (i.e. they fit in |top| words), not |m|.
+ * Additionally, we pass these "almost" reduced inputs into
+ * |bn_mul_mont|, which implements the normal reduction variant.
+ * Given those inputs, |bn_mul_mont| may not give reduced
+ * output, but it will still produce "almost" reduced output.
*/
void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
const void *table, const BN_ULONG *np,
@@ -916,9 +923,6 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
const void *table, const BN_ULONG *np,
const BN_ULONG *n0, int num, int power);
int bn_get_bits5(const BN_ULONG *ap, int off);
- int bn_from_montgomery(BN_ULONG *rp, const BN_ULONG *ap,
- const BN_ULONG *not_used, const BN_ULONG *np,
- const BN_ULONG *n0, int num);
BN_ULONG *n0 = mont->n0, *np;
@@ -1007,14 +1011,18 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
}
}
- ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np, n0, top);
tmp.top = top;
- bn_correct_top(&tmp);
- if (ret) {
- if (!BN_copy(rr, &tmp))
- ret = 0;
- goto err; /* non-zero ret means it's not error */
- }
+ /*
+ * The result is now in |tmp| in Montgomery form, but it may not be
+ * fully reduced. This is within bounds for |BN_from_montgomery|
+ * (tmp < R <= m*R) so it will, when converting from Montgomery form,
+ * produce a fully reduced result.
+ *
+ * This differs from Figure 2 of the paper, which uses AMM(h, 1) to
+ * convert from Montgomery form with unreduced output, followed by an
+ * extra reduction step. In the paper's terminology, we replace
+ * steps 9 and 10 with MM(h, 1).
+ */
} else
#endif
{
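The comments added to bn_exp.c above lean on the distinction between
normal Montgomery reduction and the "almost" variant from the paper. The
word-level sketch below makes that distinction concrete; it is
illustrative only, not OpenSSL's implementation, and N, redc, sub_words,
from_mont, and the "almost" flag are names invented for this sketch.
Both variants compute t*R^(-1) mod m for a 2*N-word input t < m*R; they
differ only in the final conditional subtraction, which is exactly why
ending the exponentiation with the normal variant fixes the zero-output
bug.

#include <stdint.h>
#include <string.h>

/*
 * Word-level Montgomery reduction (REDC) sketch; illustrative only, not
 * OpenSSL's implementation.  The modulus m has N 64-bit words, so
 * R = 2^(64*N); n0 = -m^(-1) mod 2^64; the input t has 2*N words and
 * must satisfy t < m*R.  Requires a compiler with unsigned __int128
 * (gcc/clang on x86_64).
 */
#define N 16  /* e.g. a 1024-bit modulus; an assumption for the sketch */

/* r = a - b over N words; returns the final borrow (1 iff a < b). */
static uint64_t sub_words(uint64_t r[N], const uint64_t a[N],
                          const uint64_t b[N])
{
    uint64_t borrow = 0;
    for (int j = 0; j < N; j++) {
        uint64_t d = a[j] - b[j] - borrow;
        borrow = (a[j] < b[j]) | ((a[j] == b[j]) & borrow);
        r[j] = d;
    }
    return borrow;
}

static void redc(uint64_t r[N], uint64_t t[2 * N], const uint64_t m[N],
                 uint64_t n0, int almost)
{
    uint64_t top = 0;  /* carry beyond the 2*N words of t */

    for (int i = 0; i < N; i++) {
        /* q is chosen so that adding q*m*2^(64*i) zeroes word i of t. */
        uint64_t q = t[i] * n0, c = 0;
        for (int j = 0; j < N; j++) {
            unsigned __int128 acc = (unsigned __int128)q * m[j] + t[i + j] + c;
            t[i + j] = (uint64_t)acc;
            c = (uint64_t)(acc >> 64);
        }
        for (int j = i + N; j < 2 * N && c != 0; j++) {
            uint64_t s = t[j] + c;
            c = s < c;
            t[j] = s;
        }
        top += c;  /* at most one carry can run off the end */
    }

    /* The low N words of t are now zero; the candidate result is
     * h = t[N..2N-1] plus the carry bit, with top*R + h < 2*m. */
    uint64_t *h = t + N;

    if (almost) {
        /* "Almost" variant: subtract m only on carry-out (value >= R).
         * The result fits in N words (it is < R) but may still be >= m;
         * in particular an input congruent to 0 mod m can come out as
         * exactly m.  The borrow is discarded: the wrap supplies the
         * carry word R. */
        if (top)
            (void)sub_words(h, h, m);
        memcpy(r, h, N * sizeof(uint64_t));
    } else {
        /* Normal variant: also subtract when h >= m, here via a masked
         * constant-time select.  The result is fully reduced, so an
         * input congruent to 0 mod m comes out as exactly 0. */
        uint64_t borrow = sub_words(r, h, m);
        uint64_t keep = (uint64_t)0 - (borrow & (top ^ 1)); /* h < m, no carry */
        for (int j = 0; j < N; j++)
            r[j] = (h[j] & keep) | (r[j] & ~keep);
    }
}

/* Conversion out of Montgomery form is redc() applied to h padded with
 * N zero words: h < R <= m*R, the same bound the new bn_exp.c comment
 * cites for BN_from_montgomery.  The deleted .Lmul_by_1 loop above built
 * exactly this 2*N-word input before running the shared reduction code. */
static void from_mont(uint64_t r[N], const uint64_t h[N],
                      const uint64_t m[N], uint64_t n0)
{
    uint64_t t[2 * N] = { 0 };

    memcpy(t, h, N * sizeof(uint64_t));
    redc(r, t, m, n0, /*almost=*/0);  /* normal ending: fully reduced */
}

In these terms, the commit replaces a final redc(..., almost=1) ending
(the deleted bn_from_mont8x) with redc(..., almost=0)
(BN_from_montgomery): the paper's MM(h, 1) in place of AMM(h, 1) plus an
extra reduction step.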