-rwxr-xr-x  crypto/bn/asm/rsaz-x86_64.pl | 381
1 file changed, 197 insertions(+), 184 deletions(-)
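
For reference, the arithmetic this patch reorganizes is plain schoolbook squaring: every off-diagonal product a[i]*a[j] contributes twice to the result, so the accumulated cross-product rows are doubled before the diagonal squares a[i]^2 are folded in. The old code performed that doubling with shld/shl chains; the new code folds it into add/adc carry chains (and, in the MULX path, into adcx/adox dual carry chains). Below is a minimal C sketch of the same computation, not part of the patch, assuming 64-bit limbs and the GCC/Clang unsigned __int128 extension; the helper name sqr512_ref is illustrative only.

#include <stdint.h>

typedef unsigned __int128 u128;

/* Reference 512-bit squaring: 8 x 64-bit limbs, little-endian limb order.
 * Step 1: cross products a[i]*a[j] for i < j.
 * Step 2: double them (the shift the assembly now does with add/adc).
 * Step 3: add the diagonal squares a[i]*a[i].                            */
static void sqr512_ref(uint64_t r[16], const uint64_t a[8])
{
    uint64_t t[16] = {0};

    for (int i = 0; i < 8; i++) {            /* off-diagonal products     */
        u128 carry = 0;
        for (int j = i + 1; j < 8; j++) {
            u128 acc = (u128)a[i] * a[j] + t[i + j] + (uint64_t)carry;
            t[i + j] = (uint64_t)acc;
            carry    = acc >> 64;
        }
        t[i + 8] = (uint64_t)carry;
    }

    uint64_t c = 0;                          /* t <<= 1                   */
    for (int i = 0; i < 16; i++) {
        uint64_t hi = t[i] >> 63;
        t[i] = (t[i] << 1) | c;
        c = hi;
    }

    u128 acc = 0;                            /* add diagonal squares      */
    for (int i = 0; i < 8; i++) {
        u128 sq = (u128)a[i] * a[i];
        acc += (u128)t[2 * i] + (uint64_t)sq;
        t[2 * i] = (uint64_t)acc;
        acc >>= 64;
        acc += (u128)t[2 * i + 1] + (uint64_t)(sq >> 64);
        t[2 * i + 1] = (uint64_t)acc;
        acc >>= 64;
    }

    for (int i = 0; i < 16; i++)
        r[i] = t[i];
}

The assembly interleaves these three steps per result pair rather than running them as separate passes, which is what allows the doubling to ride on the existing add/adc (or adcx/adox) carry chains instead of needing dedicated shift instructions.
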
diff --git a/crypto/bn/asm/rsaz-x86_64.pl b/crypto/bn/asm/rsaz-x86_64.pl
index c41d8e521e..36fa0efbd0 100755
--- a/crypto/bn/asm/rsaz-x86_64.pl
+++ b/crypto/bn/asm/rsaz-x86_64.pl
@@ -118,7 +118,7 @@ rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
subq \$128+24, %rsp
.cfi_adjust_cfa_offset 128+24
.Lsqr_body:
- movq $mod, %rbp # common argument
+ movq $mod, %xmm1 # common off-load
movq ($inp), %rdx
movq 8($inp), %rax
movq $n0, 128(%rsp)
@@ -136,7 +136,8 @@ $code.=<<___;
.Loop_sqr:
movl $times,128+8(%rsp)
#first iteration
- movq %rdx, %rbx
+ movq %rdx, %rbx # 0($inp)
+ mov %rax, %rbp # 8($inp)
mulq %rdx
movq %rax, %r8
movq 16($inp), %rax
@@ -175,31 +176,29 @@ $code.=<<___;
mulq %rbx
addq %rax, %r14
movq %rbx, %rax
- movq %rdx, %r15
- adcq \$0, %r15
+ adcq \$0, %rdx
- addq %r8, %r8 #shlq \$1, %r8
- movq %r9, %rcx
- adcq %r9, %r9 #shld \$1, %r8, %r9
+ xorq %rcx,%rcx # rcx:r8 = r8 << 1
+ addq %r8, %r8
+ movq %rdx, %r15
+ adcq \$0, %rcx
mulq %rax
- movq %rax, (%rsp)
- addq %rdx, %r8
- adcq \$0, %r9
+ addq %r8, %rdx
+ adcq \$0, %rcx
- movq %r8, 8(%rsp)
- shrq \$63, %rcx
+ movq %rax, (%rsp)
+ movq %rdx, 8(%rsp)
#second iteration
- movq 8($inp), %r8
movq 16($inp), %rax
- mulq %r8
+ mulq %rbp
addq %rax, %r10
movq 24($inp), %rax
movq %rdx, %rbx
adcq \$0, %rbx
- mulq %r8
+ mulq %rbp
addq %rax, %r11
movq 32($inp), %rax
adcq \$0, %rdx
@@ -207,7 +206,7 @@ $code.=<<___;
movq %rdx, %rbx
adcq \$0, %rbx
- mulq %r8
+ mulq %rbp
addq %rax, %r12
movq 40($inp), %rax
adcq \$0, %rdx
@@ -215,7 +214,7 @@ $code.=<<___;
movq %rdx, %rbx
adcq \$0, %rbx
- mulq %r8
+ mulq %rbp
addq %rax, %r13
movq 48($inp), %rax
adcq \$0, %rdx
@@ -223,7 +222,7 @@ $code.=<<___;
movq %rdx, %rbx
adcq \$0, %rbx
- mulq %r8
+ mulq %rbp
addq %rax, %r14
movq 56($inp), %rax
adcq \$0, %rdx
@@ -231,39 +230,39 @@ $code.=<<___;
movq %rdx, %rbx
adcq \$0, %rbx
- mulq %r8
+ mulq %rbp
addq %rax, %r15
- movq %r8, %rax
+ movq %rbp, %rax
adcq \$0, %rdx
addq %rbx, %r15
- movq %rdx, %r8
- movq %r10, %rdx
- adcq \$0, %r8
+ adcq \$0, %rdx
- add %rdx, %rdx
- lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
- movq %r11, %rbx
- adcq %r11, %r11 #shld \$1, %r10, %r11
+ xorq %rbx, %rbx # rbx:r10:r9 = r10:r9 << 1
+ addq %r9, %r9
+ movq %rdx, %r8
+ adcq %r10, %r10
+ adcq \$0, %rbx
mulq %rax
+ addq %rcx, %rax
+ movq 16($inp), %rbp
+ adcq \$0, %rdx
addq %rax, %r9
+ movq 24($inp), %rax
adcq %rdx, %r10
- adcq \$0, %r11
+ adcq \$0, %rbx
movq %r9, 16(%rsp)
movq %r10, 24(%rsp)
- shrq \$63, %rbx
#third iteration
- movq 16($inp), %r9
- movq 24($inp), %rax
- mulq %r9
+ mulq %rbp
addq %rax, %r12
movq 32($inp), %rax
movq %rdx, %rcx
adcq \$0, %rcx
- mulq %r9
+ mulq %rbp
addq %rax, %r13
movq 40($inp), %rax
adcq \$0, %rdx
@@ -271,7 +270,7 @@ $code.=<<___;
movq %rdx, %rcx
adcq \$0, %rcx
- mulq %r9
+ mulq %rbp
addq %rax, %r14
movq 48($inp), %rax
adcq \$0, %rdx
@@ -279,9 +278,7 @@ $code.=<<___;
movq %rdx, %rcx
adcq \$0, %rcx
- mulq %r9
- movq %r12, %r10
- lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
+ mulq %rbp
addq %rax, %r15
movq 56($inp), %rax
adcq \$0, %rdx
@@ -289,36 +286,40 @@ $code.=<<___;
movq %rdx, %rcx
adcq \$0, %rcx
- mulq %r9
- shrq \$63, %r10
+ mulq %rbp
addq %rax, %r8
- movq %r9, %rax
+ movq %rbp, %rax
adcq \$0, %rdx
addq %rcx, %r8
- movq %rdx, %r9
- adcq \$0, %r9
+ adcq \$0, %rdx
- movq %r13, %rcx
- leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
+ xorq %rcx, %rcx # rcx:r12:r11 = r12:r11 << 1
+ addq %r11, %r11
+ movq %rdx, %r9
+ adcq %r12, %r12
+ adcq \$0, %rcx
mulq %rax
+ addq %rbx, %rax
+ movq 24($inp), %r10
+ adcq \$0, %rdx
addq %rax, %r11
+ movq 32($inp), %rax
adcq %rdx, %r12
- adcq \$0, %r13
+ adcq \$0, %rcx
movq %r11, 32(%rsp)
movq %r12, 40(%rsp)
- shrq \$63, %rcx
#fourth iteration
- movq 24($inp), %r10
- movq 32($inp), %rax
+ mov %rax, %r11 # 32($inp)
mulq %r10
addq %rax, %r14
movq 40($inp), %rax
movq %rdx, %rbx
adcq \$0, %rbx
+ mov %rax, %r12 # 40($inp)
mulq %r10
addq %rax, %r15
movq 48($inp), %rax
@@ -327,9 +328,8 @@ $code.=<<___;
movq %rdx, %rbx
adcq \$0, %rbx
+ mov %rax, %rbp # 48($inp)
mulq %r10
- movq %r14, %r12
- leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
addq %rax, %r8
movq 56($inp), %rax
adcq \$0, %rdx
@@ -338,32 +338,33 @@ $code.=<<___;
adcq \$0, %rbx
mulq %r10
- shrq \$63, %r12
addq %rax, %r9
movq %r10, %rax
adcq \$0, %rdx
addq %rbx, %r9
- movq %rdx, %r10
- adcq \$0, %r10
+ adcq \$0, %rdx
- movq %r15, %rbx
- leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
+ xorq %rbx, %rbx # rbx:r13:r14 = r13:r14 << 1
+ addq %r13, %r13
+ movq %rdx, %r10
+ adcq %r14, %r14
+ adcq \$0, %rbx
mulq %rax
+ addq %rcx, %rax
+ adcq \$0, %rdx
addq %rax, %r13
+ movq %r12, %rax # 40($inp)
adcq %rdx, %r14
- adcq \$0, %r15
+ adcq \$0, %rbx
movq %r13, 48(%rsp)
movq %r14, 56(%rsp)
- shrq \$63, %rbx
#fifth iteration
- movq 32($inp), %r11
- movq 40($inp), %rax
mulq %r11
addq %rax, %r8
- movq 48($inp), %rax
+ movq %rbp, %rax # 48($inp)
movq %rdx, %rcx
adcq \$0, %rcx
@@ -371,97 +372,99 @@ $code.=<<___;
addq %rax, %r9
movq 56($inp), %rax
adcq \$0, %rdx
- movq %r8, %r12
- leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
addq %rcx, %r9
movq %rdx, %rcx
adcq \$0, %rcx
+ mov %rax, %r14 # 56($inp)
mulq %r11
- shrq \$63, %r12
addq %rax, %r10
movq %r11, %rax
adcq \$0, %rdx
addq %rcx, %r10
- movq %rdx, %r11
- adcq \$0, %r11
+ adcq \$0, %rdx
- movq %r9, %rcx
- leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
+ xorq %rcx, %rcx # rcx:r8:r15 = r8:r15 << 1
+ addq %r15, %r15
+ movq %rdx, %r11
+ adcq %r8, %r8
+ adcq \$0, %rcx
mulq %rax
+ addq %rbx, %rax
+ adcq \$0, %rdx
addq %rax, %r15
+ movq %rbp, %rax # 48($inp)
adcq %rdx, %r8
- adcq \$0, %r9
+ adcq \$0, %rcx
movq %r15, 64(%rsp)
movq %r8, 72(%rsp)
- shrq \$63, %rcx
#sixth iteration
- movq 40($inp), %r12
- movq 48($inp), %rax
mulq %r12
addq %rax, %r10
- movq 56($inp), %rax
+ movq %r14, %rax # 56($inp)
movq %rdx, %rbx
adcq \$0, %rbx
mulq %r12
addq %rax, %r11
movq %r12, %rax
- movq %r10, %r15
- leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
adcq \$0, %rdx
- shrq \$63, %r15
addq %rbx, %r11
- movq %rdx, %r12
- adcq \$0, %r12
+ adcq \$0, %rdx
- movq %r11, %rbx
- leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
+ xorq %rbx, %rbx # rbx:r10:r9 = r10:r9 << 1
+ addq %r9, %r9
+ movq %rdx, %r12
+ adcq %r10, %r10
+ adcq \$0, %rbx
mulq %rax
+ addq %rcx, %rax
+ adcq \$0, %rdx
addq %rax, %r9
+ movq %r14, %rax # 56($inp)
adcq %rdx, %r10
- adcq \$0, %r11
+ adcq \$0, %rbx
movq %r9, 80(%rsp)
movq %r10, 88(%rsp)
#seventh iteration
- movq 48($inp), %r13
- movq 56($inp), %rax
- mulq %r13
+ mulq %rbp
addq %rax, %r12
- movq %r13, %rax
- movq %rdx, %r13
- adcq \$0, %r13
+ movq %rbp, %rax
+ adcq \$0, %rdx
- xorq %r14, %r14
- shlq \$1, %rbx
- adcq %r12, %r12 #shld \$1, %rbx, %r12
- adcq %r13, %r13 #shld \$1, %r12, %r13
- adcq %r14, %r14 #shld \$1, %r13, %r14
+ xorq %rcx, %rcx # rcx:r12:r11 = r12:r11 << 1
+ addq %r11, %r11
+ movq %rdx, %r13
+ adcq %r12, %r12
+ adcq \$0, %rcx
mulq %rax
+ addq %rbx, %rax
+ adcq \$0, %rdx
addq %rax, %r11
+ movq %r14, %rax # 56($inp)
adcq %rdx, %r12
- adcq \$0, %r13
+ adcq \$0, %rcx
movq %r11, 96(%rsp)
movq %r12, 104(%rsp)
#eighth iteration
- movq 56($inp), %rax
+ xorq %rbx, %rbx # rbx:r13 = r13 << 1
+ addq %r13, %r13
+ adcq \$0, %rbx
+
mulq %rax
- addq %rax, %r13
+ addq %rcx, %rax
adcq \$0, %rdx
-
- addq %rdx, %r14
-
- movq %r13, 112(%rsp)
- movq %r14, 120(%rsp)
+ addq %r13, %rax
+ adcq %rbx, %rdx
movq (%rsp), %r8
movq 8(%rsp), %r9
@@ -471,6 +474,10 @@ $code.=<<___;
movq 40(%rsp), %r13
movq 48(%rsp), %r14
movq 56(%rsp), %r15
+ movq %xmm1, %rbp
+
+ movq %rax, 112(%rsp)
+ movq %rdx, 120(%rsp)
call __rsaz_512_reduce
@@ -502,9 +509,9 @@ $code.=<<___;
.Loop_sqrx:
movl $times,128+8(%rsp)
movq $out, %xmm0 # off-load
- movq %rbp, %xmm1 # off-load
#first iteration
mulx %rax, %r8, %r9
+ mov %rax, %rbx
mulx 16($inp), %rcx, %r10
xor %rbp, %rbp # cf=0, of=0
@@ -512,40 +519,39 @@ $code.=<<___;
mulx 24($inp), %rax, %r11
adcx %rcx, %r9
- mulx 32($inp), %rcx, %r12
+ .byte 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($inp), %rcx, %r12
adcx %rax, %r10
- mulx 40($inp), %rax, %r13
+ .byte 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00 # mulx 40($inp), %rax, %r13
adcx %rcx, %r11
- .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
+ mulx 48($inp), %rcx, %r14
adcx %rax, %r12
adcx %rcx, %r13
- .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
+ mulx 56($inp), %rax, %r15
adcx %rax, %r14
adcx %rbp, %r15 # %rbp is 0
- mov %r9, %rcx
- shld \$1, %r8, %r9
- shl \$1, %r8
-
- xor %ebp, %ebp
- mulx %rdx, %rax, %rdx
- adcx %rdx, %r8
- mov 8($inp), %rdx
- adcx %rbp, %r9
+ mulx %rdx, %rax, $out
+ mov %rbx, %rdx # 8($inp)
+ xor %rcx, %rcx
+ adox %r8, %r8
+ adcx $out, %r8
+ adox %rbp, %rcx
+ adcx %rbp, %rcx
mov %rax, (%rsp)
mov %r8, 8(%rsp)
#second iteration
- mulx 16($inp), %rax, %rbx
+ .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00 # mulx 16($inp), %rax, %rbx
adox %rax, %r10
adcx %rbx, %r11
- .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
+ mulx 24($inp), $out, %r8
adox $out, %r11
+ .byte 0x66
adcx %r8, %r12
mulx 32($inp), %rax, %rbx
@@ -563,24 +569,25 @@ $code.=<<___;
.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
adox $out, %r15
adcx %rbp, %r8
+ mulx %rdx, %rax, $out
adox %rbp, %r8
+ .byte 0x48,0x8b,0x96,0x10,0x00,0x00,0x00 # mov 16($inp), %rdx
- mov %r11, %rbx
- shld \$1, %r10, %r11
- shld \$1, %rcx, %r10
-
- xor %ebp,%ebp
- mulx %rdx, %rax, %rcx
- mov 16($inp), %rdx
+ xor %rbx, %rbx
+ adcx %rcx, %rax
+ adox %r9, %r9
+ adcx %rbp, $out
+ adox %r10, %r10
adcx %rax, %r9
- adcx %rcx, %r10
- adcx %rbp, %r11
+ adox %rbp, %rbx
+ adcx $out, %r10
+ adcx %rbp, %rbx
mov %r9, 16(%rsp)
.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
#third iteration
- .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
+ mulx 24($inp), $out, %r9
adox $out, %r12
adcx %r9, %r13
@@ -588,7 +595,7 @@ $code.=<<___;
adox %rax, %r13
adcx %rcx, %r14
- mulx 40($inp), $out, %r9
+ .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r9
adox $out, %r14
adcx %r9, %r15
@@ -596,27 +603,28 @@ $code.=<<___;
adox %rax, %r15
adcx %rcx, %r8
- .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
+ mulx 56($inp), $out, %r9
adox $out, %r8
adcx %rbp, %r9
+ mulx %rdx, %rax, $out
adox %rbp, %r9
+ mov 24($inp), %rdx
- mov %r13, %rcx
- shld \$1, %r12, %r13
- shld \$1, %rbx, %r12
-
- xor %ebp, %ebp
- mulx %rdx, %rax, %rdx
+ xor %rcx, %rcx
+ adcx %rbx, %rax
+ adox %r11, %r11
+ adcx %rbp, $out
+ adox %r12, %r12
adcx %rax, %r11
- adcx %rdx, %r12
- mov 24($inp), %rdx
- adcx %rbp, %r13
+ adox %rbp, %rcx
+ adcx $out, %r12
+ adcx %rbp, %rcx
mov %r11, 32(%rsp)
- .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
+ mov %r12, 40(%rsp)
#fourth iteration
- .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
+ mulx 32($inp), %rax, %rbx
adox %rax, %r14
adcx %rbx, %r15
@@ -631,25 +639,25 @@ $code.=<<___;
mulx 56($inp), $out, %r10
adox $out, %r9
adcx %rbp, %r10
+ mulx %rdx, %rax, $out
adox %rbp, %r10
+ mov 32($inp), %rdx
- .byte 0x66
- mov %r15, %rbx
- shld \$1, %r14, %r15
- shld \$1, %rcx, %r14
-
- xor %ebp, %ebp
- mulx %rdx, %rax, %rdx
+ xor %rbx, %rbx
+ adcx %rcx, %rax
+ adox %r13, %r13
+ adcx %rbp, $out
+ adox %r14, %r14
adcx %rax, %r13
- adcx %rdx, %r14
- mov 32($inp), %rdx
- adcx %rbp, %r15
+ adox %rbp, %rbx
+ adcx $out, %r14
+ adcx %rbp, %rbx
mov %r13, 48(%rsp)
mov %r14, 56(%rsp)
#fifth iteration
- .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
+ mulx 40($inp), $out, %r11
adox $out, %r8
adcx %r11, %r9
@@ -660,18 +668,19 @@ $code.=<<___;
mulx 56($inp), $out, %r11
adox $out, %r10
adcx %rbp, %r11
+ mulx %rdx, %rax, $out
+ mov 40($inp), %rdx
adox %rbp, %r11
- mov %r9, %rcx
- shld \$1, %r8, %r9
- shld \$1, %rbx, %r8
-
- xor %ebp, %ebp
- mulx %rdx, %rax, %rdx
+ xor %rcx, %rcx
+ adcx %rbx, %rax
+ adox %r15, %r15
+ adcx %rbp, $out
+ adox %r8, %r8
adcx %rax, %r15
- adcx %rdx, %r8
- mov 40($inp), %rdx
- adcx %rbp, %r9
+ adox %rbp, %rcx
+ adcx $out, %r8
+ adcx %rbp, %rcx
mov %r15, 64(%rsp)
mov %r8, 72(%rsp)
@@ -684,18 +693,19 @@ $code.=<<___;
.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
adox $out, %r11
adcx %rbp, %r12
+ mulx %rdx, %rax, $out
adox %rbp, %r12
+ mov 48($inp), %rdx
- mov %r11, %rbx
- shld \$1, %r10, %r11
- shld \$1, %rcx, %r10
-
- xor %ebp, %ebp
- mulx %rdx, %rax, %rdx
+ xor %rbx, %rbx
+ adcx %rcx, %rax
+ adox %r9, %r9
+ adcx %rbp, $out
+ adox %r10, %r10
adcx %rax, %r9
- adcx %rdx, %r10
- mov 48($inp), %rdx
- adcx %rbp, %r11
+ adcx $out, %r10
+ adox %rbp, %rbx
+ adcx %rbp, %rbx
mov %r9, 80(%rsp)
mov %r10, 88(%rsp)
@@ -705,31 +715,31 @@ $code.=<<___;
adox %rax, %r12
adox %rbp, %r13
- xor %r14, %r14
- shld \$1, %r13, %r14
- shld \$1, %r12, %r13
- shld \$1, %rbx, %r12
-
- xor %ebp, %ebp
- mulx %rdx, %rax, %rdx
- adcx %rax, %r11
- adcx %rdx, %r12
+ mulx %rdx, %rax, $out
+ xor %rcx, %rcx
mov 56($inp), %rdx
- adcx %rbp, %r13
+ adcx %rbx, %rax
+ adox %r11, %r11
+ adcx %rbp, $out
+ adox %r12, %r12
+ adcx %rax, %r11
+ adox %rbp, %rcx
+ adcx $out, %r12
+ adcx %rbp, %rcx
.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
#eighth iteration
mulx %rdx, %rax, %rdx
- adox %rax, %r13
- adox %rbp, %rdx
+ xor %rbx, %rbx
+ adcx %rcx, %rax
+ adox %r13, %r13
+ adcx %rbp, %rdx
+ adox %rbp, %rbx
+ adcx %r13, %rax
+ adcx %rdx, %rbx
- .byte 0x66
- add %rdx, %r14
-
- movq %r13, 112(%rsp)
- movq %r14, 120(%rsp)
movq %xmm0, $out
movq %xmm1, %rbp
@@ -743,6 +753,9 @@ $code.=<<___;
movq 48(%rsp), %r14
movq 56(%rsp), %r15
+ movq %rax, 112(%rsp)
+ movq %rbx, 120(%rsp)
+
call __rsaz_512_reducex
addq 64(%rsp), %r8