#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, for some reason RSA2048 is not
# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
# instruction issue rate is limited on processor in question, meaning
# that dedicated squaring procedure is a must. Well, actually all
# contemporary AArch64 processors seem to have limited multiplication
# issue rate, i.e. they can't issue multiplication every cycle, which
# explains moderate improvement coefficients in comparison to
# compiler-generated code. Recall that compiler is instructed to use
# umulh and therefore uses same amount of multiplication instructions
# to do the job. Assembly's edge is to minimize number of "collateral"
# instructions and of course instruction scheduling.

# Usage: <this script> <flavour> <output>
# Both arguments are handed verbatim to arm-xlate.pl, which receives the
# generated assembly on its standard input.
$flavour = shift;
$output  = shift;

# Locate arm-xlate.pl: first next to this script, then in ../../perlasm
# relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator. Fail loudly if the pipe cannot be set up, instead of
# silently producing no output.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Scratch register assignment. x18 is skipped (presumably because it is
# the platform-reserved register -- confirm against the target ABI). The
# list yields 18 names for 17 variables, so x24 stays unassigned; it is
# still saved/restored below as the second half of the x23/x24 pair.
($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);

# int bn_mul_mont(
$rp="x0";	# BN_ULONG *rp,
$ap="x1";	# const BN_ULONG *ap,
$bp="x2";	# const BN_ULONG *bp,
$np="x3";	# const BN_ULONG *np,
$n0="x4";	# const BN_ULONG *n0,
$num="x5";	# int num);

# Outline of the generated routine:
#
#   prologue    save x29/x30 and x19-x24, carve a 16-byte aligned
#               scratch vector tp[] of num words below sp ("alloca").
#   .L1st       first pass, tp[] = ap[]*bp[0] + np[]*m1 with the lowest
#               word discarded, where m1 = (ap[0]*bp[0]) * *n0.
#   .Louter/    one pass per remaining bp[i]: tp[] = tp[] + ap[]*bp[i]
#   .Linner     + np[]*m1, again dropping the lowest word; $ovf carries
#               the top overflow bit from one pass to the next.
#   .Lsub       rp[] = tp[] - np[], with the borrow folded into $ovf.
#   .Lcond_copy keep rp[] if the subtraction did not borrow, otherwise
#               copy tp[] into rp[]; tp[] is zeroed on the way.
#   epilogue    restore registers and return 1.
#
# $num is converted from words to bytes right after tp is computed, so
# all counters step by 8 even though the inline comments count words.
#
# NOTE(review): $num is declared "int" but used as a 64-bit register
# without explicit extension -- assumes the caller leaves the upper half
# of x5 clear; confirm against the ABI in use.
# NOTE(review): for num == 1 the "j=num-2" counter starts negative and the
# cbz guards do not catch it -- assumes callers guarantee num >= 2.
$code.=<<___;
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	$m0,[$bp],#8		// bp[0]
	sub	$tp,sp,$num,lsl#3
	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	and	$tp,$tp,#-16		// ABI says so
	ldp	$hi1,$nj,[$np],#16	// np[0..1]

	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	mul	$alo,$aj,$m0		// ap[1]*bp[0]
	umulh	$ahi,$aj,$m0

	mul	$m1,$lo0,$n0		// "tp[0]"*n0
	mov	sp,$tp			// alloca

	mul	$lo1,$hi1,$m1		// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	adds	$lo1,$lo1,$lo0		// discarded
	umulh	$nhi,$nj,$m1
	adc	$hi1,$hi1,xzr
	cbz	$j,.L1st_skip

.L1st:
	ldr	$aj,[$ap],#8
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	ldr	$nj,[$np],#8
	adds	$lo1,$nlo,$hi1
	mul	$alo,$aj,$m0		// ap[j]*bp[0]
	adc	$hi1,$nhi,xzr
	umulh	$ahi,$aj,$m0

	adds	$lo1,$lo1,$lo0
	mul	$nlo,$nj,$m1		// np[j]*m1
	adc	$hi1,$hi1,xzr
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp],#8		// tp[j-1]
	cbnz	$j,.L1st

.L1st_skip:
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adc	$hi1,$nhi,xzr

	adds	$lo1,$lo1,$lo0
	sub	$i,$num,#8		// i=num-1
	adcs	$hi1,$hi1,$hi0

	adc	$ovf,xzr,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp]

.Louter:
	ldr	$m0,[$bp],#8		// bp[i]
	ldp	$hi0,$aj,[$ap],#16
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8

	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	ldp	$hi1,$nj,[$np],#16
	mul	$alo,$aj,$m0		// ap[1]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$m1,$lo0,$n0
	sub	$i,$i,#8		// i--

	mul	$lo1,$hi1,$m1		// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	cbz	$j,.Linner_skip

.Linner:
	ldr	$aj,[$ap],#8
	adc	$hi1,$hi1,xzr
	ldr	$tj,[$tp],#8		// tp[j]
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	ldr	$nj,[$np],#8
	adc	$hi1,$nhi,xzr

	mul	$alo,$aj,$m0		// ap[j]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$nlo,$nj,$m1		// np[j]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp,#-16]		// tp[j-1]
	cbnz	$j,.Linner

.Linner_skip:
	ldr	$tj,[$tp],#8		// tp[j]
	adc	$hi1,$hi1,xzr
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adc	$hi1,$nhi,$ovf

	adds	$lo0,$lo0,$tj
	adc	$hi0,$hi0,xzr

	adds	$lo1,$lo1,$lo0
	adcs	$hi1,$hi1,$hi0
	adc	$ovf,xzr,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp,#-16]

	cbnz	$i,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$nj,[$np],#8		// np[0]
	subs	$j,$num,#8		// j=num-1 and clear borrow
	mov	$ap,$rp
.Lsub:
	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
	ldr	$tj,[$tp],#8
	sub	$j,$j,#8		// j--
	ldr	$nj,[$np],#8
	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
	cbnz	$j,.Lsub

	sbcs	$aj,$tj,$nj
	sbcs	$ovf,$ovf,xzr		// did it borrow?
	str	$aj,[$ap],#8		// rp[num-1]

	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$aj,[$rp],#8		// rp[0]
	sub	$num,$num,#8		// num--
	nop
.Lcond_copy:
	sub	$num,$num,#8		// num--
	csel	$nj,$aj,$tj,cs		// did it borrow?
	ldr	$tj,[$tp],#8
	ldr	$aj,[$rp],#8
	str	xzr,[$tp,#-16]		// wipe tp
	str	$nj,[$rp,#-16]
	cbnz	$num,.Lcond_copy

	csel	$nj,$aj,$tj,cs
	str	xzr,[$tp,#-8]		// wipe tp
	str	$nj,[$rp,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return value: x0 held the advanced rp until here
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont
.asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by "
.align	4
___

# Write errors and translator failures only surface on print/close of the
# pipe, so check both; otherwise a truncated assembly file goes unnoticed.
print $code or die "error writing to $xlate: $!";
close STDOUT or die "error closing STDOUT: $! (exit status $?)";