diff options
author | Andy Polyakov <appro@openssl.org> | 2010-11-29 20:52:43 +0000 |
---|---|---|
committer | Andy Polyakov <appro@openssl.org> | 2010-11-29 20:52:43 +0000 |
commit | e822c756b66024d49ab936bf77b745206660fcd2 (patch) | |
tree | c5c6cd2bec509720a0753e245bd3731e99c6de83 /crypto/aes | |
parent | 300b1d76fe27541c662ca606a6a201b2718e0c65 (diff) |
s390x assembler pack: adapt for -m31 build; see the commentary in Configure
for more details.
Diffstat (limited to 'crypto/aes')
-rw-r--r-- | crypto/aes/asm/aes-s390x.pl | 167 |
1 file changed, 95 insertions, 72 deletions
diff --git a/crypto/aes/asm/aes-s390x.pl b/crypto/aes/asm/aes-s390x.pl index 4be64e3e51..db963c9df0 100644 --- a/crypto/aes/asm/aes-s390x.pl +++ b/crypto/aes/asm/aes-s390x.pl @@ -60,6 +60,26 @@ # maximum, but *on average* it would be as much as ~98%. Meaning that # worst case is unlike, it's like hitting ravine on plateau. +# November 2010. +# +# Adapt for -m31 build. If kernel supports what's called "highgprs" +# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit +# instructions and achieve "64-bit" performance even in 31-bit legacy +# application context. The feature is not specific to any particular +# processor, as long as it's "z-CPU". Latter implies that the code +# remains z/Architecture specific. On z990 it was measured to perform +# 2x better than code generated by gcc 4.3. + +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; @@ -82,6 +102,8 @@ $rounds="%r13"; $ra="%r14"; $sp="%r15"; +$stdframe=16*$SIZE_T+4*8; + sub _data_word() { my $i; while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } @@ -223,7 +245,7 @@ $code.=<<___ if (!$softonly); .Lesoft: ___ $code.=<<___; - stmg %r3,$ra,24($sp) + stm${g} %r3,$ra,3*$SIZE_T($sp) llgf $s0,0($inp) llgf $s1,4($inp) @@ -233,20 +255,20 @@ $code.=<<___; larl $tbl,AES_Te bras $ra,_s390x_AES_encrypt - lg $out,24($sp) + l${g} $out,3*$SIZE_T($sp) st $s0,0($out) st $s1,4($out) st $s2,8($out) st $s3,12($out) - lmg %r6,$ra,48($sp) + lm${g} %r6,$ra,6*$SIZE_T($sp) br $ra .size AES_encrypt,.-AES_encrypt .type _s390x_AES_encrypt,\@function .align 16 _s390x_AES_encrypt: - stg $ra,152($sp) + st${g} $ra,`$stdframe-$SIZE_T`($sp) x $s0,0($key) x $s1,4($key) x $s2,8($key) @@ -410,7 +432,7 @@ _s390x_AES_encrypt: or $s2,$i3 or $s3,$t3 - lg $ra,152($sp) + l${g} $ra,`$stdframe-$SIZE_T`($sp) xr $s0,$t0 xr $s1,$t2 x $s2,24($key) @@ -549,7 +571,7 @@ $code.=<<___ if 
(!$softonly); .Ldsoft: ___ $code.=<<___; - stmg %r3,$ra,24($sp) + stm${g} %r3,$ra,3*$SIZE_T($sp) llgf $s0,0($inp) llgf $s1,4($inp) @@ -559,20 +581,20 @@ $code.=<<___; larl $tbl,AES_Td bras $ra,_s390x_AES_decrypt - lg $out,24($sp) + l${g} $out,3*$SIZE_T($sp) st $s0,0($out) st $s1,4($out) st $s2,8($out) st $s3,12($out) - lmg %r6,$ra,48($sp) + lm${g} %r6,$ra,6*$SIZE_T($sp) br $ra .size AES_decrypt,.-AES_decrypt .type _s390x_AES_decrypt,\@function .align 16 _s390x_AES_decrypt: - stg $ra,152($sp) + st${g} $ra,`$stdframe-$SIZE_T`($sp) x $s0,0($key) x $s1,4($key) x $s2,8($key) @@ -716,7 +738,7 @@ _s390x_AES_decrypt: nr $i1,$mask nr $i2,$mask - lg $ra,152($sp) + l${g} $ra,`$stdframe-$SIZE_T`($sp) or $s1,$t1 l $t0,16($key) l $t1,20($key) @@ -750,9 +772,9 @@ $code.=<<___; .align 16 AES_set_encrypt_key: lghi $t0,0 - clgr $inp,$t0 + cl${g}r $inp,$t0 je .Lminus1 - clgr $key,$t0 + cl${g}r $key,$t0 je .Lminus1 lghi $t0,128 @@ -810,7 +832,7 @@ ___ $code.=<<___; .align 16 .Lekey_internal: - stmg %r6,%r13,48($sp) # all non-volatile regs + stm${g} %r6,%r13,6*$SIZE_T($sp) # all non-volatile regs larl $tbl,AES_Te+2048 @@ -871,7 +893,7 @@ $code.=<<___; la $t3,4($t3) # i++ brct $rounds,.L128_loop lghi %r2,0 - lmg %r6,%r13,48($sp) + lm${g} %r6,%r13,6*$SIZE_T($sp) br $ra .align 16 @@ -919,7 +941,7 @@ $code.=<<___; st $s3,36($key) brct $rounds,.L192_continue lghi %r2,0 - lmg %r6,%r13,48($sp) + lm${g} %r6,%r13,6*$SIZE_T($sp) br $ra .align 16 @@ -981,7 +1003,7 @@ $code.=<<___; st $s3,44($key) brct $rounds,.L256_continue lghi %r2,0 - lmg %r6,%r13,48($sp) + lm${g} %r6,%r13,6*$SIZE_T($sp) br $ra .align 16 @@ -1032,11 +1054,11 @@ $code.=<<___; .type AES_set_decrypt_key,\@function .align 16 AES_set_decrypt_key: - stg $key,32($sp) # I rely on AES_set_encrypt_key to - stg $ra,112($sp) # save non-volatile registers! + st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to + st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers! 
bras $ra,AES_set_encrypt_key - lg $key,32($sp) - lg $ra,112($sp) + l${g} $key,4*$SIZE_T($sp) + l${g} $ra,14*$SIZE_T($sp) ltgr %r2,%r2 bnzr $ra ___ @@ -1051,11 +1073,11 @@ $code.=<<___ if (!$softonly); .align 16 .Ldkey_internal: - stg $key,32($sp) - stg $ra,40($sp) + st${g} $key,4*$SIZE_T($sp) + st${g} $ra,14*$SIZE_T($sp) bras $ra,.Lekey_internal - lg $key,32($sp) - lg $ra,40($sp) + l${g} $key,4*$SIZE_T($sp) + l${g} $ra,14*$SIZE_T($sp) ___ $code.=<<___; @@ -1136,7 +1158,7 @@ $code.=<<___; la $key,4($key) brct $rounds,.Lmix - lmg %r6,%r13,48($sp)# as was saved by AES_set_encrypt_key! + lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key! lghi %r2,0 br $ra .size AES_set_decrypt_key,.-AES_set_decrypt_key @@ -1176,7 +1198,7 @@ $code.=<<___ if (!$softonly); l %r0,240($key) # load kmc code lghi $key,15 # res=len%16, len-=res; ngr $key,$len - slgr $len,$key + sl${g}r $len,$key la %r1,16($sp) # parameter block - ivec || key jz .Lkmc_truncated .long 0xb92f0042 # kmc %r4,%r2 @@ -1194,34 +1216,34 @@ $code.=<<___ if (!$softonly); tmll %r0,0x80 jnz .Lkmc_truncated_dec lghi %r1,0 - stg %r1,128($sp) - stg %r1,136($sp) + stg %r1,16*$SIZE_T($sp) + stg %r1,16*$SIZE_T+8($sp) bras %r1,1f - mvc 128(1,$sp),0($inp) + mvc 16*$SIZE_T(1,$sp),0($inp) 1: ex $key,0(%r1) la %r1,16($sp) # restore parameter block - la $inp,128($sp) + la $inp,16*$SIZE_T($sp) lghi $len,16 .long 0xb92f0042 # kmc %r4,%r2 j .Lkmc_done .align 16 .Lkmc_truncated_dec: - stg $out,64($sp) - la $out,128($sp) + st${g} $out,4*$SIZE_T($sp) + la $out,16*$SIZE_T($sp) lghi $len,16 .long 0xb92f0042 # kmc %r4,%r2 - lg $out,64($sp) + l${g} $out,4*$SIZE_T($sp) bras %r1,2f - mvc 0(1,$out),128($sp) + mvc 0(1,$out),16*$SIZE_T($sp) 2: ex $key,0(%r1) j .Lkmc_done .align 16 .Lcbc_software: ___ $code.=<<___; - stmg $key,$ra,40($sp) + stm${g} $key,$ra,5*$SIZE_T($sp) lhi %r0,0 - cl %r0,164($sp) + cl %r0,`$stdframe+$SIZE_T-4`($sp) je .Lcbc_decrypt larl $tbl,AES_Te @@ -1232,10 +1254,10 @@ $code.=<<___; llgf $s3,12($ivp) lghi 
$t0,16 - slgr $len,$t0 + sl${g}r $len,$t0 brc 4,.Lcbc_enc_tail # if borrow .Lcbc_enc_loop: - stmg $inp,$out,16($sp) + stm${g} $inp,$out,2*$SIZE_T($sp) x $s0,0($inp) x $s1,4($inp) x $s2,8($inp) @@ -1244,7 +1266,7 @@ $code.=<<___; bras $ra,_s390x_AES_encrypt - lmg $inp,$key,16($sp) + lm${g} $inp,$key,2*$SIZE_T($sp) st $s0,0($out) st $s1,4($out) st $s2,8($out) @@ -1253,33 +1275,33 @@ $code.=<<___; la $inp,16($inp) la $out,16($out) lghi $t0,16 - ltgr $len,$len + lt${g}r $len,$len jz .Lcbc_enc_done - slgr $len,$t0 + sl${g}r $len,$t0 brc 4,.Lcbc_enc_tail # if borrow j .Lcbc_enc_loop .align 16 .Lcbc_enc_done: - lg $ivp,48($sp) + l${g} $ivp,6*$SIZE_T($sp) st $s0,0($ivp) st $s1,4($ivp) st $s2,8($ivp) st $s3,12($ivp) - lmg %r7,$ra,56($sp) + lm${g} %r7,$ra,7*$SIZE_T($sp) br $ra .align 16 .Lcbc_enc_tail: aghi $len,15 lghi $t0,0 - stg $t0,128($sp) - stg $t0,136($sp) + stg $t0,16*$SIZE_T($sp) + stg $t0,16*$SIZE_T+8($sp) bras $t1,3f - mvc 128(1,$sp),0($inp) + mvc 16*$SIZE_T(1,$sp),0($inp) 3: ex $len,0($t1) lghi $len,0 - la $inp,128($sp) + la $inp,16*$SIZE_T($sp) j .Lcbc_enc_loop .align 16 @@ -1288,10 +1310,10 @@ $code.=<<___; lg $t0,0($ivp) lg $t1,8($ivp) - stmg $t0,$t1,128($sp) + stmg $t0,$t1,16*$SIZE_T($sp) .Lcbc_dec_loop: - stmg $inp,$out,16($sp) + stm${g} $inp,$out,2*$SIZE_T($sp) llgf $s0,0($inp) llgf $s1,4($inp) llgf $s2,8($inp) @@ -1300,7 +1322,7 @@ $code.=<<___; bras $ra,_s390x_AES_decrypt - lmg $inp,$key,16($sp) + lm${g} $inp,$key,2*$SIZE_T($sp) sllg $s0,$s0,32 sllg $s2,$s2,32 lr $s0,$s1 @@ -1308,15 +1330,15 @@ $code.=<<___; lg $t0,0($inp) lg $t1,8($inp) - xg $s0,128($sp) - xg $s2,136($sp) + xg $s0,16*$SIZE_T($sp) + xg $s2,16*$SIZE_T+8($sp) lghi $s1,16 - slgr $len,$s1 + sl${g}r $len,$s1 brc 4,.Lcbc_dec_tail # if borrow brc 2,.Lcbc_dec_done # if zero stg $s0,0($out) stg $s2,8($out) - stmg $t0,$t1,128($sp) + stmg $t0,$t1,16*$SIZE_T($sp) la $inp,16($inp) la $out,16($out) @@ -1326,7 +1348,7 @@ $code.=<<___; stg $s0,0($out) stg $s2,8($out) .Lcbc_dec_exit: - lmg 
$ivp,$ra,48($sp) + lm${g} %r6,$ra,6*$SIZE_T($sp) stmg $t0,$t1,0($ivp) br $ra @@ -1334,10 +1356,10 @@ $code.=<<___; .align 16 .Lcbc_dec_tail: aghi $len,15 - stg $s0,128($sp) - stg $s2,136($sp) + stg $s0,16*$SIZE_T($sp) + stg $s2,16*$SIZE_T+8($sp) bras $s1,4f - mvc 0(1,$out),128($sp) + mvc 0(1,$out),16*$SIZE_T($sp) 4: ex $len,0($s1) j .Lcbc_dec_exit .size AES_cbc_encrypt,.-AES_cbc_encrypt @@ -1359,6 +1381,7 @@ $code.=<<___; .type AES_ctr32_encrypt,\@function .align 16 AES_ctr32_encrypt: + llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case ___ $code.=<<___ if (!$softonly); l %r0,240($key) @@ -1366,7 +1389,7 @@ $code.=<<___ if (!$softonly); clr %r0,%r1 jl .Lctr32_software - stmg %r6,$s3,48($sp) + stm${g} %r6,$s3,6*$SIZE_T($sp) slgr $out,$inp la %r1,0($key) # %r1 is permanent copy of $key @@ -1388,14 +1411,14 @@ $code.=<<___ if (!$softonly); la $sp,1024($s0) # alloca srlg $fp,$fp,4 # convert bytes to blocks, minimum 16 - stg $s2,0($sp) # back-chain - stg $fp,8($sp) + st${g} $s2,0($sp) # back-chain + st${g} $fp,$SIZE_T($sp) slgr $len,$fp brc 1,.Lctr32_hw_loop # not zero, no borrow algr $fp,$len # input is shorter than allocated buffer lghi $len,0 - stg $fp,8($sp) + st${g} $fp,$SIZE_T($sp) .Lctr32_hw_loop: la $s2,16($sp) @@ -1432,8 +1455,8 @@ $code.=<<___ if (!$softonly); lghi $len,0 brc 4+1,.Lctr32_hw_loop # not zero - lg $s0,0($sp) - lg $s1,8($sp) + l${g} $s0,0($sp) + l${g} $s1,$SIZE_T($sp) la $s2,16($sp) .Lctr32_hw_zap: stg $s0,0($s2) @@ -1442,30 +1465,30 @@ $code.=<<___ if (!$softonly); brct $s1,.Lctr32_hw_zap la $sp,0($s0) - lmg %r6,$s3,48($sp) + lm${g} %r6,$s3,6*$SIZE_T($sp) br $ra .align 16 .Lctr32_software: ___ $code.=<<___; - stmg $key,$ra,40($sp) - slgr $out,$inp + stm${g} $key,$ra,5*$SIZE_T($sp) + sl${g}r $out,$inp larl $tbl,AES_Te llgf $t1,12($ivp) .Lctr32_loop: - stmg $inp,$len,16($sp) + stm${g} $inp,$len,2*$SIZE_T($sp) llgf $s0,0($ivp) llgf $s1,4($ivp) llgf $s2,8($ivp) lgr $s3,$t1 - st $t1,128($sp) + st $t1,16*$SIZE_T($sp) lgr %r4,$key bras 
$ra,_s390x_AES_encrypt - lmg $inp,$ivp,16($sp) - llgf $t1,128($sp) + lm${g} $inp,$ivp,2*$SIZE_T($sp) + llgf $t1,16*$SIZE_T($sp) x $s0,0($inp) x $s1,4($inp) x $s2,8($inp) @@ -1479,7 +1502,7 @@ $code.=<<___; ahi $t1,1 # 32-bit increment brct $len,.Lctr32_loop - lmg %r6,$ra,48($sp) + lm${g} %r6,$ra,6*$SIZE_T($sp) br $ra .size AES_ctr32_encrypt,.-AES_ctr32_encrypt ___ |