summaryrefslogtreecommitdiffstats
path: root/crypto/aes
diff options
context:
space:
mode:
author H.J. Lu <hongjiu.lu@intel.com> 2020-01-09 06:20:09 -0800
committer Tomas Mraz <tmraz@fedoraproject.org> 2020-02-18 18:03:16 +0100
commit 0d51cf3ccc0224def10c32b6defd4a77a1b4322a (patch)
tree 989d96b36f3e257d2403c22275a6929447894d27 /crypto/aes
parent 21542a48ab542dc4d687a15e19c11318df58f72e (diff)
x86_64: Don't assume 8-byte pointer size
Since a pointer in x32 is 4 bytes, add x86_64-support.pl to define pointer_size and pointer_register based on flavour to support structures like: struct { void *ptr; int blocks; } This fixes 90-test_sslapi.t on x32. Verified with $ ./Configure shared linux-x86_64 $ make $ make test and $ ./Configure shared linux-x32 $ make $ make test Reviewed-by: Richard Levitte <levitte@openssl.org> Reviewed-by: Tomas Mraz <tmraz@fedoraproject.org> (Merged from https://github.com/openssl/openssl/pull/10988)
Diffstat (limited to 'crypto/aes')
-rw-r--r-- crypto/aes/asm/aesni-mb-x86_64.pl 84
1 files changed, 56 insertions, 28 deletions
diff --git a/crypto/aes/asm/aesni-mb-x86_64.pl b/crypto/aes/asm/aesni-mb-x86_64.pl
index 3b2b569481..0b86285d30 100644
--- a/crypto/aes/asm/aesni-mb-x86_64.pl
+++ b/crypto/aes/asm/aesni-mb-x86_64.pl
@@ -54,6 +54,11 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86_64-support.pl";
+
+$ptr_size=&pointer_size($flavour);
+
$avx=0;
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
@@ -88,6 +93,8 @@ $inp="%rdi"; # 1st arg
$key="%rsi"; # 2nd arg
$num="%edx";
+$inp_elm_size=2*$ptr_size+8+16;
+
@inptr=map("%r$_",(8..11));
@outptr=map("%r$_",(12..15));
@@ -163,21 +170,25 @@ $code.=<<___;
.Lenc4x_body:
movdqu ($key),$zero # 0-round key
lea 0x78($key),$key # size optimization
- lea 40*2($inp),$inp
+ lea $inp_elm_size*2($inp),$inp
.Lenc4x_loop_grande:
mov $num,24(%rsp) # original $num
xor $num,$num
___
for($i=0;$i<4;$i++) {
+ $inptr_reg=&pointer_register($flavour,@inptr[$i]);
+ $outptr_reg=&pointer_register($flavour,@outptr[$i]);
$code.=<<___;
- mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
- mov `40*$i+0-40*2`($inp),@inptr[$i]
+ # borrow $one for number of blocks
+ mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*2`($inp),$one
+ mov `$inp_elm_size*$i+0-$inp_elm_size*2`($inp),$inptr_reg
cmp $num,$one
- mov `40*$i+8-40*2`($inp),@outptr[$i]
+ mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*2`($inp),$outptr_reg
cmovg $one,$num # find maximum
test $one,$one
- movdqu `40*$i+24-40*2`($inp),@out[$i] # load IV
+ # load IV
+ movdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*2`($inp),@out[$i]
mov $one,`32+4*$i`(%rsp) # initialize counters
cmovle %rsp,@inptr[$i] # cancel input
___
@@ -335,14 +346,15 @@ $code.=<<___;
#pxor @inp[0],@out[0]
#pxor @inp[1],@out[1]
- #movdqu @out[0],`40*0+24-40*2`($inp) # output iv FIX ME!
+ # output iv FIX ME!
+ #movdqu @out[0],`$inp_elm_size*0+2*$ptr_size+8-$inp_elm_size*2`($inp)
#pxor @inp[2],@out[2]
- #movdqu @out[1],`40*1+24-40*2`($inp)
+ #movdqu @out[1],`$inp_elm_size*1+2*$ptr_size+8-$inp_elm_size*2`($inp)
#pxor @inp[3],@out[3]
- #movdqu @out[2],`40*2+24-40*2`($inp) # won't fix, let caller
- #movdqu @out[3],`40*3+24-40*2`($inp) # figure this out...
+ #movdqu @out[2],`$inp_elm_size*2+2*$ptr_size+8-$inp_elm_size*2`($inp) # won't fix, let caller
+ #movdqu @out[3],`$inp_elm_size*3+2*$ptr_size+8-$inp_elm_size*2`($inp) # figure this out...
- lea `40*4`($inp),$inp
+ lea `$inp_elm_size*4`($inp),$inp
dec $num
jnz .Lenc4x_loop_grande
@@ -440,21 +452,25 @@ $code.=<<___;
.Ldec4x_body:
movdqu ($key),$zero # 0-round key
lea 0x78($key),$key # size optimization
- lea 40*2($inp),$inp
+ lea $inp_elm_size*2($inp),$inp
.Ldec4x_loop_grande:
mov $num,24(%rsp) # original $num
xor $num,$num
___
for($i=0;$i<4;$i++) {
+ $inptr_reg=&pointer_register($flavour,@inptr[$i]);
+ $outptr_reg=&pointer_register($flavour,@outptr[$i]);
$code.=<<___;
- mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
- mov `40*$i+0-40*2`($inp),@inptr[$i]
+ # borrow $one for number of blocks
+ mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*2`($inp),$one
+ mov `$inp_elm_size*$i+0-$inp_elm_size*2`($inp),$inptr_reg
cmp $num,$one
- mov `40*$i+8-40*2`($inp),@outptr[$i]
+ mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*2`($inp),$outptr_reg
cmovg $one,$num # find maximum
test $one,$one
- movdqu `40*$i+24-40*2`($inp),@inp[$i] # load IV
+ # load IV
+ movdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*2`($inp),@inp[$i]
mov $one,`32+4*$i`(%rsp) # initialize counters
cmovle %rsp,@inptr[$i] # cancel input
___
@@ -610,7 +626,7 @@ $code.=<<___;
.cfi_def_cfa %rax,8
mov 24(%rsp),$num
- lea `40*4`($inp),$inp
+ lea `$inp_elm_size*4`($inp),$inp
dec $num
jnz .Ldec4x_loop_grande
@@ -709,7 +725,7 @@ $code.=<<___;
vzeroupper
vmovdqu ($key),$zero # 0-round key
lea 0x78($key),$key # size optimization
- lea 40*4($inp),$inp
+ lea `$inp_elm_size*4`($inp),$inp
shr \$1,$num
.Lenc8x_loop_grande:
@@ -718,14 +734,20 @@ $code.=<<___;
___
for($i=0;$i<8;$i++) {
my $temp = $i ? $offload : $offset;
+ $ptr_reg=&pointer_register($flavour,@ptr[$i]);
+ $temp_reg=&pointer_register($flavour,$temp);
$code.=<<___;
- mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
- mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
+ # borrow $one for number of blocks
+ mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*4`($inp),$one
+ # input pointer
+ mov `$inp_elm_size*$i+0-$inp_elm_size*4`($inp),$ptr_reg
cmp $num,$one
- mov `40*$i+8-40*4`($inp),$temp # output pointer
+ # output pointer
+ mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*4`($inp),$temp_reg
cmovg $one,$num # find maximum
test $one,$one
- vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
+ # load IV
+ vmovdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*4`($inp),@out[$i]
mov $one,`32+4*$i`(%rsp) # initialize counters
cmovle %rsp,@ptr[$i] # cancel input
sub @ptr[$i],$temp # distance between input and output
@@ -910,7 +932,7 @@ $code.=<<___;
mov 16(%rsp),%rax # original %rsp
.cfi_def_cfa %rax,8
#mov 24(%rsp),$num
- #lea `40*8`($inp),$inp
+ #lea `$inp_elm_size*8`($inp),$inp
#dec $num
#jnz .Lenc8x_loop_grande
@@ -1002,7 +1024,7 @@ $code.=<<___;
vzeroupper
vmovdqu ($key),$zero # 0-round key
lea 0x78($key),$key # size optimization
- lea 40*4($inp),$inp
+ lea `$inp_elm_size*4`($inp),$inp
shr \$1,$num
.Ldec8x_loop_grande:
@@ -1011,14 +1033,20 @@ $code.=<<___;
___
for($i=0;$i<8;$i++) {
my $temp = $i ? $offload : $offset;
+ $ptr_reg=&pointer_register($flavour,@ptr[$i]);
+ $temp_reg=&pointer_register($flavour,$temp);
$code.=<<___;
- mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
- mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
+ # borrow $one for number of blocks
+ mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*4`($inp),$one
+ # input pointer
+ mov `$inp_elm_size*$i+0-$inp_elm_size*4`($inp),$ptr_reg
cmp $num,$one
- mov `40*$i+8-40*4`($inp),$temp # output pointer
+ # output pointer
+ mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*4`($inp),$temp_reg
cmovg $one,$num # find maximum
test $one,$one
- vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
+ # load IV
+ vmovdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*4`($inp),@out[$i]
mov $one,`32+4*$i`(%rsp) # initialize counters
cmovle %rsp,@ptr[$i] # cancel input
sub @ptr[$i],$temp # distance between input and output
@@ -1234,7 +1262,7 @@ $code.=<<___;
mov 16(%rsp),%rax # original %rsp
.cfi_def_cfa %rax,8
#mov 24(%rsp),$num
- #lea `40*8`($inp),$inp
+ #lea `$inp_elm_size*8`($inp),$inp
#dec $num
#jnz .Ldec8x_loop_grande