diff options
author | H.J. Lu <hongjiu.lu@intel.com> | 2020-01-09 06:20:09 -0800 |
---|---|---|
committer | Tomas Mraz <tmraz@fedoraproject.org> | 2020-02-18 18:03:16 +0100 |
commit | 0d51cf3ccc0224def10c32b6defd4a77a1b4322a (patch) | |
tree | 989d96b36f3e257d2403c22275a6929447894d27 /crypto/aes | |
parent | 21542a48ab542dc4d687a15e19c11318df58f72e (diff) |
x86_64: Don't assume 8-byte pointer size
Since pointer in x32 is 4 bytes, add x86_64-support.pl to define
pointer_size and pointer_register based on flavour to support
structures like:
struct { void *ptr; int blocks; }
This fixes 90-test_sslapi.t on x32. Verified with
$ ./Configure shared linux-x86_64
$ make
$ make test
and
$ ./Configure shared linux-x32
$ make
$ make test
Reviewed-by: Richard Levitte <levitte@openssl.org>
Reviewed-by: Tomas Mraz <tmraz@fedoraproject.org>
(Merged from https://github.com/openssl/openssl/pull/10988)
Diffstat (limited to 'crypto/aes')
-rw-r--r-- | crypto/aes/asm/aesni-mb-x86_64.pl | 84 |
1 files changed, 56 insertions, 28 deletions
diff --git a/crypto/aes/asm/aesni-mb-x86_64.pl b/crypto/aes/asm/aesni-mb-x86_64.pl index 3b2b569481..0b86285d30 100644 --- a/crypto/aes/asm/aesni-mb-x86_64.pl +++ b/crypto/aes/asm/aesni-mb-x86_64.pl @@ -54,6 +54,11 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86_64-support.pl"; + +$ptr_size=&pointer_size($flavour); + $avx=0; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` @@ -88,6 +93,8 @@ $inp="%rdi"; # 1st arg $key="%rsi"; # 2nd arg $num="%edx"; +$inp_elm_size=2*$ptr_size+8+16; + @inptr=map("%r$_",(8..11)); @outptr=map("%r$_",(12..15)); @@ -163,21 +170,25 @@ $code.=<<___; .Lenc4x_body: movdqu ($key),$zero # 0-round key lea 0x78($key),$key # size optimization - lea 40*2($inp),$inp + lea $inp_elm_size*2($inp),$inp .Lenc4x_loop_grande: mov $num,24(%rsp) # original $num xor $num,$num ___ for($i=0;$i<4;$i++) { + $inptr_reg=&pointer_register($flavour,@inptr[$i]); + $outptr_reg=&pointer_register($flavour,@outptr[$i]); $code.=<<___; - mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks - mov `40*$i+0-40*2`($inp),@inptr[$i] + # borrow $one for number of blocks + mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*2`($inp),$one + mov `$inp_elm_size*$i+0-$inp_elm_size*2`($inp),$inptr_reg cmp $num,$one - mov `40*$i+8-40*2`($inp),@outptr[$i] + mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*2`($inp),$outptr_reg cmovg $one,$num # find maximum test $one,$one - movdqu `40*$i+24-40*2`($inp),@out[$i] # load IV + # load IV + movdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*2`($inp),@out[$i] mov $one,`32+4*$i`(%rsp) # initialize counters cmovle %rsp,@inptr[$i] # cancel input ___ @@ -335,14 +346,15 @@ $code.=<<___; #pxor @inp[0],@out[0] #pxor @inp[1],@out[1] - #movdqu @out[0],`40*0+24-40*2`($inp) # output iv FIX ME! + # output iv FIX ME! 
+ #movdqu @out[0],`$inp_elm_size*0+2*$ptr_size+8-$inp_elm_size*2`($inp) #pxor @inp[2],@out[2] - #movdqu @out[1],`40*1+24-40*2`($inp) + #movdqu @out[1],`$inp_elm_size*1+2*$ptr_size+8-$inp_elm_size*2`($inp) #pxor @inp[3],@out[3] - #movdqu @out[2],`40*2+24-40*2`($inp) # won't fix, let caller - #movdqu @out[3],`40*3+24-40*2`($inp) # figure this out... + #movdqu @out[2],`$inp_elm_size*2+2*$ptr_size+8-$inp_elm_size*2`($inp) # won't fix, let caller + #movdqu @out[3],`$inp_elm_size*3+2*$ptr_size+8-$inp_elm_size*2`($inp) # figure this out... - lea `40*4`($inp),$inp + lea `$inp_elm_size*4`($inp),$inp dec $num jnz .Lenc4x_loop_grande @@ -440,21 +452,25 @@ $code.=<<___; .Ldec4x_body: movdqu ($key),$zero # 0-round key lea 0x78($key),$key # size optimization - lea 40*2($inp),$inp + lea $inp_elm_size*2($inp),$inp .Ldec4x_loop_grande: mov $num,24(%rsp) # original $num xor $num,$num ___ for($i=0;$i<4;$i++) { + $inptr_reg=&pointer_register($flavour,@inptr[$i]); + $outptr_reg=&pointer_register($flavour,@outptr[$i]); $code.=<<___; - mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks - mov `40*$i+0-40*2`($inp),@inptr[$i] + # borrow $one for number of blocks + mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*2`($inp),$one + mov `$inp_elm_size*$i+0-$inp_elm_size*2`($inp),$inptr_reg cmp $num,$one - mov `40*$i+8-40*2`($inp),@outptr[$i] + mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*2`($inp),$outptr_reg cmovg $one,$num # find maximum test $one,$one - movdqu `40*$i+24-40*2`($inp),@inp[$i] # load IV + # load IV + movdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*2`($inp),@inp[$i] mov $one,`32+4*$i`(%rsp) # initialize counters cmovle %rsp,@inptr[$i] # cancel input ___ @@ -610,7 +626,7 @@ $code.=<<___; .cfi_def_cfa %rax,8 mov 24(%rsp),$num - lea `40*4`($inp),$inp + lea `$inp_elm_size*4`($inp),$inp dec $num jnz .Ldec4x_loop_grande @@ -709,7 +725,7 @@ $code.=<<___; vzeroupper vmovdqu ($key),$zero # 0-round key lea 0x78($key),$key # size optimization - lea 40*4($inp),$inp + lea 
`$inp_elm_size*4`($inp),$inp shr \$1,$num .Lenc8x_loop_grande: @@ -718,14 +734,20 @@ $code.=<<___; ___ for($i=0;$i<8;$i++) { my $temp = $i ? $offload : $offset; + $ptr_reg=&pointer_register($flavour,@ptr[$i]); + $temp_reg=&pointer_register($flavour,$temp); $code.=<<___; - mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks - mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer + # borrow $one for number of blocks + mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*4`($inp),$one + # input pointer + mov `$inp_elm_size*$i+0-$inp_elm_size*4`($inp),$ptr_reg cmp $num,$one - mov `40*$i+8-40*4`($inp),$temp # output pointer + # output pointer + mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*4`($inp),$temp_reg cmovg $one,$num # find maximum test $one,$one - vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV + # load IV + vmovdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*4`($inp),@out[$i] mov $one,`32+4*$i`(%rsp) # initialize counters cmovle %rsp,@ptr[$i] # cancel input sub @ptr[$i],$temp # distance between input and output @@ -910,7 +932,7 @@ $code.=<<___; mov 16(%rsp),%rax # original %rsp .cfi_def_cfa %rax,8 #mov 24(%rsp),$num - #lea `40*8`($inp),$inp + #lea `$inp_elm_size*8`($inp),$inp #dec $num #jnz .Lenc8x_loop_grande @@ -1002,7 +1024,7 @@ $code.=<<___; vzeroupper vmovdqu ($key),$zero # 0-round key lea 0x78($key),$key # size optimization - lea 40*4($inp),$inp + lea `$inp_elm_size*4`($inp),$inp shr \$1,$num .Ldec8x_loop_grande: @@ -1011,14 +1033,20 @@ $code.=<<___; ___ for($i=0;$i<8;$i++) { my $temp = $i ? 
$offload : $offset; + $ptr_reg=&pointer_register($flavour,@ptr[$i]); + $temp_reg=&pointer_register($flavour,$temp); $code.=<<___; - mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks - mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer + # borrow $one for number of blocks + mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*4`($inp),$one + # input pointer + mov `$inp_elm_size*$i+0-$inp_elm_size*4`($inp),$ptr_reg cmp $num,$one - mov `40*$i+8-40*4`($inp),$temp # output pointer + # output pointer + mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*4`($inp),$temp_reg cmovg $one,$num # find maximum test $one,$one - vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV + # load IV + vmovdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*4`($inp),@out[$i] mov $one,`32+4*$i`(%rsp) # initialize counters cmovle %rsp,@ptr[$i] # cancel input sub @ptr[$i],$temp # distance between input and output @@ -1234,7 +1262,7 @@ $code.=<<___; mov 16(%rsp),%rax # original %rsp .cfi_def_cfa %rax,8 #mov 24(%rsp),$num - #lea `40*8`($inp),$inp + #lea `$inp_elm_size*8`($inp),$inp #dec $num #jnz .Ldec8x_loop_grande |