From 0d51cf3ccc0224def10c32b6defd4a77a1b4322a Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Thu, 9 Jan 2020 06:20:09 -0800 Subject: x86_64: Don't assume 8-byte pointer size Since pointer in x32 is 4 bytes, add x86_64-support.pl to define pointer_size and pointer_register based on flavour to support stuctures like: struct { void *ptr; int blocks; } This fixes 90-test_sslapi.t on x32. Verified with $ ./Configure shared linux-x86_64 $ make $ make test and $ ./Configure shared linux-x32 $ make $ make test Reviewed-by: Richard Levitte Reviewed-by: Tomas Mraz (Merged from https://github.com/openssl/openssl/pull/10988) --- crypto/aes/asm/aesni-mb-x86_64.pl | 84 ++++++++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 28 deletions(-) (limited to 'crypto/aes') diff --git a/crypto/aes/asm/aesni-mb-x86_64.pl b/crypto/aes/asm/aesni-mb-x86_64.pl index 3b2b569481..0b86285d30 100644 --- a/crypto/aes/asm/aesni-mb-x86_64.pl +++ b/crypto/aes/asm/aesni-mb-x86_64.pl @@ -54,6 +54,11 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86_64-support.pl"; + +$ptr_size=&pointer_size($flavour); + $avx=0; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` @@ -88,6 +93,8 @@ $inp="%rdi"; # 1st arg $key="%rsi"; # 2nd arg $num="%edx"; +$inp_elm_size=2*$ptr_size+8+16; + @inptr=map("%r$_",(8..11)); @outptr=map("%r$_",(12..15)); @@ -163,21 +170,25 @@ $code.=<<___; .Lenc4x_body: movdqu ($key),$zero # 0-round key lea 0x78($key),$key # size optimization - lea 40*2($inp),$inp + lea $inp_elm_size*2($inp),$inp .Lenc4x_loop_grande: mov $num,24(%rsp) # original $num xor $num,$num ___ for($i=0;$i<4;$i++) { + $inptr_reg=&pointer_register($flavour,@inptr[$i]); + $outptr_reg=&pointer_register($flavour,@outptr[$i]); $code.=<<___; - mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks - mov `40*$i+0-40*2`($inp),@inptr[$i] + # borrow $one for number of blocks + mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*2`($inp),$one + mov `$inp_elm_size*$i+0-$inp_elm_size*2`($inp),$inptr_reg cmp $num,$one - mov `40*$i+8-40*2`($inp),@outptr[$i] + mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*2`($inp),$outptr_reg cmovg $one,$num # find maximum test $one,$one - movdqu `40*$i+24-40*2`($inp),@out[$i] # load IV + # load IV + movdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*2`($inp),@out[$i] mov $one,`32+4*$i`(%rsp) # initialize counters cmovle %rsp,@inptr[$i] # cancel input ___ @@ -335,14 +346,15 @@ $code.=<<___; #pxor @inp[0],@out[0] #pxor @inp[1],@out[1] - #movdqu @out[0],`40*0+24-40*2`($inp) # output iv FIX ME! + # output iv FIX ME! + #movdqu @out[0],`$inp_elm_size*0+2*$ptr_size+8-$inp_elm_size*2`($inp) #pxor @inp[2],@out[2] - #movdqu @out[1],`40*1+24-40*2`($inp) + #movdqu @out[1],`$inp_elm_size*1+2*$ptr_size+8-$inp_elm_size*2`($inp) #pxor @inp[3],@out[3] - #movdqu @out[2],`40*2+24-40*2`($inp) # won't fix, let caller - #movdqu @out[3],`40*3+24-40*2`($inp) # figure this out... + #movdqu @out[2],`$inp_elm_size*2+2*$ptr_size+8-$inp_elm_size*2`($inp) # won't fix, let caller + #movdqu @out[3],`$inp_elm_size*3+2*$ptr_size+8-$inp_elm_size*2`($inp) # figure this out... - lea `40*4`($inp),$inp + lea `$inp_elm_size*4`($inp),$inp dec $num jnz .Lenc4x_loop_grande @@ -440,21 +452,25 @@ $code.=<<___; .Ldec4x_body: movdqu ($key),$zero # 0-round key lea 0x78($key),$key # size optimization - lea 40*2($inp),$inp + lea $inp_elm_size*2($inp),$inp .Ldec4x_loop_grande: mov $num,24(%rsp) # original $num xor $num,$num ___ for($i=0;$i<4;$i++) { + $inptr_reg=&pointer_register($flavour,@inptr[$i]); + $outptr_reg=&pointer_register($flavour,@outptr[$i]); $code.=<<___; - mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks - mov `40*$i+0-40*2`($inp),@inptr[$i] + # borrow $one for number of blocks + mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*2`($inp),$one + mov `$inp_elm_size*$i+0-$inp_elm_size*2`($inp),$inptr_reg cmp $num,$one - mov `40*$i+8-40*2`($inp),@outptr[$i] + mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*2`($inp),$outptr_reg cmovg $one,$num # find maximum test $one,$one - movdqu `40*$i+24-40*2`($inp),@inp[$i] # load IV + # load IV + movdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*2`($inp),@inp[$i] mov $one,`32+4*$i`(%rsp) # initialize counters cmovle %rsp,@inptr[$i] # cancel input ___ @@ -610,7 +626,7 @@ $code.=<<___; .cfi_def_cfa %rax,8 mov 24(%rsp),$num - lea `40*4`($inp),$inp + lea `$inp_elm_size*4`($inp),$inp dec $num jnz .Ldec4x_loop_grande @@ -709,7 +725,7 @@ $code.=<<___; vzeroupper vmovdqu ($key),$zero # 0-round key lea 0x78($key),$key # size optimization - lea 40*4($inp),$inp + lea `$inp_elm_size*4`($inp),$inp shr \$1,$num .Lenc8x_loop_grande: @@ -718,14 +734,20 @@ $code.=<<___; ___ for($i=0;$i<8;$i++) { my $temp = $i ? $offload : $offset; + $ptr_reg=&pointer_register($flavour,@ptr[$i]); + $temp_reg=&pointer_register($flavour,$temp); $code.=<<___; - mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks - mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer + # borrow $one for number of blocks + mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*4`($inp),$one + # input pointer + mov `$inp_elm_size*$i+0-$inp_elm_size*4`($inp),$ptr_reg cmp $num,$one - mov `40*$i+8-40*4`($inp),$temp # output pointer + # output pointer + mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*4`($inp),$temp_reg cmovg $one,$num # find maximum test $one,$one - vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV + # load IV + vmovdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*4`($inp),@out[$i] mov $one,`32+4*$i`(%rsp) # initialize counters cmovle %rsp,@ptr[$i] # cancel input sub @ptr[$i],$temp # distance between input and output @@ -910,7 +932,7 @@ $code.=<<___; mov 16(%rsp),%rax # original %rsp .cfi_def_cfa %rax,8 #mov 24(%rsp),$num - #lea `40*8`($inp),$inp + #lea `$inp_elm_size*8`($inp),$inp #dec $num #jnz .Lenc8x_loop_grande @@ -1002,7 +1024,7 @@ $code.=<<___; vzeroupper vmovdqu ($key),$zero # 0-round key lea 0x78($key),$key # size optimization - lea 40*4($inp),$inp + lea `$inp_elm_size*4`($inp),$inp shr \$1,$num .Ldec8x_loop_grande: @@ -1011,14 +1033,20 @@ $code.=<<___; ___ for($i=0;$i<8;$i++) { my $temp = $i ? $offload : $offset; + $ptr_reg=&pointer_register($flavour,@ptr[$i]); + $temp_reg=&pointer_register($flavour,$temp); $code.=<<___; - mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks - mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer + # borrow $one for number of blocks + mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*4`($inp),$one + # input pointer + mov `$inp_elm_size*$i+0-$inp_elm_size*4`($inp),$ptr_reg cmp $num,$one - mov `40*$i+8-40*4`($inp),$temp # output pointer + # output pointer + mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*4`($inp),$temp_reg cmovg $one,$num # find maximum test $one,$one - vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV + # load IV + vmovdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*4`($inp),@out[$i] mov $one,`32+4*$i`(%rsp) # initialize counters cmovle %rsp,@ptr[$i] # cancel input sub @ptr[$i],$temp # distance between input and output @@ -1234,7 +1262,7 @@ $code.=<<___; mov 16(%rsp),%rax # original %rsp .cfi_def_cfa %rax,8 #mov 24(%rsp),$num - #lea `40*8`($inp),$inp + #lea `$inp_elm_size*8`($inp),$inp #dec $num #jnz .Ldec8x_loop_grande -- cgit v1.2.3