diff options
author | Min Zhou <zhoumin@loongson.cn> | 2023-12-13 22:40:14 +0800 |
---|---|---|
committer | Tomas Mraz <tomas@openssl.org> | 2023-12-27 10:15:29 +0100 |
commit | 3d68e2937ee5c50eacef5f4c34abdf7c0e4dc479 (patch) | |
tree | 62032cc26380c6a302b45a3db9d6d447d0366bc7 /crypto/md5 | |
parent | 9277ed0a4fc082807ad8d8f66925fb7968437cf6 (diff) |
md5: add assembly implementation for loongarch64
This change can improve md5 performance by using a hand-optimized
assembly implementation of the inner loop of md5 calculation.
This implementation refered to md5-x86_64.pl and made more effort
to reorder instructions for separating data dependencies as much
as possible.
Test with:
$ openssl speed md5
3A5000
type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
md5 45061.04k 130440.75k 291105.28k 421101.23k 484639.27k 488320.43k
md5-modified 47179.95k 139015.57k 308836.69k 445963.26k 512540.67k 518215.00k
+5% +7% +6% +6% +6% +6%
3A6000
type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
md5 60070.06k 161822.76k 325817.60k 438017.02k 486864.21k 492243.31k
md5-modified 62827.74k 170294.04k 343795.03k 463324.50k 515831.13k 520060.93k
+5% +5% +6% +6% +6% +6%
Signed-off-by: Min Zhou <zhoumin@loongson.cn>
Co-authored-by: Xi Ruoyao <xry111@xry111.site>
Reviewed-by: Shane Lontis <shane.lontis@oracle.com>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/21704)
Diffstat (limited to 'crypto/md5')
-rwxr-xr-x | crypto/md5/asm/md5-loongarch64.pl | 297 | ||||
-rw-r--r-- | crypto/md5/build.info | 4 | ||||
-rw-r--r-- | crypto/md5/md5_local.h | 3 |
3 files changed, 303 insertions, 1 deletions
diff --git a/crypto/md5/asm/md5-loongarch64.pl b/crypto/md5/asm/md5-loongarch64.pl new file mode 100755 index 0000000000..61f8749a3a --- /dev/null +++ b/crypto/md5/asm/md5-loongarch64.pl @@ -0,0 +1,297 @@ +#! /usr/bin/env perl +# Author: Min Zhou <zhoumin@loongson.cn> +# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# Reference to crypto/md5/asm/md5-x86_64.pl +# MD5 optimized for LoongArch. + +use strict; + +my $code; + +my ($zero,$ra,$tp,$sp,$fp)=map("\$r$_",(0..3,22)); +my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$r$_",(4..11)); +my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$x)=map("\$r$_",(12..21)); + +my $output; +for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); } +open STDOUT,">$output"; + +# round1_step() does: +# dst = x + ((dst + F(x,y,z) + X[k] + T_i) <<< s) +# $t1 = y ^ z +# $t2 = dst + X[k_next] +sub round1_step +{ + my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; + my $T_i_h = ($T_i & 0xfffff000) >> 12; + my $T_i_l = $T_i & 0xfff; + +# In LoongArch we have to use two instructions of lu12i.w and ori to load a +# 32-bit immediate into a general register. Meanwhile, the instruction lu12i.w +# treats the 20-bit immediate as a signed number. So if the T_i_h is greater +# than or equal to (1<<19), we need provide lu12i.w a corresponding negative +# number whose complement equals to the sign extension of T_i_h. + +# The details of the instruction lu12i.w can be found as following: +# https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#_lu12i_w_lu32i_d_lu52i_d + + $T_i_h = -((1<<32) - (0xfff00000 | $T_i_h)) if ($T_i_h >= (1<<19)); + + $code .= " ld.w $t0,$a1,0 /* (NEXT STEP) X[0] */\n" if ($pos == -1); + $code .= " xor $t1,$y,$z /* y ^ z */\n" if ($pos == -1); + $code .= " add.w $t2,$dst,$t0 /* dst + X[k] */\n" if ($pos == -1); + $code .= <<EOF; + lu12i.w $t8,$T_i_h /* load bits [31:12] of constant */ + and $t1,$x,$t1 /* x & ... */ + ori $t8,$t8,$T_i_l /* load bits [11:0] of constant */ + xor $t1,$z,$t1 /* z ^ ... */ + add.w $t7,$t2,$t8 /* dst + X[k] + Const */ + ld.w $t0,$a1,$k_next*4 /* (NEXT STEP) X[$k_next] */ + add.w $dst,$t7,$t1 /* dst += ... */ + add.w $t2,$z,$t0 /* (NEXT STEP) dst + X[$k_next] */ +EOF + + $code .= " rotri.w $dst,$dst,32-$s /* dst <<< s */\n"; + if ($pos != 1) { + $code .= " xor $t1,$x,$y /* (NEXT STEP) y ^ z */\n"; + } else { + $code .= " move $t0,$a7 /* (NEXT ROUND) $t0 = z' (copy of z) */\n"; + $code .= " nor $t1,$zero,$a7 /* (NEXT ROUND) $t1 = not z' (copy of not z) */\n"; + } + $code .= " add.w $dst,$dst,$x /* dst += x */\n"; +} + +# round2_step() does: +# dst = x + ((dst + G(x,y,z) + X[k] + T_i) <<< s) +# $t0 = z' (copy of z for the next step) +# $t1 = not z' (copy of not z for the next step) +# $t2 = dst + X[k_next] +sub round2_step +{ + my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; + my $T_i_h = ($T_i & 0xfffff000) >> 12; + my $T_i_l = $T_i & 0xfff; + $T_i_h = -((1<<32) - (0xfff00000 | $T_i_h)) if ($T_i_h >= (1<<19)); + + $code .= <<EOF; + lu12i.w $t8,$T_i_h /* load bits [31:12] of Constant */ + and $t0,$x,$t0 /* x & z */ + ori $t8,$t8,$T_i_l /* load bits [11:0] of Constant */ + and $t1,$y,$t1 /* y & (not z) */ + add.w $t7,$t2,$t8 /* dst + X[k] + Const */ + or $t1,$t0,$t1 /* (y & (not z)) | (x & z) */ + ld.w $t0,$a1,$k_next*4 /* (NEXT STEP) X[$k_next] */ + add.w $dst,$t7,$t1 /* dst += ... */ + add.w $t2,$z,$t0 /* (NEXT STEP) dst + X[$k_next] */ +EOF + + $code .= " rotri.w $dst,$dst,32-$s /* dst <<< s */\n"; + if ($pos != 1) { + $code .= " move $t0,$y /* (NEXT STEP) z' = $y */\n"; + $code .= " nor $t1,$zero,$y /* (NEXT STEP) not z' = not $y */\n"; + } else { + $code .= " xor $t1,$a6,$a7 /* (NEXT ROUND) $t1 = y ^ z */\n"; + } + $code .= " add.w $dst,$dst,$x /* dst += x */\n"; +} + +# round3_step() does: +# dst = x + ((dst + H(x,y,z) + X[k] + T_i) <<< s) +# $t1 = y ^ z +# $t2 = dst + X[k_next] +sub round3_step +{ + my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; + my $T_i_h = ($T_i & 0xfffff000) >> 12; + my $T_i_l = $T_i & 0xfff; + $T_i_h = -((1<<32) - (0xfff00000 | $T_i_h)) if ($T_i_h >= (1<<19)); + + $code .= <<EOF; + lu12i.w $t8,$T_i_h /* load bits [31:12] of Constant */ + xor $t1,$x,$t1 /* x ^ ... */ + ori $t8,$t8,$T_i_l /* load bits [11:0] of Constant */ + add.w $t7,$t2,$t8 /* dst + X[k] + Const */ + ld.w $t0,$a1,$k_next*4 /* (NEXT STEP) X[$k_next] */ + add.w $dst,$t7,$t1 /* dst += ... */ + add.w $t2,$z,$t0 /* (NEXT STEP) dst + X[$k_next] */ +EOF + + $code .= " rotri.w $dst,$dst,32-$s /* dst <<< s */\n"; + if ($pos != 1) { + $code .= " xor $t1,$x,$y /* (NEXT STEP) y ^ z */\n"; + } else { + $code .= " nor $t1,$zero,$a7 /* (NEXT ROUND) $t1 = not z */\n"; + } + $code .= " add.w $dst,$dst,$x /* dst += x */\n"; +} + +# round4_step() does: +# dst = x + ((dst + I(x,y,z) + X[k] + T_i) <<< s) +# $t1 = not z' (copy of not z for the next step) +# $t2 = dst + X[k_next] +sub round4_step +{ + my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_; + my $T_i_h = ($T_i & 0xfffff000) >> 12; + my $T_i_l = $T_i & 0xfff; + $T_i_h = -((1<<32) - (0xfff00000 | $T_i_h)) if ($T_i_h >= (1<<19)); + + $code .= <<EOF; + lu12i.w $t8,$T_i_h /* load bits [31:12] of Constant */ + or $t1,$x,$t1 /* x | ... */ + ori $t8,$t8,$T_i_l /* load bits [11:0] of Constant */ + xor $t1,$y,$t1 /* y ^ ... */ + add.w $t7,$t2,$t8 /* dst + X[k] + Const */ +EOF + + if ($pos != 1) { + $code .= " ld.w $t0,$a1,$k_next*4 /* (NEXT STEP) X[$k_next] */\n"; + $code .= " add.w $dst,$t7,$t1 /* dst += ... */\n"; + $code .= " add.w $t2,$z,$t0 /* (NEXT STEP) dst + X[$k_next] */\n"; + $code .= " rotri.w $dst,$dst,32-$s /* dst <<< s */\n"; + $code .= " nor $t1,$zero,$y /* (NEXT STEP) not z' = not $y */\n"; + $code .= " add.w $dst,$dst,$x /* dst += x */\n"; + } else { + $code .= " add.w $a4,$t3,$a4 /* (NEXT LOOP) add old value of A */\n"; + $code .= " add.w $dst,$t7,$t1 /* dst += ... */\n"; + $code .= " add.w $a7,$t6,$a7 /* (NEXT LOOP) add old value of D */\n"; + $code .= " rotri.w $dst,$dst,32-$s /* dst <<< s */\n"; + $code .= " addi.d $a1,$a1,64 /* (NEXT LOOP) ptr += 64 */\n"; + $code .= " add.w $dst,$dst,$x /* dst += x */\n"; + } +} + +$code .= <<EOF; +.text + +.globl ossl_md5_block_asm_data_order +.type ossl_md5_block_asm_data_order function +ossl_md5_block_asm_data_order: + # $a0 = arg #1 (ctx, MD5_CTX pointer) + # $a1 = arg #2 (ptr, data pointer) + # $a2 = arg #3 (nbr, number of 16-word blocks to process) + beqz $a2,.Lend # cmp nbr with 0, jmp if nbr == 0 + + # ptr is '$a1' + # end is '$a3' + slli.d $t0,$a2,6 + add.d $a3,$a1,$t0 + + # A is '$a4' + # B is '$a5' + # C is '$a6' + # D is '$a7' + ld.w $a4,$a0,0 # a4 = ctx->A + ld.w $a5,$a0,4 # a5 = ctx->B + ld.w $a6,$a0,8 # a6 = ctx->C + ld.w $a7,$a0,12 # a7 = ctx->D + +# BEGIN of loop over 16-word blocks +.align 6 +.Lloop: + # save old values of A, B, C, D + move $t3,$a4 + move $t4,$a5 + move $t5,$a6 + move $t6,$a7 + + preld 0,$a1,0 + preld 0,$a1,64 +EOF + +round1_step(-1, $a4, $a5, $a6, $a7, '1', 0xd76aa478, '7'); +round1_step(0, $a7, $a4, $a5, $a6, '2', 0xe8c7b756, '12'); +round1_step(0, $a6, $a7, $a4, $a5, '3', 0x242070db, '17'); +round1_step(0, $a5, $a6, $a7, $a4, '4', 0xc1bdceee, '22'); +round1_step(0, $a4, $a5, $a6, $a7, '5', 0xf57c0faf, '7'); +round1_step(0, $a7, $a4, $a5, $a6, '6', 0x4787c62a, '12'); +round1_step(0, $a6, $a7, $a4, $a5, '7', 0xa8304613, '17'); +round1_step(0, $a5, $a6, $a7, $a4, '8', 0xfd469501, '22'); +round1_step(0, $a4, $a5, $a6, $a7, '9', 0x698098d8, '7'); +round1_step(0, $a7, $a4, $a5, $a6, '10', 0x8b44f7af, '12'); +round1_step(0, $a6, $a7, $a4, $a5, '11', 0xffff5bb1, '17'); +round1_step(0, $a5, $a6, $a7, $a4, '12', 0x895cd7be, '22'); +round1_step(0, $a4, $a5, $a6, $a7, '13', 0x6b901122, '7'); +round1_step(0, $a7, $a4, $a5, $a6, '14', 0xfd987193, '12'); +round1_step(0, $a6, $a7, $a4, $a5, '15', 0xa679438e, '17'); +round1_step(1, $a5, $a6, $a7, $a4, '1', 0x49b40821, '22'); + +round2_step(-1, $a4, $a5, $a6, $a7, '6', 0xf61e2562, '5'); +round2_step(0, $a7, $a4, $a5, $a6, '11', 0xc040b340, '9'); +round2_step(0, $a6, $a7, $a4, $a5, '0', 0x265e5a51, '14'); +round2_step(0, $a5, $a6, $a7, $a4, '5', 0xe9b6c7aa, '20'); +round2_step(0, $a4, $a5, $a6, $a7, '10', 0xd62f105d, '5'); +round2_step(0, $a7, $a4, $a5, $a6, '15', 0x2441453, '9'); +round2_step(0, $a6, $a7, $a4, $a5, '4', 0xd8a1e681, '14'); +round2_step(0, $a5, $a6, $a7, $a4, '9', 0xe7d3fbc8, '20'); +round2_step(0, $a4, $a5, $a6, $a7, '14', 0x21e1cde6, '5'); +round2_step(0, $a7, $a4, $a5, $a6, '3', 0xc33707d6, '9'); +round2_step(0, $a6, $a7, $a4, $a5, '8', 0xf4d50d87, '14'); +round2_step(0, $a5, $a6, $a7, $a4, '13', 0x455a14ed, '20'); +round2_step(0, $a4, $a5, $a6, $a7, '2', 0xa9e3e905, '5'); +round2_step(0, $a7, $a4, $a5, $a6, '7', 0xfcefa3f8, '9'); +round2_step(0, $a6, $a7, $a4, $a5, '12', 0x676f02d9, '14'); +round2_step(1, $a5, $a6, $a7, $a4, '5', 0x8d2a4c8a, '20'); + +round3_step(-1, $a4, $a5, $a6, $a7, '8', 0xfffa3942, '4'); +round3_step(0, $a7, $a4, $a5, $a6, '11', 0x8771f681, '11'); +round3_step(0, $a6, $a7, $a4, $a5, '14', 0x6d9d6122, '16'); +round3_step(0, $a5, $a6, $a7, $a4, '1', 0xfde5380c, '23'); +round3_step(0, $a4, $a5, $a6, $a7, '4', 0xa4beea44, '4'); +round3_step(0, $a7, $a4, $a5, $a6, '7', 0x4bdecfa9, '11'); +round3_step(0, $a6, $a7, $a4, $a5, '10', 0xf6bb4b60, '16'); +round3_step(0, $a5, $a6, $a7, $a4, '13', 0xbebfbc70, '23'); +round3_step(0, $a4, $a5, $a6, $a7, '0', 0x289b7ec6, '4'); +round3_step(0, $a7, $a4, $a5, $a6, '3', 0xeaa127fa, '11'); +round3_step(0, $a6, $a7, $a4, $a5, '6', 0xd4ef3085, '16'); +round3_step(0, $a5, $a6, $a7, $a4, '9', 0x4881d05, '23'); +round3_step(0, $a4, $a5, $a6, $a7, '12', 0xd9d4d039, '4'); +round3_step(0, $a7, $a4, $a5, $a6, '15', 0xe6db99e5, '11'); +round3_step(0, $a6, $a7, $a4, $a5, '2', 0x1fa27cf8, '16'); +round3_step(1, $a5, $a6, $a7, $a4, '0', 0xc4ac5665, '23'); + +round4_step(-1, $a4, $a5, $a6, $a7, '7', 0xf4292244, '6'); +round4_step(0, $a7, $a4, $a5, $a6, '14', 0x432aff97, '10'); +round4_step(0, $a6, $a7, $a4, $a5, '5', 0xab9423a7, '15'); +round4_step(0, $a5, $a6, $a7, $a4, '12', 0xfc93a039, '21'); +round4_step(0, $a4, $a5, $a6, $a7, '3', 0x655b59c3, '6'); +round4_step(0, $a7, $a4, $a5, $a6, '10', 0x8f0ccc92, '10'); +round4_step(0, $a6, $a7, $a4, $a5, '1', 0xffeff47d, '15'); +round4_step(0, $a5, $a6, $a7, $a4, '8', 0x85845dd1, '21'); +round4_step(0, $a4, $a5, $a6, $a7, '15', 0x6fa87e4f, '6'); +round4_step(0, $a7, $a4, $a5, $a6, '6', 0xfe2ce6e0, '10'); +round4_step(0, $a6, $a7, $a4, $a5, '13', 0xa3014314, '15'); +round4_step(0, $a5, $a6, $a7, $a4, '4', 0x4e0811a1, '21'); +round4_step(0, $a4, $a5, $a6, $a7, '11', 0xf7537e82, '6'); +round4_step(0, $a7, $a4, $a5, $a6, '2', 0xbd3af235, '10'); +round4_step(0, $a6, $a7, $a4, $a5, '9', 0x2ad7d2bb, '15'); +round4_step(1, $a5, $a6, $a7, $a4, '0', 0xeb86d391, '21'); + +$code .= <<EOF; + # add old values of B, C + add.w $a5,$t4,$a5 + add.w $a6,$t5,$a6 + + bltu $a1,$a3,.Lloop # jmp if ptr < end + + st.w $a4,$a0,0 # ctx->A = A + st.w $a5,$a0,4 # ctx->B = B + st.w $a6,$a0,8 # ctx->C = C + st.w $a7,$a0,12 # ctx->D = D + +.Lend: + jr $ra +.size ossl_md5_block_asm_data_order,.-ossl_md5_block_asm_data_order +EOF + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +print $code; + +close STDOUT; diff --git a/crypto/md5/build.info b/crypto/md5/build.info index 5d89a37883..307e5037ee 100644 --- a/crypto/md5/build.info +++ b/crypto/md5/build.info @@ -5,6 +5,7 @@ IF[{- !$disabled{asm} -}] $MD5ASM_x86=md5-586.S $MD5ASM_x86_64=md5-x86_64.s $MD5ASM_aarch64=md5-aarch64.S + $MD5ASM_loongarch64=md5-loongarch64.S $MD5ASM_sparcv9=md5-sparcv9.S # Now that we have defined all the arch specific variables, use the @@ -39,5 +40,8 @@ GENERATE[md5-x86_64.s]=asm/md5-x86_64.pl GENERATE[md5-aarch64.S]=asm/md5-aarch64.pl INCLUDE[md5-aarch64.o]=.. +GENERATE[md5-loongarch64.S]=asm/md5-loongarch64.pl +INCLUDE[md5-loongarch64.o]=.. + GENERATE[md5-sparcv9.S]=asm/md5-sparcv9.pl INCLUDE[md5-sparcv9.o]=.. diff --git a/crypto/md5/md5_local.h b/crypto/md5/md5_local.h index 894567c3f5..fab8bb9dae 100644 --- a/crypto/md5/md5_local.h +++ b/crypto/md5/md5_local.h @@ -15,7 +15,8 @@ #ifdef MD5_ASM # if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || \ - defined(_M_X64) || defined(__aarch64__) + defined(_M_X64) || defined(__aarch64__) || \ + (defined(__loongarch__) && __loongarch_grlen == 64) # define md5_block_data_order ossl_md5_block_asm_data_order # elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64) # define md5_block_data_order ossl_md5_block_asm_data_order |