From b9b91dad9f1cae0b218fcb57a4545027c4951678 Mon Sep 17 00:00:00 2001
From: Jonathan Swinney
Date: Wed, 27 Oct 2021 16:50:30 +0000
Subject: md5: add assembly implementation for aarch64

This change significantly improves md5 performance by using a hand-optimized
assembly implementation of the inner loop of the md5 calculation. The
instructions are carefully ordered to separate data dependencies as much as
possible.

Test with:

$ openssl speed md5

AWS Graviton 2
type             16 bytes     64 bytes    256 bytes   1024 bytes   8192 bytes  16384 bytes
md5             46990.60k   132778.65k   270376.96k   364718.08k   405962.75k   409201.32k
md5-modified    51725.23k   152236.22k   323469.14k   453869.57k   514102.61k   519056.04k
                     +10%         +15%         +20%         +24%         +27%         +27%

Apple M1
type             16 bytes     64 bytes    256 bytes   1024 bytes   8192 bytes  16384 bytes
md5             74634.39k   195561.25k   375434.45k   491004.23k   532361.40k   536636.48k
md5-modified    84637.11k   229017.09k   444609.62k   588069.50k   655114.24k   660850.56k
                     +13%         +17%         +18%         +20%         +23%         +23%

Reviewed-by: Matt Caswell
Reviewed-by: Paul Dale
(Merged from https://github.com/openssl/openssl/pull/16928)

(cherry picked from commit 04904a0fff639c058d38b355d75485ca5dde0a89)
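As a reading aid (not part of the generated code, and not the C it replaces in
md5_dgst.c), each group of instructions in the generated assembly computes one
step of the usual MD5 round, in the algebraic form the assembly comments
describe: round 1 uses F(x,y,z) = ((y ^ z) & x) ^ z, which is equivalent to the
textbook (x & y) | (~x & z) but needs one fewer operation, and every left
rotate by s bits is emitted as "ror #(32 - s)". A minimal C sketch of one such
step, with hypothetical helper names, looks like this:

    #include <stdint.h>

    /* Rotate left by s; the assembly emits the equivalent "ror #(32 - s)". */
    static inline uint32_t rotl32(uint32_t v, unsigned s)
    {
        return (v << s) | (v >> (32 - s));
    }

    /* Round-1 auxiliary function, in the form used by the assembly comments:
     * F(x,y,z) = ((y ^ z) & x) ^ z  ==  (x & y) | (~x & z)                  */
    static inline uint32_t md5_f(uint32_t x, uint32_t y, uint32_t z)
    {
        return ((y ^ z) & x) ^ z;
    }

    /* One FF step: a = b + rotl(a + F(b,c,d) + m + k, s).
     * The first step in the block loop corresponds to
     * md5_ff(A, B, C, D, M[0], 0xd76aa478, 7).                              */
    static inline uint32_t md5_ff(uint32_t a, uint32_t b, uint32_t c,
                                  uint32_t d, uint32_t m, uint32_t k,
                                  unsigned s)
    {
        return b + rotl32(a + md5_f(b, c, d) + m + k, s);
    }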
---
 crypto/md5/asm/md5-aarch64.pl | 693 ++++++++++++++++++++++++++++++++++++++++++
 crypto/md5/build.info         |   2 +
 crypto/md5/md5_local.h        |   3 +-
 3 files changed, 697 insertions(+), 1 deletion(-)
 create mode 100755 crypto/md5/asm/md5-aarch64.pl

diff --git a/crypto/md5/asm/md5-aarch64.pl b/crypto/md5/asm/md5-aarch64.pl
new file mode 100755
index 0000000000..94d727fc9c
--- /dev/null
+++ b/crypto/md5/asm/md5-aarch64.pl
@@ -0,0 +1,693 @@
+#! /usr/bin/env perl
+# Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+# MD5 optimized for aarch64.
+
+use strict;
+
+my $code;
+
+#no warnings qw(uninitialized);
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+    or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+$code .= <<EOF;
+.text
+.globl ossl_md5_block_asm_data_order
+.type ossl_md5_block_asm_data_order,%function
+ossl_md5_block_asm_data_order:
+ // Save callee-saved registers x19-x28; they are restored in the epilogue
+ stp x19,x20,[sp,#-80]!
+ stp x21,x22,[sp,#16]
+ stp x23,x24,[sp,#32]
+ stp x25,x26,[sp,#48]
+ stp x27,x28,[sp,#64]
+ ldp w10, w11, [x0] // Load MD5 state->A and state->B
+ ldp w12, w13, [x0, #8] // Load MD5 state->C and state->D
+.align 5
+ossl_md5_blocks_loop:
+ eor x17, x12, x13 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ and x16, x17, x11 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ ldp x15, x3, [x1] // Load 4 words of input data0 M[0]/0
+ eor x14, x16, x13 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ movz x9, #0xa478 // Load lower half of constant 0xd76aa478
+ movk x9, #0xd76a, lsl #16 // Load upper half of constant 0xd76aa478
+ add w8, w10, w15 // Add dest value
+ add w7, w8, w9 // Add constant 0xd76aa478
+ add w6, w7, w14 // Add aux function result
+ ror w6, w6, #25 // Rotate left s=7 bits
+ eor x5, x11, x12 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ add w4, w11, w6 // Add X parameter round 1 A=FF(A, B, C, D, 0xd76aa478, s=7, M[0])
+ and x8, x5, x4 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ eor x17, x8, x12 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ movz x16, #0xb756 // Load lower half of constant 0xe8c7b756
+ movk x16, #0xe8c7, lsl #16 // Load upper half of constant 0xe8c7b756
+ lsr x20, x15, #32 // Right shift high input value containing M[1]
+ add w9, w13, w20 // Add dest value
+ add w7, w9, w16 // Add constant 0xe8c7b756
+ add w14, w7, w17 // Add aux function result
+ ror w14, w14, #20 // Rotate left s=12 bits
+ eor x6, x4, x11 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ add w5, w4, w14 // Add X parameter round 1 D=FF(D, A, B, C, 0xe8c7b756, s=12, M[1])
+ and x8, x6, x5 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ eor x9, x8, x11 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ movz x16, #0x70db // Load lower half of constant 0x242070db
+ movk x16, #0x2420, lsl #16 // Load upper half of constant 0x242070db
+ add w7, w12, w3 // Add dest value
+ add w17, w7, w16 // Add constant 0x242070db
+ add w14, w17, w9 // Add aux function result
+ ror w14, w14, #15 // Rotate left s=17 bits
+ eor x6, x5, x4 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ add w8, w5, w14 // Add X parameter round 1 C=FF(C, D, A, B, 0x242070db, s=17, M[2])
+ and x7, x6, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ eor x16, x7, x4 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ movz x9, #0xceee // Load lower half of constant 0xc1bdceee
+ movk x9, #0xc1bd, lsl #16 // Load upper half of constant 0xc1bdceee
+ lsr x21, x3, #32 // Right shift high input value containing M[3]
+ add w14, w11, w21 // Add dest value
+ add w6, w14, w9 // Add constant 0xc1bdceee
+ add w7, w6, w16 // Add aux function result
+ ror w7, w7, #10 // Rotate left s=22 bits
+ eor x17, x8, x5 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ add w9, w8, w7 // Add X parameter round 1 B=FF(B, C, D, A, 0xc1bdceee, s=22, M[3])
+ ldp x14, x7, [x1, #16] // Load 4 words of input data0 M[4]/0w
+ and x16, x17, x9 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ eor x6, x16, x5 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
+ movz x16, #0xfaf // Load lower half of constant 0xf57c0faf
+ movk x16, #0xf57c, lsl #16 // Load upper half of constant 0xf57c0faf
+ add w17, w4, w14 // Add dest value
+ add w16, w17, w16 // Add constant 0xf57c0faf
+ add w4,
w16, w6 // Add aux function result + ror w4, w4, #25 // Rotate left s=7 bits + eor x16, x9, x8 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w17, w9, w4 // Add X parameter round 1 A=FF(A, B, C, D, 0xf57c0faf, s=7, M[4]) + and x16, x16, x17 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x6, x16, x8 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x4, #0xc62a // Load lower half of constant 0x4787c62a + movk x4, #0x4787, lsl #16 // Load upper half of constant 0x4787c62a + lsr x22, x14, #32 // Right shift high input value containing M[5] + add w16, w5, w22 // Add dest value + add w16, w16, w4 // Add constant 0x4787c62a + add w5, w16, w6 // Add aux function result + ror w5, w5, #20 // Rotate left s=12 bits + eor x4, x17, x9 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w19, w17, w5 // Add X parameter round 1 D=FF(D, A, B, C, 0x4787c62a, s=12, M[5]) + and x6, x4, x19 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x5, x6, x9 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x4, #0x4613 // Load lower half of constant 0xa8304613 + movk x4, #0xa830, lsl #16 // Load upper half of constant 0xa8304613 + add w6, w8, w7 // Add dest value + add w8, w6, w4 // Add constant 0xa8304613 + add w4, w8, w5 // Add aux function result + ror w4, w4, #15 // Rotate left s=17 bits + eor x6, x19, x17 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w8, w19, w4 // Add X parameter round 1 C=FF(C, D, A, B, 0xa8304613, s=17, M[6]) + and x5, x6, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x4, x5, x17 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x6, #0x9501 // Load lower half of constant 0xfd469501 + movk x6, #0xfd46, lsl #16 // Load upper half of constant 0xfd469501 + lsr x23, x7, #32 // Right shift high input value containing M[7] + add w9, w9, w23 // Add dest value + add w5, w9, w6 // Add constant 0xfd469501 + add w9, w5, w4 // Add aux function result + ror w9, w9, #10 // Rotate left s=22 bits + eor x6, x8, x19 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w4, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0xfd469501, s=22, M[7]) + ldp x5, x16, [x1, #32] // Load 4 words of input data0 M[8]/0 + and x9, x6, x4 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x6, x9, x19 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x9, #0x98d8 // Load lower half of constant 0x698098d8 + movk x9, #0x6980, lsl #16 // Load upper half of constant 0x698098d8 + add w17, w17, w5 // Add dest value + add w9, w17, w9 // Add constant 0x698098d8 + add w17, w9, w6 // Add aux function result + ror w17, w17, #25 // Rotate left s=7 bits + eor x9, x4, x8 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w6, w4, w17 // Add X parameter round 1 A=FF(A, B, C, D, 0x698098d8, s=7, M[8]) + and x17, x9, x6 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x9, x17, x8 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x17, #0xf7af // Load lower half of constant 0x8b44f7af + movk x17, #0x8b44, lsl #16 // Load upper half of constant 0x8b44f7af + lsr x24, x5, #32 // Right shift high input value containing M[9] + add w19, w19, w24 // Add dest value + add w17, w19, w17 // Add constant 0x8b44f7af + add w19, w17, w9 // Add aux function result + ror w19, w19, #20 // Rotate left s=12 bits + eor x9, x6, x4 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w17, w6, w19 // Add X parameter round 1 D=FF(D, A, B, C, 0x8b44f7af, s=12, M[9]) + and x9, x9, x17 // Continue aux function round 1 
F(x,y,z)=(((y^z)&x)^z) + eor x9, x9, x4 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x11, #0x5bb1 // Load lower half of constant 0xffff5bb1 + movk x11, #0xffff, lsl #16 // Load upper half of constant 0xffff5bb1 + add w8, w8, w16 // Add dest value + add w8, w8, w11 // Add constant 0xffff5bb1 + add w8, w8, w9 // Add aux function result + ror w8, w8, #15 // Rotate left s=17 bits + eor x9, x17, x6 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w8, w17, w8 // Add X parameter round 1 C=FF(C, D, A, B, 0xffff5bb1, s=17, M[10]) + and x9, x9, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x9, x9, x6 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x11, #0xd7be // Load lower half of constant 0x895cd7be + movk x11, #0x895c, lsl #16 // Load upper half of constant 0x895cd7be + lsr x25, x16, #32 // Right shift high input value containing M[11] + add w4, w4, w25 // Add dest value + add w4, w4, w11 // Add constant 0x895cd7be + add w9, w4, w9 // Add aux function result + ror w9, w9, #10 // Rotate left s=22 bits + eor x4, x8, x17 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w9, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0x895cd7be, s=22, M[11]) + ldp x11, x12, [x1, #48] // Load 4 words of input data0 M[12]/0 + and x4, x4, x9 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x4, x4, x17 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x19, #0x1122 // Load lower half of constant 0x6b901122 + movk x19, #0x6b90, lsl #16 // Load upper half of constant 0x6b901122 + add w6, w6, w11 // Add dest value + add w6, w6, w19 // Add constant 0x6b901122 + add w4, w6, w4 // Add aux function result + ror w4, w4, #25 // Rotate left s=7 bits + eor x6, x9, x8 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w4, w9, w4 // Add X parameter round 1 A=FF(A, B, C, D, 0x6b901122, s=7, M[12]) + and x6, x6, x4 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x6, x6, x8 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x19, #0x7193 // Load lower half of constant 0xfd987193 + movk x19, #0xfd98, lsl #16 // Load upper half of constant 0xfd987193 + lsr x26, x11, #32 // Right shift high input value containing M[13] + add w17, w17, w26 // Add dest value + add w17, w17, w19 // Add constant 0xfd987193 + add w17, w17, w6 // Add aux function result + ror w17, w17, #20 // Rotate left s=12 bits + eor x6, x4, x9 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w17, w4, w17 // Add X parameter round 1 D=FF(D, A, B, C, 0xfd987193, s=12, M[13]) + and x6, x6, x17 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x6, x6, x9 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x13, #0x438e // Load lower half of constant 0xa679438e + movk x13, #0xa679, lsl #16 // Load upper half of constant 0xa679438e + add w8, w8, w12 // Add dest value + add w8, w8, w13 // Add constant 0xa679438e + add w8, w8, w6 // Add aux function result + ror w8, w8, #15 // Rotate left s=17 bits + eor x6, x17, x4 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z) + add w8, w17, w8 // Add X parameter round 1 C=FF(C, D, A, B, 0xa679438e, s=17, M[14]) + and x6, x6, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z) + eor x6, x6, x4 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z) + movz x13, #0x821 // Load lower half of constant 0x49b40821 + movk x13, #0x49b4, lsl #16 // Load upper half of constant 0x49b40821 + lsr x27, x12, #32 // Right shift high input value containing M[15] + add w9, w9, w27 // Add dest value + add w9, w9, w13 
// Add constant 0x49b40821 + add w9, w9, w6 // Add aux function result + ror w9, w9, #10 // Rotate left s=22 bits + bic x6, x8, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w9, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0x49b40821, s=22, M[15]) + and x13, x9, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0x2562 // Load lower half of constant 0xf61e2562 + movk x13, #0xf61e, lsl #16 // Load upper half of constant 0xf61e2562 + add w4, w4, w20 // Add dest value + add w4, w4, w13 // Add constant 0xf61e2562 + add w4, w4, w6 // Add aux function result + ror w4, w4, #27 // Rotate left s=5 bits + bic x6, x9, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xf61e2562, s=5, M[1]) + and x13, x4, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0xb340 // Load lower half of constant 0xc040b340 + movk x13, #0xc040, lsl #16 // Load upper half of constant 0xc040b340 + add w17, w17, w7 // Add dest value + add w17, w17, w13 // Add constant 0xc040b340 + add w17, w17, w6 // Add aux function result + ror w17, w17, #23 // Rotate left s=9 bits + bic x6, x4, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xc040b340, s=9, M[6]) + and x13, x17, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0x5a51 // Load lower half of constant 0x265e5a51 + movk x13, #0x265e, lsl #16 // Load upper half of constant 0x265e5a51 + add w8, w8, w25 // Add dest value + add w8, w8, w13 // Add constant 0x265e5a51 + add w8, w8, w6 // Add aux function result + ror w8, w8, #18 // Rotate left s=14 bits + bic x6, x17, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0x265e5a51, s=14, M[11]) + and x13, x8, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0xc7aa // Load lower half of constant 0xe9b6c7aa + movk x13, #0xe9b6, lsl #16 // Load upper half of constant 0xe9b6c7aa + add w9, w9, w15 // Add dest value + add w9, w9, w13 // Add constant 0xe9b6c7aa + add w9, w9, w6 // Add aux function result + ror w9, w9, #12 // Rotate left s=20 bits + bic x6, x8, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0xe9b6c7aa, s=20, M[0]) + and x13, x9, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0x105d // Load lower half of constant 0xd62f105d + movk x13, #0xd62f, lsl #16 // Load upper half of constant 0xd62f105d + add w4, w4, w22 // Add dest value + add w4, w4, w13 // Add constant 0xd62f105d + add w4, w4, w6 // Add aux function result + ror w4, w4, #27 // Rotate left s=5 bits + bic x6, x9, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xd62f105d, s=5, M[5]) + and x13, x4, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0x1453 // Load lower half of constant 0x2441453 + movk x13, #0x244, lsl #16 // Load upper half of constant 0x2441453 + add w17, w17, w16 // Add dest value + add w17, w17, w13 // Add constant 
0x2441453 + add w17, w17, w6 // Add aux function result + ror w17, w17, #23 // Rotate left s=9 bits + bic x6, x4, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0x2441453, s=9, M[10]) + and x13, x17, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0xe681 // Load lower half of constant 0xd8a1e681 + movk x13, #0xd8a1, lsl #16 // Load upper half of constant 0xd8a1e681 + add w8, w8, w27 // Add dest value + add w8, w8, w13 // Add constant 0xd8a1e681 + add w8, w8, w6 // Add aux function result + ror w8, w8, #18 // Rotate left s=14 bits + bic x6, x17, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0xd8a1e681, s=14, M[15]) + and x13, x8, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0xfbc8 // Load lower half of constant 0xe7d3fbc8 + movk x13, #0xe7d3, lsl #16 // Load upper half of constant 0xe7d3fbc8 + add w9, w9, w14 // Add dest value + add w9, w9, w13 // Add constant 0xe7d3fbc8 + add w9, w9, w6 // Add aux function result + ror w9, w9, #12 // Rotate left s=20 bits + bic x6, x8, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0xe7d3fbc8, s=20, M[4]) + and x13, x9, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0xcde6 // Load lower half of constant 0x21e1cde6 + movk x13, #0x21e1, lsl #16 // Load upper half of constant 0x21e1cde6 + add w4, w4, w24 // Add dest value + add w4, w4, w13 // Add constant 0x21e1cde6 + add w4, w4, w6 // Add aux function result + ror w4, w4, #27 // Rotate left s=5 bits + bic x6, x9, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0x21e1cde6, s=5, M[9]) + and x13, x4, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0x7d6 // Load lower half of constant 0xc33707d6 + movk x13, #0xc337, lsl #16 // Load upper half of constant 0xc33707d6 + add w17, w17, w12 // Add dest value + add w17, w17, w13 // Add constant 0xc33707d6 + add w17, w17, w6 // Add aux function result + ror w17, w17, #23 // Rotate left s=9 bits + bic x6, x4, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xc33707d6, s=9, M[14]) + and x13, x17, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0xd87 // Load lower half of constant 0xf4d50d87 + movk x13, #0xf4d5, lsl #16 // Load upper half of constant 0xf4d50d87 + add w8, w8, w21 // Add dest value + add w8, w8, w13 // Add constant 0xf4d50d87 + add w8, w8, w6 // Add aux function result + ror w8, w8, #18 // Rotate left s=14 bits + bic x6, x17, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0xf4d50d87, s=14, M[3]) + and x13, x8, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0x14ed // Load lower half of constant 0x455a14ed + movk x13, #0x455a, lsl #16 // Load upper half of constant 0x455a14ed + add w9, w9, w5 // Add dest value + add w9, w9, w13 // Add constant 0x455a14ed + add 
w9, w9, w6 // Add aux function result + ror w9, w9, #12 // Rotate left s=20 bits + bic x6, x8, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0x455a14ed, s=20, M[8]) + and x13, x9, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0xe905 // Load lower half of constant 0xa9e3e905 + movk x13, #0xa9e3, lsl #16 // Load upper half of constant 0xa9e3e905 + add w4, w4, w26 // Add dest value + add w4, w4, w13 // Add constant 0xa9e3e905 + add w4, w4, w6 // Add aux function result + ror w4, w4, #27 // Rotate left s=5 bits + bic x6, x9, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xa9e3e905, s=5, M[13]) + and x13, x4, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0xa3f8 // Load lower half of constant 0xfcefa3f8 + movk x13, #0xfcef, lsl #16 // Load upper half of constant 0xfcefa3f8 + add w17, w17, w3 // Add dest value + add w17, w17, w13 // Add constant 0xfcefa3f8 + add w17, w17, w6 // Add aux function result + ror w17, w17, #23 // Rotate left s=9 bits + bic x6, x4, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xfcefa3f8, s=9, M[2]) + and x13, x17, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0x2d9 // Load lower half of constant 0x676f02d9 + movk x13, #0x676f, lsl #16 // Load upper half of constant 0x676f02d9 + add w8, w8, w23 // Add dest value + add w8, w8, w13 // Add constant 0x676f02d9 + add w8, w8, w6 // Add aux function result + ror w8, w8, #18 // Rotate left s=14 bits + bic x6, x17, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0x676f02d9, s=14, M[7]) + and x13, x8, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y)) + movz x13, #0x4c8a // Load lower half of constant 0x8d2a4c8a + movk x13, #0x8d2a, lsl #16 // Load upper half of constant 0x8d2a4c8a + add w9, w9, w11 // Add dest value + add w9, w9, w13 // Add constant 0x8d2a4c8a + add w9, w9, w6 // Add aux function result + eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z) + ror w9, w9, #12 // Rotate left s=20 bits + movz x10, #0x3942 // Load lower half of constant 0xfffa3942 + add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0x8d2a4c8a, s=20, M[12]) + movk x10, #0xfffa, lsl #16 // Load upper half of constant 0xfffa3942 + add w4, w4, w22 // Add dest value + eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z) + add w4, w4, w10 // Add constant 0xfffa3942 + add w4, w4, w6 // Add aux function result + ror w4, w4, #28 // Rotate left s=4 bits + eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z) + movz x10, #0xf681 // Load lower half of constant 0x8771f681 + add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0xfffa3942, s=4, M[5]) + movk x10, #0x8771, lsl #16 // Load upper half of constant 0x8771f681 + add w17, w17, w5 // Add dest value + eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z) + add w17, w17, w10 // Add constant 0x8771f681 + add w17, w17, w6 // Add aux function result + eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z) + ror w17, w17, #21 // Rotate left s=11 bits + movz x13, #0x6122 
// Load lower half of constant 0x6d9d6122 + add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0x8771f681, s=11, M[8]) + movk x13, #0x6d9d, lsl #16 // Load upper half of constant 0x6d9d6122 + add w8, w8, w25 // Add dest value + eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z) + add w8, w8, w13 // Add constant 0x6d9d6122 + add w8, w8, w6 // Add aux function result + ror w8, w8, #16 // Rotate left s=16 bits + eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z) + movz x13, #0x380c // Load lower half of constant 0xfde5380c + add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0x6d9d6122, s=16, M[11]) + movk x13, #0xfde5, lsl #16 // Load upper half of constant 0xfde5380c + add w9, w9, w12 // Add dest value + eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z) + add w9, w9, w13 // Add constant 0xfde5380c + add w9, w9, w6 // Add aux function result + eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z) + ror w9, w9, #9 // Rotate left s=23 bits + movz x10, #0xea44 // Load lower half of constant 0xa4beea44 + add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0xfde5380c, s=23, M[14]) + movk x10, #0xa4be, lsl #16 // Load upper half of constant 0xa4beea44 + add w4, w4, w20 // Add dest value + eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z) + add w4, w4, w10 // Add constant 0xa4beea44 + add w4, w4, w6 // Add aux function result + ror w4, w4, #28 // Rotate left s=4 bits + eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z) + movz x10, #0xcfa9 // Load lower half of constant 0x4bdecfa9 + add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0xa4beea44, s=4, M[1]) + movk x10, #0x4bde, lsl #16 // Load upper half of constant 0x4bdecfa9 + add w17, w17, w14 // Add dest value + eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z) + add w17, w17, w10 // Add constant 0x4bdecfa9 + add w17, w17, w6 // Add aux function result + eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z) + ror w17, w17, #21 // Rotate left s=11 bits + movz x13, #0x4b60 // Load lower half of constant 0xf6bb4b60 + add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0x4bdecfa9, s=11, M[4]) + movk x13, #0xf6bb, lsl #16 // Load upper half of constant 0xf6bb4b60 + add w8, w8, w23 // Add dest value + eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z) + add w8, w8, w13 // Add constant 0xf6bb4b60 + add w8, w8, w6 // Add aux function result + ror w8, w8, #16 // Rotate left s=16 bits + eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z) + movz x13, #0xbc70 // Load lower half of constant 0xbebfbc70 + add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0xf6bb4b60, s=16, M[7]) + movk x13, #0xbebf, lsl #16 // Load upper half of constant 0xbebfbc70 + add w9, w9, w16 // Add dest value + eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z) + add w9, w9, w13 // Add constant 0xbebfbc70 + add w9, w9, w6 // Add aux function result + eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z) + ror w9, w9, #9 // Rotate left s=23 bits + movz x10, #0x7ec6 // Load lower half of constant 0x289b7ec6 + add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0xbebfbc70, s=23, M[10]) + movk x10, #0x289b, lsl #16 // Load upper half of constant 0x289b7ec6 + add w4, w4, w26 // Add dest value + eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z) + add w4, w4, w10 // Add constant 0x289b7ec6 + add w4, w4, w6 // Add aux function result + ror w4, w4, #28 // Rotate left s=4 bits + eor x6, x9, x8 // Begin aux 
function round 3 H(x,y,z)=(x^y^z) + movz x10, #0x27fa // Load lower half of constant 0xeaa127fa + add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0x289b7ec6, s=4, M[13]) + movk x10, #0xeaa1, lsl #16 // Load upper half of constant 0xeaa127fa + add w17, w17, w15 // Add dest value + eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z) + add w17, w17, w10 // Add constant 0xeaa127fa + add w17, w17, w6 // Add aux function result + eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z) + ror w17, w17, #21 // Rotate left s=11 bits + movz x13, #0x3085 // Load lower half of constant 0xd4ef3085 + add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0xeaa127fa, s=11, M[0]) + movk x13, #0xd4ef, lsl #16 // Load upper half of constant 0xd4ef3085 + add w8, w8, w21 // Add dest value + eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z) + add w8, w8, w13 // Add constant 0xd4ef3085 + add w8, w8, w6 // Add aux function result + ror w8, w8, #16 // Rotate left s=16 bits + eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z) + movz x13, #0x1d05 // Load lower half of constant 0x4881d05 + add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0xd4ef3085, s=16, M[3]) + movk x13, #0x488, lsl #16 // Load upper half of constant 0x4881d05 + add w9, w9, w7 // Add dest value + eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z) + add w9, w9, w13 // Add constant 0x4881d05 + add w9, w9, w6 // Add aux function result + eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z) + ror w9, w9, #9 // Rotate left s=23 bits + movz x10, #0xd039 // Load lower half of constant 0xd9d4d039 + add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0x4881d05, s=23, M[6]) + movk x10, #0xd9d4, lsl #16 // Load upper half of constant 0xd9d4d039 + add w4, w4, w24 // Add dest value + eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z) + add w4, w4, w10 // Add constant 0xd9d4d039 + add w4, w4, w6 // Add aux function result + ror w4, w4, #28 // Rotate left s=4 bits + eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z) + movz x10, #0x99e5 // Load lower half of constant 0xe6db99e5 + add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0xd9d4d039, s=4, M[9]) + movk x10, #0xe6db, lsl #16 // Load upper half of constant 0xe6db99e5 + add w17, w17, w11 // Add dest value + eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z) + add w17, w17, w10 // Add constant 0xe6db99e5 + add w17, w17, w6 // Add aux function result + eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z) + ror w17, w17, #21 // Rotate left s=11 bits + movz x13, #0x7cf8 // Load lower half of constant 0x1fa27cf8 + add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0xe6db99e5, s=11, M[12]) + movk x13, #0x1fa2, lsl #16 // Load upper half of constant 0x1fa27cf8 + add w8, w8, w27 // Add dest value + eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z) + add w8, w8, w13 // Add constant 0x1fa27cf8 + add w8, w8, w6 // Add aux function result + ror w8, w8, #16 // Rotate left s=16 bits + eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z) + movz x13, #0x5665 // Load lower half of constant 0xc4ac5665 + add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0x1fa27cf8, s=16, M[15]) + movk x13, #0xc4ac, lsl #16 // Load upper half of constant 0xc4ac5665 + add w9, w9, w3 // Add dest value + eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z) + add w9, w9, w13 // Add constant 0xc4ac5665 + add w9, w9, w6 // Add aux function result + ror w9, w9, #9 // 
Rotate left s=23 bits + movz x6, #0x2244 // Load lower half of constant 0xf4292244 + movk x6, #0xf429, lsl #16 // Load upper half of constant 0xf4292244 + add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0xc4ac5665, s=23, M[2]) + add w4, w4, w15 // Add dest value + orn x13, x9, x17 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w4, w4, w6 // Add constant 0xf4292244 + eor x6, x8, x13 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w4, w4, w6 // Add aux function result + ror w4, w4, #26 // Rotate left s=6 bits + movz x6, #0xff97 // Load lower half of constant 0x432aff97 + movk x6, #0x432a, lsl #16 // Load upper half of constant 0x432aff97 + add w4, w9, w4 // Add X parameter round 4 A=II(A, B, C, D, 0xf4292244, s=6, M[0]) + orn x10, x4, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w17, w17, w23 // Add dest value + eor x10, x9, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w17, w17, w6 // Add constant 0x432aff97 + add w6, w17, w10 // Add aux function result + ror w6, w6, #22 // Rotate left s=10 bits + movz x17, #0x23a7 // Load lower half of constant 0xab9423a7 + movk x17, #0xab94, lsl #16 // Load upper half of constant 0xab9423a7 + add w6, w4, w6 // Add X parameter round 4 D=II(D, A, B, C, 0x432aff97, s=10, M[7]) + add w8, w8, w12 // Add dest value + orn x10, x6, x9 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w8, w8, w17 // Add constant 0xab9423a7 + eor x17, x4, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w8, w8, w17 // Add aux function result + ror w8, w8, #17 // Rotate left s=15 bits + movz x17, #0xa039 // Load lower half of constant 0xfc93a039 + movk x17, #0xfc93, lsl #16 // Load upper half of constant 0xfc93a039 + add w8, w6, w8 // Add X parameter round 4 C=II(C, D, A, B, 0xab9423a7, s=15, M[14]) + orn x13, x8, x4 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w9, w9, w22 // Add dest value + eor x13, x6, x13 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w9, w9, w17 // Add constant 0xfc93a039 + add w17, w9, w13 // Add aux function result + ror w17, w17, #11 // Rotate left s=21 bits + movz x9, #0x59c3 // Load lower half of constant 0x655b59c3 + movk x9, #0x655b, lsl #16 // Load upper half of constant 0x655b59c3 + add w17, w8, w17 // Add X parameter round 4 B=II(B, C, D, A, 0xfc93a039, s=21, M[5]) + add w4, w4, w11 // Add dest value + orn x13, x17, x6 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w9, w4, w9 // Add constant 0x655b59c3 + eor x4, x8, x13 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w9, w9, w4 // Add aux function result + ror w9, w9, #26 // Rotate left s=6 bits + movz x4, #0xcc92 // Load lower half of constant 0x8f0ccc92 + movk x4, #0x8f0c, lsl #16 // Load upper half of constant 0x8f0ccc92 + add w9, w17, w9 // Add X parameter round 4 A=II(A, B, C, D, 0x655b59c3, s=6, M[12]) + orn x10, x9, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w6, w6, w21 // Add dest value + eor x10, x17, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w4, w6, w4 // Add constant 0x8f0ccc92 + add w6, w4, w10 // Add aux function result + ror w6, w6, #22 // Rotate left s=10 bits + movz x4, #0xf47d // Load lower half of constant 0xffeff47d + movk x4, #0xffef, lsl #16 // Load upper half of constant 0xffeff47d + add w6, w9, w6 // Add X parameter round 4 D=II(D, A, B, C, 0x8f0ccc92, s=10, M[3]) + add w8, w8, w16 // Add dest value + orn x10, x6, x17 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w8, w8, w4 // Add constant 0xffeff47d + eor x4, x9, x10 // End aux function 
round 4 I(x,y,z)=((~z|x)^y) + add w8, w8, w4 // Add aux function result + ror w8, w8, #17 // Rotate left s=15 bits + movz x4, #0x5dd1 // Load lower half of constant 0x85845dd1 + movk x4, #0x8584, lsl #16 // Load upper half of constant 0x85845dd1 + add w8, w6, w8 // Add X parameter round 4 C=II(C, D, A, B, 0xffeff47d, s=15, M[10]) + orn x10, x8, x9 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w15, w17, w20 // Add dest value + eor x17, x6, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w15, w15, w4 // Add constant 0x85845dd1 + add w4, w15, w17 // Add aux function result + ror w4, w4, #11 // Rotate left s=21 bits + movz x15, #0x7e4f // Load lower half of constant 0x6fa87e4f + movk x15, #0x6fa8, lsl #16 // Load upper half of constant 0x6fa87e4f + add w17, w8, w4 // Add X parameter round 4 B=II(B, C, D, A, 0x85845dd1, s=21, M[1]) + add w4, w9, w5 // Add dest value + orn x9, x17, x6 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w15, w4, w15 // Add constant 0x6fa87e4f + eor x4, x8, x9 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w9, w15, w4 // Add aux function result + ror w9, w9, #26 // Rotate left s=6 bits + movz x15, #0xe6e0 // Load lower half of constant 0xfe2ce6e0 + movk x15, #0xfe2c, lsl #16 // Load upper half of constant 0xfe2ce6e0 + add w4, w17, w9 // Add X parameter round 4 A=II(A, B, C, D, 0x6fa87e4f, s=6, M[8]) + orn x9, x4, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w6, w6, w27 // Add dest value + eor x9, x17, x9 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w15, w6, w15 // Add constant 0xfe2ce6e0 + add w6, w15, w9 // Add aux function result + ror w6, w6, #22 // Rotate left s=10 bits + movz x9, #0x4314 // Load lower half of constant 0xa3014314 + movk x9, #0xa301, lsl #16 // Load upper half of constant 0xa3014314 + add w15, w4, w6 // Add X parameter round 4 D=II(D, A, B, C, 0xfe2ce6e0, s=10, M[15]) + add w6, w8, w7 // Add dest value + orn x7, x15, x17 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w8, w6, w9 // Add constant 0xa3014314 + eor x9, x4, x7 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w6, w8, w9 // Add aux function result + ror w6, w6, #17 // Rotate left s=15 bits + movz x7, #0x11a1 // Load lower half of constant 0x4e0811a1 + movk x7, #0x4e08, lsl #16 // Load upper half of constant 0x4e0811a1 + add w8, w15, w6 // Add X parameter round 4 C=II(C, D, A, B, 0xa3014314, s=15, M[6]) + orn x9, x8, x4 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w6, w17, w26 // Add dest value + eor x17, x15, x9 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w9, w6, w7 // Add constant 0x4e0811a1 + add w7, w9, w17 // Add aux function result + ror w7, w7, #11 // Rotate left s=21 bits + movz x6, #0x7e82 // Load lower half of constant 0xf7537e82 + movk x6, #0xf753, lsl #16 // Load upper half of constant 0xf7537e82 + add w9, w8, w7 // Add X parameter round 4 B=II(B, C, D, A, 0x4e0811a1, s=21, M[13]) + add w17, w4, w14 // Add dest value + orn x7, x9, x15 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w14, w17, w6 // Add constant 0xf7537e82 + eor x4, x8, x7 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w17, w14, w4 // Add aux function result + ror w17, w17, #26 // Rotate left s=6 bits + movz x6, #0xf235 // Load lower half of constant 0xbd3af235 + movk x6, #0xbd3a, lsl #16 // Load upper half of constant 0xbd3af235 + add w7, w9, w17 // Add X parameter round 4 A=II(A, B, C, D, 0xf7537e82, s=6, M[4]) + orn x14, x7, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w4, w15, w25 // Add 
dest value + eor x17, x9, x14 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w15, w4, w6 // Add constant 0xbd3af235 + add w16, w15, w17 // Add aux function result + ror w16, w16, #22 // Rotate left s=10 bits + movz x14, #0xd2bb // Load lower half of constant 0x2ad7d2bb + movk x14, #0x2ad7, lsl #16 // Load upper half of constant 0x2ad7d2bb + add w4, w7, w16 // Add X parameter round 4 D=II(D, A, B, C, 0xbd3af235, s=10, M[11]) + add w6, w8, w3 // Add dest value + orn x15, x4, x9 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w17, w6, w14 // Add constant 0x2ad7d2bb + eor x16, x7, x15 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w8, w17, w16 // Add aux function result + ror w8, w8, #17 // Rotate left s=15 bits + movz x3, #0xd391 // Load lower half of constant 0xeb86d391 + movk x3, #0xeb86, lsl #16 // Load upper half of constant 0xeb86d391 + add w14, w4, w8 // Add X parameter round 4 C=II(C, D, A, B, 0x2ad7d2bb, s=15, M[2]) + orn x6, x14, x7 // Begin aux function round 4 I(x,y,z)=((~z|x)^y) + add w15, w9, w24 // Add dest value + eor x17, x4, x6 // End aux function round 4 I(x,y,z)=((~z|x)^y) + add w16, w15, w3 // Add constant 0xeb86d391 + add w8, w16, w17 // Add aux function result + ror w8, w8, #11 // Rotate left s=21 bits + ldp w6, w15, [x0] // Reload MD5 state->A and state->B + ldp w5, w9, [x0, #8] // Reload MD5 state->C and state->D + add w3, w14, w8 // Add X parameter round 4 B=II(B, C, D, A, 0xeb86d391, s=21, M[9]) + add w13, w4, w9 // Add result of MD5 rounds to state->D + add w12, w14, w5 // Add result of MD5 rounds to state->C + add w10, w7, w6 // Add result of MD5 rounds to state->A + add w11, w3, w15 // Add result of MD5 rounds to state->B + stp w12, w13, [x0, #8] // Store MD5 states C,D + stp w10, w11, [x0] // Store MD5 states A,B + add x1, x1, #64 // Increment data pointer + subs w2, w2, #1 // Decrement block counter + b.ne ossl_md5_blocks_loop + + ldp x21,x22,[sp,#16] + ldp x23,x24,[sp,#32] + ldp x25,x26,[sp,#48] + ldp x27,x28,[sp,#64] + ldp x19,x20,[sp],#80 + ret + +EOF + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) + +print $code; + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/md5/build.info b/crypto/md5/build.info index a1e28c8153..e8c547842a 100644 --- a/crypto/md5/build.info +++ b/crypto/md5/build.info @@ -4,6 +4,7 @@ $MD5ASM= IF[{- !$disabled{asm} -}] $MD5ASM_x86=md5-586.S $MD5ASM_x86_64=md5-x86_64.s + $MD5ASM_aarch64=md5-aarch64.s $MD5ASM_sparcv9=md5-sparcv9.S # Now that we have defined all the arch specific variables, use the @@ -35,6 +36,7 @@ DEFINE[../../providers/liblegacy.a]=$MD5DEF GENERATE[md5-586.S]=asm/md5-586.pl GENERATE[md5-x86_64.s]=asm/md5-x86_64.pl +GENERATE[md5-aarch64.s]=asm/md5-aarch64.pl GENERATE[md5-sparcv9.S]=asm/md5-sparcv9.pl INCLUDE[md5-sparcv9.o]=.. diff --git a/crypto/md5/md5_local.h b/crypto/md5/md5_local.h index 22a0e0f62a..ca7362fe54 100644 --- a/crypto/md5/md5_local.h +++ b/crypto/md5/md5_local.h @@ -14,7 +14,8 @@ #ifdef MD5_ASM # if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ - defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64) + defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || \ + defined(_M_X64) || defined(__aarch64__) # define md5_block_data_order ossl_md5_block_asm_data_order # elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64) # define md5_block_data_order ossl_md5_block_asm_data_order -- cgit v1.2.3
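Postscript, as a reading aid for the remaining rounds: the generated code uses
the same kind of algebraic shortcut for the other auxiliary functions. Round 2
computes G(x,y,z) = (x & z) | (y & ~z) with an and, a bic and an orr, round 3
computes H(x,y,z) = x ^ y ^ z with two eor instructions, and round 4 computes
I(x,y,z) = (x | ~z) ^ y with orn and eor. After the 64 steps, the epilogue adds
the round results back into the four state words and loops over the next
64-byte block. A hedged C sketch of that outer structure follows; the function
and type names are hypothetical, and the four-word state layout is only what
the ldp/stp offsets against x0 imply, not a copy of OpenSSL's MD5_CTX
definition:

    #include <stddef.h>
    #include <stdint.h>

    /* Auxiliary functions in the forms used by the assembly comments. */
    static inline uint32_t md5_g(uint32_t x, uint32_t y, uint32_t z)
    { return (x & z) | (y & ~z); }              /* bic computes y & ~z */
    static inline uint32_t md5_h(uint32_t x, uint32_t y, uint32_t z)
    { return x ^ y ^ z; }                       /* two eor instructions */
    static inline uint32_t md5_i(uint32_t x, uint32_t y, uint32_t z)
    { return (x | ~z) ^ y; }                    /* orn computes x | ~z */

    /* Hypothetical four-word state, matching the A, B, C, D order loaded
     * from [x0] and [x0, #8] in the assembly. */
    typedef struct { uint32_t A, B, C, D; } md5_state_sketch;

    /* Same contract as the assembly entry point's registers:
     * x0 = state, x1 = input data, x2 = number of 64-byte blocks. */
    static void md5_blocks_sketch(md5_state_sketch *st,
                                  const unsigned char *data, size_t num)
    {
        while (num--) {
            uint32_t a = st->A, b = st->B, c = st->C, d = st->D;
            /* ... the 64 FF/GG/HH/II steps over this block go here ... */
            st->A += a;                 /* "Add result of MD5 rounds"  */
            st->B += b;
            st->C += c;
            st->D += d;
            data += 64;                 /* "Increment data pointer"    */
        }
    }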