author	Andy Polyakov <appro@openssl.org>	2015-05-11 11:43:55 +0200
committer	Andy Polyakov <appro@openssl.org>	2015-05-13 17:59:22 +0200
commit	bb98f6bef66dc423a3736cc9c5e5602933f58c64 (patch)
tree	ec7563404dce463e41ee84020bcb69ac4e8f950c
parent	728b53058ee6f89fa95c0ed3feaa410a85db7323 (diff)
Adapt ARM assembly pack for iOS.
This is achieved by filtering perlasm output through arm-xlate.pl. But note that it's done only if the "flavour" argument is not 'void'. As 'void' is the default value for the other ARM targets, perlasm output is not actually filtered on previously validated platforms.

Reviewed-by: Dr. Stephen Henson <steve@openssl.org>
(cherry picked from commit 874faf2ffb22187ad5483d9691a3a2eb7112f161)
-rw-r--r--	crypto/aes/asm/aes-armv4.pl	31
-rw-r--r--	crypto/bn/asm/armv4-gf2m.pl	23
-rw-r--r--	crypto/bn/asm/armv4-mont.pl	16
-rw-r--r--	crypto/modes/asm/ghash-armv4.pl	33
-rw-r--r--	crypto/sha/asm/sha1-armv4-large.pl	16
-rw-r--r--	crypto/sha/asm/sha256-armv4.pl	16
-rw-r--r--	crypto/sha/asm/sha512-armv4.pl	22
7 files changed, 136 insertions(+), 21 deletions(-)
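
All seven scripts pick up the same argument-handling preamble, shown once here in condensed form (the arm-xlate.pl path lookup is abbreviated, and "ios32" is a hypothetical flavour value used only for illustration):

# The first argument is either an output file name or a "flavour":
# 'void' keeps the pre-existing plain-gas behaviour, while a real
# flavour (e.g. a hypothetical "ios32") engages the translator.
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
	# pipe everything the script prints through arm-xlate.pl, which
	# rewrites the gas-flavoured perlasm output for Apple's assembler
	open STDOUT,"| \"$^X\" arm-xlate.pl $flavour $output";
} else {
	# previously validated ARM targets keep writing straight to file
	open STDOUT,">$output";
}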
diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl
index 55b6e04b67..ed5125827b 100644
--- a/crypto/aes/asm/aes-armv4.pl
+++ b/crypto/aes/asm/aes-armv4.pl
@@ -32,8 +32,20 @@
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~21.5 cycles per byte.
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$s0="r0";
$s1="r1";
@@ -171,7 +183,12 @@ AES_encrypt:
stmdb sp!,{r1,r4-r12,lr}
mov $rounds,r0 @ inp
mov $key,r2
+#ifdef __APPLE__
+ mov $tbl,#AES_encrypt-AES_Te
+ sub $tbl,r3,$tbl @ Te
+#else
sub $tbl,r3,#AES_encrypt-AES_Te @ Te
+#endif
#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
@@ -425,7 +442,12 @@ AES_set_encrypt_key:
bne .Labrt
.Lok: stmdb sp!,{r4-r12,lr}
+#ifdef __APPLE__
+ mov $tbl,#AES_set_encrypt_key-AES_Te-1024
+ sub $tbl,r3,$tbl @ Te4
+#else
sub $tbl,r3,#AES_set_encrypt_key-AES_Te-1024 @ Te4
+#endif
mov $rounds,r0 @ inp
mov lr,r1 @ bits
@@ -886,7 +908,12 @@ AES_decrypt:
stmdb sp!,{r1,r4-r12,lr}
mov $rounds,r0 @ inp
mov $key,r2
+#ifdef __APPLE__
+ mov $tbl,#AES_decrypt-AES_Td
+ sub $tbl,r3,$tbl @ Td
+#else
sub $tbl,r3,#AES_decrypt-AES_Td @ Td
+#endif
#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
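
The three __APPLE__ hunks in this file work around what appears to be the same limitation: Apple's assembler seemingly rejects a symbol-difference expression as the immediate operand of sub, so the offset is first materialized with mov and then subtracted as a register. A perlasm-style sketch of the pattern (the reason given here is an inference, not stated in the commit; r3 is assumed to hold the function's own address at this point, per this file's conventions, and r10 stands in for the table pointer):

$code.=<<___;
#ifdef __APPLE__
	mov	r10,#AES_encrypt-AES_Te		@ materialize the offset first...
	sub	r10,r3,r10			@ ...then subtract: r10 = &AES_Te
#else
	sub	r10,r3,#AES_encrypt-AES_Te	@ gas evaluates the expression inline
#endif
___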
diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl
index c52e0b75b5..737659f0db 100644
--- a/crypto/bn/asm/armv4-gf2m.pl
+++ b/crypto/bn/asm/armv4-gf2m.pl
@@ -21,8 +21,20 @@
# runs in even less cycles, ~30, improvement is measurable only on
# longer keys. One has to optimize code elsewhere to get NEON glow...
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
@@ -170,11 +182,18 @@ bn_GF2m_mul_2x2:
#if __ARM_ARCH__>=7
ldr r12,.LOPENSSL_armcap
.Lpic: ldr r12,[pc,r12]
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
tst r12,#1
beq .Lialu
veor $A1,$A1
+#ifdef __APPLE__
+ vmov $B1,r3,r3 @ two copies of b1
+#else
vmov.32 $B1,r3,r3 @ two copies of b1
+#endif
vmov.32 ${A1}[0],r1 @ a1
veor $A0,$A0
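
Two iOS accommodations appear in this file beyond the preamble. The extra load under __APPLE__ presumably reflects Mach-O reaching OPENSSL_armcap_P through a non-lazy symbol pointer, so the pc-relative load yields the symbol's address rather than its value and one more dereference is needed; and the .32 width suffix is dropped from the core-register vmov, apparently because Apple's assembler rejects that spelling. The capability check, sketched in the style of the code above (the Mach-O explanation is an inference):

$code.=<<___;
	ldr	r12,.LOPENSSL_armcap	@ pc-relative offset word
.Lpic:	ldr	r12,[pc,r12]		@ ELF: loads OPENSSL_armcap_P itself
#ifdef __APPLE__
	ldr	r12,[r12]		@ Mach-O: previous load produced only
					@ the symbol's address, so dereference
#endif
	tst	r12,#1			@ bit 0 is the NEON capability flag
	beq	.Lialu			@ no NEON: take the integer path
___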
diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl
index f78a8b5f0f..aa00f38c2f 100644
--- a/crypto/bn/asm/armv4-mont.pl
+++ b/crypto/bn/asm/armv4-mont.pl
@@ -23,8 +23,20 @@
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$num="r0"; # starts as num argument, but holds &tp[num-1]
$ap="r1";
diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl
index d91586ee29..3799b2b559 100644
--- a/crypto/modes/asm/ghash-armv4.pl
+++ b/crypto/modes/asm/ghash-armv4.pl
@@ -57,8 +57,20 @@
# *native* byte order on current platform. See gcm128.c for working
# example...
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$Xi="r0"; # argument block
$Htbl="r1";
@@ -112,6 +124,11 @@ $code=<<___;
.text
.code 32
+#ifdef __APPLE__
+#define ldrplb ldrbpl
+#define ldrneb ldrbne
+#endif
+
.type rem_4bit,%object
.align 5
rem_4bit:
@@ -326,9 +343,9 @@ $code.=<<___;
.align 4
gcm_gmult_neon:
sub $Htbl,#16 @ point at H in GCM128_CTX
- vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
+ vld1.64 `&Dhi("$IN")`,[$Xi]! @ load Xi
vmov.i32 $mod,#0xe1 @ our irreducible polynomial
- vld1.64 `&Dlo("$IN")`,[$Xi,:64]!
+ vld1.64 `&Dlo("$IN")`,[$Xi]!
vshr.u64 $mod,#32
vldmia $Htbl,{$Hhi-$Hlo} @ load H
veor $zero,$zero
@@ -349,9 +366,9 @@ gcm_gmult_neon:
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
- vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
+ vld1.64 `&Dhi("$Z")`,[$Xi]! @ load Xi
vmov.i32 $mod,#0xe1 @ our irreducible polynomial
- vld1.64 `&Dlo("$Z")`,[$Xi,:64]!
+ vld1.64 `&Dlo("$Z")`,[$Xi]!
vshr.u64 $mod,#32
vldmia $Xi,{$Hhi-$Hlo} @ load H
veor $zero,$zero
@@ -410,8 +427,8 @@ gcm_ghash_neon:
vrev64.8 $Z,$Z
#endif
sub $Xi,#16
- vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
- vst1.64 `&Dlo("$Z")`,[$Xi,:64]
+ vst1.64 `&Dhi("$Z")`,[$Xi]! @ write out Xi
+ vst1.64 `&Dlo("$Z")`,[$Xi]
bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon
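
Two syntax accommodations here. The #define pair maps the legacy gas spelling, with the condition code before the byte suffix (ldrplb), onto the unified-syntax spelling Apple's assembler expects (ldrbpl). The NEON hunks drop the ,:64 alignment hints from the vld1.64/vst1.64 addresses, apparently because Apple's assembler does not accept that form; the accesses still work, just without the alignment assertion. A sketch, with explanatory comments kept in Perl so the macro bodies stay clean:

# Unified syntax puts the byte-width suffix before the condition code
# ("ldrbpl"), while classic gas accepted "ldrplb"; alias one spelling
# to the other when targeting Apple's assembler.
$code.=<<___;
#ifdef __APPLE__
#define ldrplb ldrbpl
#define ldrneb ldrbne
#endif
___
# Alignment-hinted form accepted by gas but not by Apple's assembler:
#	vld1.64	d1,[r0,:64]!
# Portable form used above, post-incrementing r0, no alignment hint:
$code.=<<___;
	vld1.64	d1,[r0]!		@ load one 64-bit half of Xi
___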
diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl
index 33da3e0e3c..6c0adb9911 100644
--- a/crypto/sha/asm/sha1-armv4-large.pl
+++ b/crypto/sha/asm/sha1-armv4-large.pl
@@ -52,8 +52,20 @@
# Profiler-assisted and platform-specific optimization resulted in 10%
# improvement on Cortex A8 core and 12.2 cycles per byte.
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$ctx="r0";
$inp="r1";
diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl
index 9c84e8d93c..252a583d06 100644
--- a/crypto/sha/asm/sha256-armv4.pl
+++ b/crypto/sha/asm/sha256-armv4.pl
@@ -23,8 +23,20 @@
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~17 cycles per processed byte.
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$ctx="r0"; $t0="r0";
$inp="r1"; $t3="r1";
diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl
index 7faf37b147..c032afdbca 100644
--- a/crypto/sha/asm/sha512-armv4.pl
+++ b/crypto/sha/asm/sha512-armv4.pl
@@ -38,8 +38,20 @@ $hi="HI";
$lo="LO";
# ====================================================================
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$ctx="r0"; # parameter block
$inp="r1";
@@ -221,17 +233,21 @@ WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-sha512_block_data_order
+.word OPENSSL_armcap_P-.Lsha512_block_data_order
.skip 32-4
.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
+.Lsha512_block_data_order:
sub r3,pc,#8 @ sha512_block_data_order
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
#if __ARM_ARCH__>=7
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
tst r12,#1
bne .LNEON
#endif
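
Beyond the armcap dereference already seen in armv4-gf2m.pl, this file switches the offset word to a local label. Presumably Mach-O cannot resolve an assembly-time difference against the externally visible sha512_block_data_order symbol, so a file-local alias at the same address keeps the subtraction constant for both object formats (this explanation is an inference from the change, not stated in the commit). Condensed:

$code.=<<___;
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lsha512_block_data_order

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:	@ local alias at the same address; the
				@ difference above is now a constant
	sub	r3,pc,#8	@ r3 = &sha512_block_data_order
___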