-rw-r--r--  crypto/aes/asm/aes-armv4.pl        | 31
-rw-r--r--  crypto/bn/asm/armv4-gf2m.pl        | 23
-rw-r--r--  crypto/bn/asm/armv4-mont.pl        | 16
-rw-r--r--  crypto/modes/asm/ghash-armv4.pl    | 33
-rw-r--r--  crypto/sha/asm/sha1-armv4-large.pl | 16
-rw-r--r--  crypto/sha/asm/sha256-armv4.pl     | 16
-rw-r--r--  crypto/sha/asm/sha512-armv4.pl     | 22
7 files changed, 136 insertions(+), 21 deletions(-)
diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl
index 55b6e04b67..ed5125827b 100644
--- a/crypto/aes/asm/aes-armv4.pl
+++ b/crypto/aes/asm/aes-armv4.pl
@@ -32,8 +32,20 @@
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~21.5 cycles per byte.
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$s0="r0";
$s1="r1";
@@ -171,7 +183,12 @@ AES_encrypt:
stmdb sp!,{r1,r4-r12,lr}
mov $rounds,r0 @ inp
mov $key,r2
+#ifdef __APPLE__
+ mov $tbl,#AES_encrypt-AES_Te
+ sub $tbl,r3,$tbl @ Te
+#else
sub $tbl,r3,#AES_encrypt-AES_Te @ Te
+#endif
#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
@@ -425,7 +442,12 @@ AES_set_encrypt_key:
bne .Labrt
.Lok: stmdb sp!,{r4-r12,lr}
+#ifdef __APPLE__
+ mov $tbl,#AES_set_encrypt_key-AES_Te-1024
+ sub $tbl,r3,$tbl @ Te4
+#else
sub $tbl,r3,#AES_set_encrypt_key-AES_Te-1024 @ Te4
+#endif
mov $rounds,r0 @ inp
mov lr,r1 @ bits
@@ -886,7 +908,12 @@ AES_decrypt:
stmdb sp!,{r1,r4-r12,lr}
mov $rounds,r0 @ inp
mov $key,r2
+#ifdef __APPLE__
+ mov $tbl,#AES_decrypt-AES_Td
+ sub $tbl,r3,$tbl @ Td
+#else
sub $tbl,r3,#AES_decrypt-AES_Td @ Td
+#endif
#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
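
The three __APPLE__ hunks above all work around the same limitation: Apple's assembler apparently rejects a symbol-difference expression as the immediate operand of sub, while accepting it in mov, so the constant is materialized in the destination register first and then subtracted. A hedged perlasm sketch of the idiom, with placeholder symbols foo and bar that are not part of the patch:

$code.=<<___;
#ifdef __APPLE__
	mov	r10,#foo-bar		@ materialize the difference first
	sub	r10,r3,r10		@ r10 = r3 - (foo - bar)
#else
	sub	r10,r3,#foo-bar		@ GNU as folds the expression inline
#endif
___
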
diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl
index c52e0b75b5..737659f0db 100644
--- a/crypto/bn/asm/armv4-gf2m.pl
+++ b/crypto/bn/asm/armv4-gf2m.pl
@@ -21,8 +21,20 @@
# runs in even less cycles, ~30, improvement is measurable only on
# longer keys. One has to optimize code elsewhere to get NEON glow...
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
@@ -170,11 +182,18 @@ bn_GF2m_mul_2x2:
#if __ARM_ARCH__>=7
ldr r12,.LOPENSSL_armcap
.Lpic: ldr r12,[pc,r12]
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
tst r12,#1
beq .Lialu
veor $A1,$A1
+#ifdef __APPLE__
+ vmov $B1,r3,r3 @ two copies of b1
+#else
vmov.32 $B1,r3,r3 @ two copies of b1
+#endif
vmov.32 ${A1}[0],r1 @ a1
veor $A0,$A0
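
Two further Apple quirks show up in this hunk. Under __APPLE__ the armcap literal evidently resolves to the address of a pointer to OPENSSL_armcap_P rather than to the variable itself (my reading: Mach-O routes the reference through an indirect symbol pointer), hence the extra load. Separately, Apple's assembler refuses the .32 width suffix on the two-GPR form of vmov, where the suffix is redundant anyway. A commented sketch of the capability check as it now executes on both platforms (register roles copied from the hunk, comments are mine):

$code.=<<___;
	ldr	r12,.LOPENSSL_armcap	@ r12 = offset stored in the literal pool
.Lpic:	ldr	r12,[pc,r12]		@ ELF: loads OPENSSL_armcap_P directly
#ifdef __APPLE__
	ldr	r12,[r12]		@ Mach-O: one more hop through the
					@ indirection (assumed non-lazy pointer)
#endif
	tst	r12,#1			@ bit 0 advertises NEON
	beq	.Lialu			@ no NEON: take the integer-only path
___
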
diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl
index f78a8b5f0f..aa00f38c2f 100644
--- a/crypto/bn/asm/armv4-mont.pl
+++ b/crypto/bn/asm/armv4-mont.pl
@@ -23,8 +23,20 @@
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$num="r0"; # starts as num argument, but holds &tp[num-1]
$ap="r1";
diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl
index d91586ee29..3799b2b559 100644
--- a/crypto/modes/asm/ghash-armv4.pl
+++ b/crypto/modes/asm/ghash-armv4.pl
@@ -57,8 +57,20 @@
# *native* byte order on current platform. See gcm128.c for working
# example...
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$Xi="r0"; # argument block
$Htbl="r1";
@@ -112,6 +124,11 @@ $code=<<___;
.text
.code 32
+#ifdef __APPLE__
+#define ldrplb ldrbpl
+#define ldrneb ldrbne
+#endif
+
.type rem_4bit,%object
.align 5
rem_4bit:
@@ -326,9 +343,9 @@ $code.=<<___;
.align 4
gcm_gmult_neon:
sub $Htbl,#16 @ point at H in GCM128_CTX
- vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
+ vld1.64 `&Dhi("$IN")`,[$Xi]! @ load Xi
vmov.i32 $mod,#0xe1 @ our irreducible polynomial
- vld1.64 `&Dlo("$IN")`,[$Xi,:64]!
+ vld1.64 `&Dlo("$IN")`,[$Xi]!
vshr.u64 $mod,#32
vldmia $Htbl,{$Hhi-$Hlo} @ load H
veor $zero,$zero
@@ -349,9 +366,9 @@ gcm_gmult_neon:
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
- vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
+ vld1.64 `&Dhi("$Z")`,[$Xi]! @ load Xi
vmov.i32 $mod,#0xe1 @ our irreducible polynomial
- vld1.64 `&Dlo("$Z")`,[$Xi,:64]!
+ vld1.64 `&Dlo("$Z")`,[$Xi]!
vshr.u64 $mod,#32
vldmia $Xi,{$Hhi-$Hlo} @ load H
veor $zero,$zero
@@ -410,8 +427,8 @@ gcm_ghash_neon:
vrev64.8 $Z,$Z
#endif
sub $Xi,#16
- vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
- vst1.64 `&Dlo("$Z")`,[$Xi,:64]
+ vst1.64 `&Dhi("$Z")`,[$Xi]! @ write out Xi
+ vst1.64 `&Dlo("$Z")`,[$Xi]
bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon
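
The ghash changes are syntax accommodation rather than behaviour changes. The #define pair maps the pre-unified mnemonic order (condition before width: ldrplb) onto the unified order Apple's assembler insists on (width before condition: ldrbpl). The removed ,:64 annotations are alignment hints on vld1/vst1 that Apple's assembler does not appear to parse; dropping them forfeits only a hint, not correctness. A before/after sketch under those assumptions (d0 and r0 are placeholders):

$code.=<<___;
	vld1.64	{d0},[r0,:64]!	@ GNU as: promise that r0 is 64-bit aligned
	vld1.64	{d0},[r0]!	@ Apple as: identical load, hint dropped
___
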
diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl
index 33da3e0e3c..6c0adb9911 100644
--- a/crypto/sha/asm/sha1-armv4-large.pl
+++ b/crypto/sha/asm/sha1-armv4-large.pl
@@ -52,8 +52,20 @@
# Profiler-assisted and platform-specific optimization resulted in 10%
# improvement on Cortex A8 core and 12.2 cycles per byte.
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$ctx="r0";
$inp="r1";
diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl
index 9c84e8d93c..252a583d06 100644
--- a/crypto/sha/asm/sha256-armv4.pl
+++ b/crypto/sha/asm/sha256-armv4.pl
@@ -23,8 +23,20 @@
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~17 cycles per processed byte.
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$ctx="r0"; $t0="r0";
$inp="r1"; $t3="r1";
diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl
index 7faf37b147..c032afdbca 100644
--- a/crypto/sha/asm/sha512-armv4.pl
+++ b/crypto/sha/asm/sha512-armv4.pl
@@ -38,8 +38,20 @@ $hi="HI";
$lo="LO";
# ====================================================================
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$ctx="r0"; # parameter block
$inp="r1";
@@ -221,17 +233,21 @@ WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-sha512_block_data_order
+.word OPENSSL_armcap_P-.Lsha512_block_data_order
.skip 32-4
.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
+.Lsha512_block_data_order:
sub r3,pc,#8 @ sha512_block_data_order
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
#if __ARM_ARCH__>=7
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
tst r12,#1
bne .LNEON
#endif
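
The subtlest change sits in this last hunk: the .word now subtracts the local label .Lsha512_block_data_order instead of the global symbol at the same address. The likely reason (an assumption on my part) is that Mach-O assemblers cannot always fold a difference involving a global symbol at assembly time, since globals remain relocatable, while a .L-prefixed local label is resolved in place. The runtime arithmetic is unchanged, as this sketch of the prologue's address computation shows (instructions copied from the code above, comments are mine):

$code.=<<___;
	sub	r3,pc,#8		@ r3 = address of this instruction,
					@ i.e. the function entry point
	ldr	r12,.LOPENSSL_armcap	@ r12 = armcap_P - entry, a link-time delta
	ldr	r12,[r3,r12]		@ load from entry+delta, the address
					@ of OPENSSL_armcap_P
#ifdef __APPLE__
	ldr	r12,[r12]		@ Mach-O interposes one more pointer
#endif
___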