SF.net SVN: gar:[25050] csw/mgar/pkg/openssl1/trunk/files
janholzh at users.sourceforge.net
Tue Jun 2 09:41:27 CEST 2015
Revision: 25050
http://sourceforge.net/p/gar/code/25050
Author: janholzh
Date: 2015-06-02 07:41:26 +0000 (Tue, 02 Jun 2015)
Log Message:
-----------
openssl1/trunk: update more patches
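
The substantive change is in the sparcv9cap.c hunks below: the updated
t4-engine patch drops the old sigsetjmp/signal-trap capability probes
(_sparcv9_vis1_probe and friends) in favour of querying the Solaris
getisax(2) aux vector. A minimal sketch of that detection style, assuming
the Solaris AV_SPARC_* constants and the SPARCV9_* bit values from the
sparc_arch.h hunk further down — illustrative only, not part of the patch:

    #include <sys/types.h>
    #include <sys/auxv.h>

    /* Hypothetical stand-in for OPENSSL_sparcv9cap_P[2]; bit values
     * taken from the sparc_arch.h hunk in this patch. */
    static unsigned int cap[2];

    #define SPARCV9_VIS1 (1 << 2)
    #define SPARCV9_VIS2 (1 << 3)
    #define SPARCV9_VIS3 (1 << 6)

    void probe_sparc_caps(void)
    {
        uint_t ui = 0;

        (void) getisax(&ui, 1);   /* first word of AV_SPARC_* bits */

        /* Simplified: the real hunk also times _sparcv9_vis1_instrument()
         * before trusting VIS1, and only sets VIS2 beneath VIS1. */
        if (ui & AV_SPARC_VIS)
            cap[0] |= SPARCV9_VIS1;
        if (ui & AV_SPARC_VIS2)
            cap[0] |= SPARCV9_VIS2;
        if (ui & AV_SPARC_VIS3)
            cap[0] |= SPARCV9_VIS3;
    }

The actual hunk goes further and gates the _sparcv9_rdcfr() read of the
T4 compatibility feature register behind both SPARCV9_VIS3 and the
AV_T4_MECHS mask, as shown in the diff below.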
Modified Paths:
--------------
csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-t4-engine.sparc.5.11.patch
csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-wanboot.patch
Modified: csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-t4-engine.sparc.5.11.patch
===================================================================
--- csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-t4-engine.sparc.5.11.patch 2015-06-02 06:06:13 UTC (rev 25049)
+++ csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-t4-engine.sparc.5.11.patch 2015-06-02 07:41:26 UTC (rev 25050)
@@ -13,9 +13,9 @@
my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void";
-my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
+my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o:des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
+ my $fips_sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void";
my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void";
- my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::";
Index: crypto/sparccpuid.S
===================================================================
diff -ru openssl-1.0.1e/crypto/sparccpuid.S openssl-1.0.1e/crypto/sparccpuid.S
@@ -29,20 +29,7 @@
#if defined(__SUNPRO_C) && defined(__sparcv9)
# define ABI64 /* They've said -xarch=v9 at command line */
#elif defined(__GNUC__) && defined(__arch64__)
-@@ -235,10 +239,10 @@
- .global _sparcv9_vis1_probe
- .align 8
- _sparcv9_vis1_probe:
-+ .word 0x81b00d80 !fxor %f0,%f0,%f0
- add %sp,BIAS+2,%o1
-- .word 0xc19a5a40 !ldda [%o1]ASI_FP16_P,%f0
- retl
-- .word 0x81b00d80 !fxor %f0,%f0,%f0
-+ .word 0xc19a5a40 !ldda [%o1]ASI_FP16_P,%f0
- .type _sparcv9_vis1_probe,#function
- .size _sparcv9_vis1_probe,.-_sparcv9_vis1_probe
-
-@@ -251,7 +255,12 @@
+@@ -241,7 +245,12 @@
! UltraSPARC IIe 7
! UltraSPARC III 7
! UltraSPARC T1 24
@@ -55,7 +42,7 @@
! Numbers for T2 and SPARC64 V-VII are more than welcomed.
!
! It would be possible to detect specifically US-T1 by instrumenting
-@@ -260,6 +269,8 @@
+@@ -250,6 +259,8 @@
.global _sparcv9_vis1_instrument
.align 8
_sparcv9_vis1_instrument:
@@ -64,9 +51,9 @@
.word 0x91410000 !rd %tick,%o0
.word 0x81b00d80 !fxor %f0,%f0,%f0
.word 0x85b08d82 !fxor %f2,%f2,%f2
-@@ -314,6 +325,30 @@
- .type _sparcv9_fmadd_probe,#function
- .size _sparcv9_fmadd_probe,.-_sparcv9_fmadd_probe
+@@ -286,6 +297,30 @@
+ .type _sparcv9_vis1_instrument,#function
+ .size _sparcv9_vis1_instrument,.-_sparcv9_vis1_instrument
+.global _sparcv9_rdcfr
+.align 8
@@ -95,7 +82,7 @@
.global OPENSSL_cleanse
.align 32
OPENSSL_cleanse:
-@@ -398,6 +433,102 @@
+@@ -370,6 +405,102 @@
.size OPENSSL_cleanse,.-OPENSSL_cleanse
#ifndef _BOOT
@@ -203,18 +190,20 @@
diff -ru openssl-1.0.1e/crypto/sparcv9cap.c openssl-1.0.1e/crypto/sparcv9cap.c
--- openssl-1.0.1e/crypto/sparcv9cap.c 2011-05-24 17:02:24.000000000 -0700
+++ openssl-1.0.1e/crypto/sparcv9cap.c 2011-07-27 10:48:17.817470000 -0700
-@@ -4,34 +4,58 @@
+@@ -3,36 +3,59 @@
+ #include <string.h>
#include <setjmp.h>
- #include <signal.h>
#include <sys/time.h>
+#include <unistd.h>
#include <openssl/bn.h>
+ #include <sys/auxv.h>
-#define SPARCV9_TICK_PRIVILEGED (1<<0)
-#define SPARCV9_PREFER_FPU (1<<1)
-#define SPARCV9_VIS1 (1<<2)
-#define SPARCV9_VIS2 (1<<3) /* reserved */
-#define SPARCV9_FMADD (1<<4) /* reserved for SPARC64 V */
+-#define SPARCV9_BLK (1<<5)
+#include "sparc_arch.h"
+#if defined(__GNUC__) && defined(__linux)
@@ -275,13 +264,11 @@
}
unsigned long _sparcv9_rdtick(void);
-@@ -39,11 +63,18 @@
+@@ -37,11 +60,16 @@
+
+ unsigned long _sparcv9_rdtick(void);
unsigned long _sparcv9_vis1_instrument(void);
- void _sparcv9_vis2_probe(void);
- void _sparcv9_fmadd_probe(void);
+unsigned long _sparcv9_rdcfr(void);
-+void _sparcv9_vis3_probe(void);
-+unsigned long _sparcv9_random(void);
+#ifndef _BOOT
+size_t _sparcv9_vis1_instrument_bus(unsigned int *,size_t);
+size_t _sparcv9_vis1_instrument_bus2(unsigned int *,size_t,size_t);
@@ -295,7 +282,7 @@
#if defined(__sun) && defined(__SVR4)
return gethrtime();
#else
-@@ -52,6 +83,24 @@
+@@ -50,6 +80,24 @@
else
return _sparcv9_rdtick();
}
@@ -320,7 +307,7 @@
#endif
#if defined(_BOOT)
-@@ -61,7 +110,7 @@
+@@ -59,7 +107,7 @@
*/
void OPENSSL_cpuid_setup(void)
{
@@ -329,7 +316,7 @@
}
#elif 0 && defined(__sun) && defined(__SVR4)
-@@ -90,11 +139,11 @@
+@@ -88,11 +136,11 @@
if (!strcmp(name, "SUNW,UltraSPARC") ||
/* covers II,III,IV */
!strncmp(name, "SUNW,UltraSPARC-I", 17)) {
@@ -343,7 +330,7 @@
return DI_WALK_TERMINATE;
}
-@@ -100,7 +149,7 @@
+@@ -98,7 +146,7 @@
}
/* This is expected to catch remaining UltraSPARCs, such as T1 */
else if (!strncmp(name, "SUNW,UltraSPARC", 15)) {
@@ -352,7 +339,7 @@
return DI_WALK_TERMINATE;
}
-@@ -119,7 +168,7 @@
+@@ -117,7 +165,7 @@
trigger = 1;
if ((e = getenv("OPENSSL_sparcv9cap"))) {
@@ -361,7 +348,7 @@
return;
}
-@@ -126,15 +175,15 @@
+@@ -124,15 +172,15 @@
if (sysinfo(SI_MACHINE, si, sizeof(si)) > 0) {
if (strcmp(si, "sun4v"))
/* FPU is preferred for all CPUs, but US-T1/2 */
@@ -381,7 +368,7 @@
return;
}
}
-@@ -204,12 +253,14 @@
+@@ -195,7 +241,9 @@
trigger = 1;
if ((e = getenv("OPENSSL_sparcv9cap"))) {
@@ -392,73 +379,48 @@
return;
}
+@@ -202,21 +250,48 @@
+ (void) getisax(&ui, 1);
+
/* Initial value, fits UltraSPARC-I&II... */
-- OPENSSL_sparcv9cap_P = SPARCV9_PREFER_FPU | SPARCV9_TICK_PRIVILEGED;
-+ OPENSSL_sparcv9cap_P[0] = SPARCV9_PREFER_FPU | SPARCV9_TICK_PRIVILEGED;
+- OPENSSL_sparcv9cap_P = SPARCV9_BLK;
++ OPENSSL_sparcv9cap_P[0] = SPARCV9_BLK;
- sigfillset(&all_masked);
- sigdelset(&all_masked, SIGILL);
-@@ -232,18 +283,18 @@
-
- if (sigsetjmp(common_jmp, 1) == 0) {
- _sparcv9_rdtick();
-- OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED;
-+ OPENSSL_sparcv9cap_P[0] &= ~SPARCV9_TICK_PRIVILEGED;
+ if (ui & AV_SPARC_VIS) {
+- /* detect UltraSPARC-Tx, see sparccpuid.S for details... */
++ /* detect UltraSPARC-Tx, see sparccpud.S for details... */
+ if (_sparcv9_vis1_instrument() < 7)
+- OPENSSL_sparcv9cap_P |= SPARCV9_TICK_PRIVILEGED;
++ OPENSSL_sparcv9cap_P[0] |= SPARCV9_TICK_PRIVILEGED;
+ if (_sparcv9_vis1_instrument() < 12) {
+- OPENSSL_sparcv9cap_P |= SPARCV9_VIS1|SPARCV9_PREFER_FPU;
++ OPENSSL_sparcv9cap_P[0] |= (SPARCV9_VIS1 | SPARCV9_PREFER_FPU);
+ if (ui & AV_SPARC_VIS2)
+- OPENSSL_sparcv9cap_P |= SPARCV9_VIS2;
+- }
++ OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS2;
++ }
}
- if (sigsetjmp(common_jmp, 1) == 0) {
- _sparcv9_vis1_probe();
-- OPENSSL_sparcv9cap_P |= SPARCV9_VIS1;
-+ OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS1 | SPARCV9_BLK;
- /* detect UltraSPARC-Tx, see sparccpud.S for details... */
- if (_sparcv9_vis1_instrument() >= 12)
-- OPENSSL_sparcv9cap_P &= ~(SPARCV9_VIS1 | SPARCV9_PREFER_FPU);
-+ OPENSSL_sparcv9cap_P[0] &= ~(SPARCV9_VIS1 | SPARCV9_PREFER_FPU);
- else {
- _sparcv9_vis2_probe();
-- OPENSSL_sparcv9cap_P |= SPARCV9_VIS2;
-+ OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS2;
- }
- }
-
-@@ -249,13 +300,50 @@
-
- if (sigsetjmp(common_jmp, 1) == 0) {
- _sparcv9_fmadd_probe();
+ if (ui & AV_SPARC_FMAF)
- OPENSSL_sparcv9cap_P |= SPARCV9_FMADD;
+ OPENSSL_sparcv9cap_P[0] |= SPARCV9_FMADD;
- }
-
++
+ /*
+ * VIS3 flag is tested independently from VIS1, unlike VIS2 that is,
+ * because VIS3 defines even integer instructions.
+ */
-+ if (sigsetjmp(common_jmp,1) == 0) {
-+ _sparcv9_vis3_probe();
-+ OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS3;
-+ }
++ if (ui & AV_SPARC_VIS3)
++ OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS3;
+
-+ if (sigsetjmp(common_jmp,1) == 0) {
-+ (void)_sparcv9_random();
-+ OPENSSL_sparcv9cap_P[0] |= SPARCV9_RANDOM;
-+ }
++#define AV_T4_MECHS (AV_SPARC_AES | AV_SPARC_DES | AV_SPARC_KASUMI | \
++ AV_SPARC_CAMELLIA | AV_SPARC_MD5 | AV_SPARC_SHA1 | \
++ AV_SPARC_SHA256 | AV_SPARC_SHA512 | AV_SPARC_MPMUL | \
++ AV_SPARC_CRC32C)
+
-+ /*
-+ * In wait for better solution _sparcv9_rdcfr is masked by
-+ * VIS3 flag, because it goes to uninterruptable endless
-+ * loop on UltraSPARC II running Solaris. Things might be
-+ * different on Linux...
-+ */
-+ if ((OPENSSL_sparcv9cap_P[0]&SPARCV9_VIS3) &&
-+ sigsetjmp(common_jmp, 1) == 0) {
++ if ((OPENSSL_sparcv9cap_P[0]&SPARCV9_VIS3) && (ui & AV_T4_MECHS))
+ OPENSSL_sparcv9cap_P[1] = (unsigned int)_sparcv9_rdcfr();
-+ }
+
- sigaction(SIGBUS, &bus_oact, NULL);
- sigaction(SIGILL, &ill_oact, NULL);
-
- sigprocmask(SIG_SETMASK, &oset, NULL);
-+
+ if (sizeof(size_t) == 8)
+ OPENSSL_sparcv9cap_P[0] |= SPARCV9_64BIT_STACK;
+#ifdef __linux
@@ -2265,5563 +2227,3 @@
{ERR_FUNC(EVP_F_AES_XTS), "AES_XTS"},
{ERR_FUNC(EVP_F_AES_XTS_CIPHER), "AES_XTS_CIPHER"},
{ERR_FUNC(EVP_F_ALG_MODULE_INIT), "ALG_MODULE_INIT"},
-Index: crypto/sparc_arch.h
-===================================================================
-diff -uNr openssl-1.0.1m/crypto/sparc_arch.h openssl-1.0.1m/crypto/sparc_arch.h
---- openssl-1.0.1m/crypto/sparc_arch.h 1970-01-01 01:00:00.000000000 +0100
-+++ openssl-1.0.1m/crypto/sparc_arch.h 2015-03-21 16:27:38.578043100 +0100
-@@ -0,0 +1,101 @@
-+#ifndef __SPARC_ARCH_H__
-+#define __SPARC_ARCH_H__
-+
-+#define SPARCV9_TICK_PRIVILEGED (1<<0)
-+#define SPARCV9_PREFER_FPU (1<<1)
-+#define SPARCV9_VIS1 (1<<2)
-+#define SPARCV9_VIS2 (1<<3) /* reserved */
-+#define SPARCV9_FMADD (1<<4) /* reserved for SPARC64 V */
-+#define SPARCV9_BLK (1<<5) /* VIS1 block copy */
-+#define SPARCV9_VIS3 (1<<6)
-+#define SPARCV9_RANDOM (1<<7)
-+#define SPARCV9_64BIT_STACK (1<<8)
-+
-+/*
-+ * OPENSSL_sparcv9cap_P[1] is copy of Compatibility Feature Register,
-+ * %asr26, SPARC-T4 and later. There is no SPARCV9_CFR bit in
-+ * OPENSSL_sparcv9cap_P[0], as %cfr copy is sufficient...
-+ */
-+#define CFR_AES 0x00000001 /* Supports AES opcodes */
-+#define CFR_DES 0x00000002 /* Supports DES opcodes */
-+#define CFR_KASUMI 0x00000004 /* Supports KASUMI opcodes */
-+#define CFR_CAMELLIA 0x00000008 /* Supports CAMELLIA opcodes */
-+#define CFR_MD5 0x00000010 /* Supports MD5 opcodes */
-+#define CFR_SHA1 0x00000020 /* Supports SHA1 opcodes */
-+#define CFR_SHA256 0x00000040 /* Supports SHA256 opcodes */
-+#define CFR_SHA512 0x00000080 /* Supports SHA512 opcodes */
-+#define CFR_MPMUL 0x00000100 /* Supports MPMUL opcodes */
-+#define CFR_MONTMUL 0x00000200 /* Supports MONTMUL opcodes */
-+#define CFR_MONTSQR 0x00000400 /* Supports MONTSQR opcodes */
-+#define CFR_CRC32C 0x00000800 /* Supports CRC32C opcodes */
-+
-+#if defined(OPENSSL_PIC) && !defined(__PIC__)
-+#define __PIC__
-+#endif
-+
-+#if defined(__SUNPRO_C) && defined(__sparcv9) && !defined(__arch64__)
-+#define __arch64__
-+#endif
-+
-+#define SPARC_PIC_THUNK(reg) \
-+ .align 32; \
-+.Lpic_thunk: \
-+ jmp %o7 + 8; \
-+ add %o7, reg, reg;
-+
-+#define SPARC_PIC_THUNK_CALL(reg) \
-+ sethi %hi(_GLOBAL_OFFSET_TABLE_-4), reg; \
-+ call .Lpic_thunk; \
-+ or reg, %lo(_GLOBAL_OFFSET_TABLE_+4), reg;
-+
-+#if 1
-+#define SPARC_SETUP_GOT_REG(reg) SPARC_PIC_THUNK_CALL(reg)
-+#else
-+#define SPARC_SETUP_GOT_REG(reg) \
-+ sethi %hi(_GLOBAL_OFFSET_TABLE_-4), reg; \
-+ call .+8; \
-+ or reg, %lo(_GLOBAL_OFFSET_TABLE_+4), reg; \
-+ add %o7, reg, reg
-+#endif
-+
-+#if defined(__arch64__)
-+
-+#define SPARC_LOAD_ADDRESS(SYM, reg) \
-+ setx SYM, %o7, reg;
-+#define LDPTR ldx
-+#define SIZE_T_CC %xcc
-+#define STACK_FRAME 192
-+#define STACK_BIAS 2047
-+#define STACK_7thARG (STACK_BIAS+176)
-+
-+#else
-+
-+#define SPARC_LOAD_ADDRESS(SYM, reg) \
-+ set SYM, reg;
-+#define LDPTR ld
-+#define SIZE_T_CC %icc
-+#define STACK_FRAME 112
-+#define STACK_BIAS 0
-+#define STACK_7thARG 92
-+#define SPARC_LOAD_ADDRESS_LEAF(SYM, reg, tmp) SPARC_LOAD_ADDRESS(SYM, reg)
-+
-+#endif
-+
-+#ifdef __PIC__
-+#undef SPARC_LOAD_ADDRESS
-+#undef SPARC_LOAD_ADDRESS_LEAF
-+#define SPARC_LOAD_ADDRESS(SYM, reg) \
-+ SPARC_SETUP_GOT_REG(reg); \
-+ sethi %hi(SYM), %o7; \
-+ or %o7, %lo(SYM), %o7; \
-+ LDPTR [reg + %o7], reg;
-+#endif
-+
-+#ifndef SPARC_LOAD_ADDRESS_LEAF
-+#define SPARC_LOAD_ADDRESS_LEAF(SYM, reg, tmp) \
-+ mov %o7, tmp; \
-+ SPARC_LOAD_ADDRESS(SYM, reg) \
-+ mov tmp, %o7;
-+#endif
-+
-+#endif /* __SPARC_ARCH_H__ */
-Index: crypto/md5/asm/md5-sparcv9.pl
-===================================================================
-diff -uNr openssl-1.0.1m/crypto/md5/asm/md5-sparcv9.pl openssl-1.0.1m/crypto/md5/asm/md5-sparcv9.pl
---- openssl-1.0.1m/crypto/md5/asm/md5-sparcv9.pl 1970-01-01 01:00:00.000000000 +0100
-+++ openssl-1.0.1m/crypto/md5/asm/md5-sparcv9.pl 2015-03-21 16:27:38.578043100 +0100
-@@ -0,0 +1,434 @@
-+#!/usr/bin/env perl
-+
-+# ====================================================================
-+# Written by Andy Polyakov <appro at openssl.org> for the OpenSSL
-+# project. The module is, however, dual licensed under OpenSSL and
-+# CRYPTOGAMS licenses depending on where you obtain it. For further
-+# details see http://www.openssl.org/~appro/cryptogams/.
-+#
-+# Hardware SPARC T4 support by David S. Miller <davem at davemloft.net>.
-+# ====================================================================
-+
-+# MD5 for SPARCv9, 6.9 cycles per byte on UltraSPARC, >40% faster than
-+# code generated by Sun C 5.2.
-+
-+# SPARC T4 MD5 hardware achieves 3.20 cycles per byte, which is 2.1x
-+# faster than software. Multi-process benchmark saturates at 12x
-+# single-process result on 8-core processor, or ~11GBps per 2.85GHz
-+# socket.
-+
-+$bits=32;
-+for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
-+if ($bits==64) { $bias=2047; $frame=192; }
-+else { $bias=0; $frame=112; }
-+
-+$output=shift;
-+open STDOUT,">$output";
-+
-+use integer;
-+
-+($ctx,$inp,$len)=("%i0","%i1","%i2"); # input arguments
-+
-+# 64-bit values
-+@X=("%o0","%o1","%o2","%o3","%o4","%o5","%o7","%g1","%g2");
-+$tx="%g3";
-+($AB,$CD)=("%g4","%g5");
-+
-+# 32-bit values
-+@V=($A,$B,$C,$D)=map("%l$_",(0..3));
-+($t1,$t2,$t3,$saved_asi)=map("%l$_",(4..7));
-+($shr,$shl1,$shl2)=("%i3","%i4","%i5");
-+
-+my @K=( 0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
-+ 0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
-+ 0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
-+ 0x6b901122,0xfd987193,0xa679438e,0x49b40821,
-+
-+ 0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
-+ 0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
-+ 0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
-+ 0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,
-+
-+ 0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
-+ 0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
-+ 0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
-+ 0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,
-+
-+ 0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
-+ 0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
-+ 0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
-+ 0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391, 0 );
-+
-+sub R0 {
-+ my ($i,$a,$b,$c,$d) = @_;
-+ my $rot = (7,12,17,22)[$i%4];
-+ my $j = ($i+1)/2;
-+
-+ if ($i&1) {
-+ $code.=<<___;
-+ srlx @X[$j],$shr, at X[$j] ! align X[`$i+1`]
-+ and $b,$t1,$t1 ! round $i
-+ sllx @X[$j+1],$shl1,$tx
-+ add $t2,$a,$a
-+ sllx $tx,$shl2,$tx
-+ xor $d,$t1,$t1
-+ or $tx, at X[$j], at X[$j]
-+ sethi %hi(@K[$i+1]),$t2
-+ add $t1,$a,$a
-+ or $t2,%lo(@K[$i+1]),$t2
-+ sll $a,$rot,$t3
-+ add @X[$j],$t2,$t2 ! X[`$i+1`]+K[`$i+1`]
-+ srl $a,32-$rot,$a
-+ add $b,$t3,$t3
-+ xor $b,$c,$t1
-+ add $t3,$a,$a
-+___
-+ } else {
-+ $code.=<<___;
-+ srlx @X[$j],32,$tx ! extract X[`2*$j+1`]
-+ and $b,$t1,$t1 ! round $i
-+ add $t2,$a,$a
-+ xor $d,$t1,$t1
-+ sethi %hi(@K[$i+1]),$t2
-+ add $t1,$a,$a
-+ or $t2,%lo(@K[$i+1]),$t2
-+ sll $a,$rot,$t3
-+ add $tx,$t2,$t2 ! X[`2*$j+1`]+K[`$i+1`]
-+ srl $a,32-$rot,$a
-+ add $b,$t3,$t3
-+ xor $b,$c,$t1
-+ add $t3,$a,$a
-+___
-+ }
-+}
-+
-+sub R0_1 {
-+ my ($i,$a,$b,$c,$d) = @_;
-+ my $rot = (7,12,17,22)[$i%4];
-+
-+$code.=<<___;
-+ srlx @X[0],32,$tx ! extract X[1]
-+ and $b,$t1,$t1 ! round $i
-+ add $t2,$a,$a
-+ xor $d,$t1,$t1
-+ sethi %hi(@K[$i+1]),$t2
-+ add $t1,$a,$a
-+ or $t2,%lo(@K[$i+1]),$t2
-+ sll $a,$rot,$t3
-+ add $tx,$t2,$t2 ! X[1]+K[`$i+1`]
-+ srl $a,32-$rot,$a
-+ add $b,$t3,$t3
-+ andn $b,$c,$t1
-+ add $t3,$a,$a
-+___
-+}
-+
-+sub R1 {
-+ my ($i,$a,$b,$c,$d) = @_;
-+ my $rot = (5,9,14,20)[$i%4];
-+ my $j = $i<31 ? (1+5*($i+1))%16 : (5+3*($i+1))%16;
-+ my $xi = @X[$j/2];
-+
-+$code.=<<___ if ($j&1 && ($xi=$tx));
-+ srlx @X[$j/2],32,$xi ! extract X[$j]
-+___
-+$code.=<<___;
-+ and $b,$d,$t3 ! round $i
-+ add $t2,$a,$a
-+ or $t3,$t1,$t1
-+ sethi %hi(@K[$i+1]),$t2
-+ add $t1,$a,$a
-+ or $t2,%lo(@K[$i+1]),$t2
-+ sll $a,$rot,$t3
-+ add $xi,$t2,$t2 ! X[$j]+K[`$i+1`]
-+ srl $a,32-$rot,$a
-+ add $b,$t3,$t3
-+ `$i<31?"andn":"xor"` $b,$c,$t1
-+ add $t3,$a,$a
-+___
-+}
-+
-+sub R2 {
-+ my ($i,$a,$b,$c,$d) = @_;
-+ my $rot = (4,11,16,23)[$i%4];
-+ my $j = $i<47 ? (5+3*($i+1))%16 : (0+7*($i+1))%16;
-+ my $xi = @X[$j/2];
-+
-+$code.=<<___ if ($j&1 && ($xi=$tx));
-+ srlx @X[$j/2],32,$xi ! extract X[$j]
-+___
-+$code.=<<___;
-+ add $t2,$a,$a ! round $i
-+ xor $b,$t1,$t1
-+ sethi %hi(@K[$i+1]),$t2
-+ add $t1,$a,$a
-+ or $t2,%lo(@K[$i+1]),$t2
-+ sll $a,$rot,$t3
-+ add $xi,$t2,$t2 ! X[$j]+K[`$i+1`]
-+ srl $a,32-$rot,$a
-+ add $b,$t3,$t3
-+ xor $b,$c,$t1
-+ add $t3,$a,$a
-+___
-+}
-+
-+sub R3 {
-+ my ($i,$a,$b,$c,$d) = @_;
-+ my $rot = (6,10,15,21)[$i%4];
-+ my $j = (0+7*($i+1))%16;
-+ my $xi = @X[$j/2];
-+
-+$code.=<<___;
-+ add $t2,$a,$a ! round $i
-+___
-+$code.=<<___ if ($j&1 && ($xi=$tx));
-+ srlx @X[$j/2],32,$xi ! extract X[$j]
-+___
-+$code.=<<___;
-+ orn $b,$d,$t1
-+ sethi %hi(@K[$i+1]),$t2
-+ xor $c,$t1,$t1
-+ or $t2,%lo(@K[$i+1]),$t2
-+ add $t1,$a,$a
-+ sll $a,$rot,$t3
-+ add $xi,$t2,$t2 ! X[$j]+K[`$i+1`]
-+ srl $a,32-$rot,$a
-+ add $b,$t3,$t3
-+ add $t3,$a,$a
-+___
-+}
-+
-+$code.=<<___ if ($bits==64);
-+.register %g2,#scratch
-+.register %g3,#scratch
-+___
-+$code.=<<___;
-+#include "sparc_arch.h"
-+
-+.section ".text",#alloc,#execinstr
-+
-+#ifdef __PIC__
-+SPARC_PIC_THUNK(%g1)
-+#endif
-+
-+.globl md5_block_asm_data_order
-+.align 32
-+md5_block_asm_data_order:
-+ SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
-+ ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]
-+
-+ andcc %g1, CFR_MD5, %g0
-+ be .Lsoftware
-+ nop
-+
-+ mov 4, %g1
-+ andcc %o1, 0x7, %g0
-+ lda [%o0 + %g0]0x88, %f0 ! load context
-+ lda [%o0 + %g1]0x88, %f1
-+ add %o0, 8, %o0
-+ lda [%o0 + %g0]0x88, %f2
-+ lda [%o0 + %g1]0x88, %f3
-+ bne,pn %icc, .Lhwunaligned
-+ sub %o0, 8, %o0
-+
-+.Lhw_loop:
-+ ldd [%o1 + 0x00], %f8
-+ ldd [%o1 + 0x08], %f10
-+ ldd [%o1 + 0x10], %f12
-+ ldd [%o1 + 0x18], %f14
-+ ldd [%o1 + 0x20], %f16
-+ ldd [%o1 + 0x28], %f18
-+ ldd [%o1 + 0x30], %f20
-+ subcc %o2, 1, %o2 ! done yet?
-+ ldd [%o1 + 0x38], %f22
-+ add %o1, 0x40, %o1
-+ prefetch [%o1 + 63], 20
-+
-+ .word 0x81b02800 ! MD5
-+
-+ bne,pt `$bits==64?"%xcc":"%icc"`, .Lhw_loop
-+ nop
-+
-+.Lhwfinish:
-+ sta %f0, [%o0 + %g0]0x88 ! store context
-+ sta %f1, [%o0 + %g1]0x88
-+ add %o0, 8, %o0
-+ sta %f2, [%o0 + %g0]0x88
-+ sta %f3, [%o0 + %g1]0x88
-+ retl
-+ nop
-+
-+.align 8
-+.Lhwunaligned:
-+ alignaddr %o1, %g0, %o1
-+
-+ ldd [%o1 + 0x00], %f10
-+.Lhwunaligned_loop:
-+ ldd [%o1 + 0x08], %f12
-+ ldd [%o1 + 0x10], %f14
-+ ldd [%o1 + 0x18], %f16
-+ ldd [%o1 + 0x20], %f18
-+ ldd [%o1 + 0x28], %f20
-+ ldd [%o1 + 0x30], %f22
-+ ldd [%o1 + 0x38], %f24
-+ subcc %o2, 1, %o2 ! done yet?
-+ ldd [%o1 + 0x40], %f26
-+ add %o1, 0x40, %o1
-+ prefetch [%o1 + 63], 20
-+
-+ faligndata %f10, %f12, %f8
-+ faligndata %f12, %f14, %f10
-+ faligndata %f14, %f16, %f12
-+ faligndata %f16, %f18, %f14
-+ faligndata %f18, %f20, %f16
-+ faligndata %f20, %f22, %f18
-+ faligndata %f22, %f24, %f20
-+ faligndata %f24, %f26, %f22
-+
-+ .word 0x81b02800 ! MD5
-+
-+ bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
-+ for %f26, %f26, %f10 ! %f10=%f26
-+
-+ ba .Lhwfinish
-+ nop
-+
-+.align 16
-+.Lsoftware:
-+ save %sp,-$frame,%sp
-+
-+ rd %asi,$saved_asi
-+ wr %g0,0x88,%asi ! ASI_PRIMARY_LITTLE
-+ and $inp,7,$shr
-+ andn $inp,7,$inp
-+
-+ sll $shr,3,$shr ! *=8
-+ mov 56,$shl2
-+ ld [$ctx+0],$A
-+ sub $shl2,$shr,$shl2
-+ ld [$ctx+4],$B
-+ and $shl2,32,$shl1
-+ add $shl2,8,$shl2
-+ ld [$ctx+8],$C
-+ sub $shl2,$shl1,$shl2 ! shr+shl1+shl2==64
-+ ld [$ctx+12],$D
-+ nop
-+
-+.Loop:
-+ cmp $shr,0 ! was inp aligned?
-+ ldxa [$inp+0]%asi, at X[0] ! load little-endian input
-+ ldxa [$inp+8]%asi, at X[1]
-+ ldxa [$inp+16]%asi, at X[2]
-+ ldxa [$inp+24]%asi, at X[3]
-+ ldxa [$inp+32]%asi, at X[4]
-+ sllx $A,32,$AB ! pack A,B
-+ ldxa [$inp+40]%asi, at X[5]
-+ sllx $C,32,$CD ! pack C,D
-+ ldxa [$inp+48]%asi, at X[6]
-+ or $B,$AB,$AB
-+ ldxa [$inp+56]%asi, at X[7]
-+ or $D,$CD,$CD
-+ bnz,a,pn %icc,.+8
-+ ldxa [$inp+64]%asi, at X[8]
-+
-+ srlx @X[0],$shr, at X[0] ! align X[0]
-+ sllx @X[1],$shl1,$tx
-+ sethi %hi(@K[0]),$t2
-+ sllx $tx,$shl2,$tx
-+ or $t2,%lo(@K[0]),$t2
-+ or $tx, at X[0], at X[0]
-+ xor $C,$D,$t1
-+ add @X[0],$t2,$t2 ! X[0]+K[0]
-+___
-+ for ($i=0;$i<15;$i++) { &R0($i, at V); unshift(@V,pop(@V)); }
-+ for (;$i<16;$i++) { &R0_1($i, at V); unshift(@V,pop(@V)); }
-+ for (;$i<32;$i++) { &R1($i, at V); unshift(@V,pop(@V)); }
-+ for (;$i<48;$i++) { &R2($i, at V); unshift(@V,pop(@V)); }
-+ for (;$i<64;$i++) { &R3($i, at V); unshift(@V,pop(@V)); }
-+$code.=<<___;
-+ srlx $AB,32,$t1 ! unpack A,B,C,D and accumulate
-+ add $inp,64,$inp ! advance inp
-+ srlx $CD,32,$t2
-+ add $t1,$A,$A
-+ subcc $len,1,$len ! done yet?
-+ add $AB,$B,$B
-+ add $t2,$C,$C
-+ add $CD,$D,$D
-+ srl $B,0,$B ! clruw $B
-+ bne `$bits==64?"%xcc":"%icc"`,.Loop
-+ srl $D,0,$D ! clruw $D
-+
-+ st $A,[$ctx+0] ! write out ctx
-+ st $B,[$ctx+4]
-+ st $C,[$ctx+8]
-+ st $D,[$ctx+12]
-+
-+ wr %g0,$saved_asi,%asi
-+ ret
-+ restore
-+.type md5_block_asm_data_order,#function
-+.size md5_block_asm_data_order,(.-md5_block_asm_data_order)
-+
-+.asciz "MD5 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
-+.align 4
-+___
-+
-+# Purpose of these subroutines is to explicitly encode VIS instructions,
-+# so that one can compile the module without having to specify VIS
-+# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
-+# Idea is to reserve for option to produce "universal" binary and let
-+# programmer detect if current CPU is VIS capable at run-time.
-+sub unvis {
-+my ($mnemonic,$rs1,$rs2,$rd)=@_;
-+my $ref,$opf;
-+my %visopf = ( "faligndata" => 0x048,
-+ "for" => 0x07c );
-+
-+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
-+
-+ if ($opf=$visopf{$mnemonic}) {
-+ foreach ($rs1,$rs2,$rd) {
-+ return $ref if (!/%f([0-9]{1,2})/);
-+ $_=$1;
-+ if ($1>=32) {
-+ return $ref if ($1&1);
-+ # re-encode for upper double register addressing
-+ $_=($1|$1>>5)&31;
-+ }
-+ }
-+
-+ return sprintf ".word\t0x%08x !%s",
-+ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
-+ $ref;
-+ } else {
-+ return $ref;
-+ }
-+}
-+sub unalignaddr {
-+my ($mnemonic,$rs1,$rs2,$rd)=@_;
-+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
-+my $ref="$mnemonic\t$rs1,$rs2,$rd";
-+
-+ foreach ($rs1,$rs2,$rd) {
-+ if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
-+ else { return $ref; }
-+ }
-+ return sprintf ".word\t0x%08x !%s",
-+ 0x81b00300|$rd<<25|$rs1<<14|$rs2,
-+ $ref;
-+}
-+
-+foreach (split("\n",$code)) {
-+ s/\`([^\`]*)\`/eval $1/ge;
-+
-+ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
-+ &unvis($1,$2,$3,$4)
-+ /ge;
-+ s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
-+ &unalignaddr($1,$2,$3,$4)
-+ /ge;
-+
-+ print $_,"\n";
-+}
-+
-+close STDOUT;
-Index: crypto/aes/asm/aest4-sparcv9.pl
-===================================================================
-diff -uNr openssl-1.0.1m/crypto/aes/asm/aest4-sparcv9.pl openssl-1.0.1m/crypto/aes/asm/aest4-sparcv9.pl
---- openssl-1.0.1m/crypto/aes/asm/aest4-sparcv9.pl 1970-01-01 01:00:00.000000000 +0100
-+++ openssl-1.0.1m/crypto/aes/asm/aest4-sparcv9.pl 2015-03-21 16:27:38.578043100 +0100
-@@ -0,0 +1,902 @@
-+#!/usr/bin/env perl
-+
-+# ====================================================================
-+# Written by David S. Miller <davem at devemloft.net> and Andy Polyakov
-+# <appro at openssl.org>. The module is licensed under 2-clause BSD
-+# license. October 2012. All rights reserved.
-+# ====================================================================
-+
-+######################################################################
-+# AES for SPARC T4.
-+#
-+# AES round instructions complete in 3 cycles and can be issued every
-+# cycle. It means that round calculations should take 4*rounds cycles,
-+# because any given round instruction depends on result of *both*
-+# previous instructions:
-+#
-+# |0 |1 |2 |3 |4
-+# |01|01|01|
-+# |23|23|23|
-+# |01|01|...
-+# |23|...
-+#
-+# Provided that fxor [with IV] takes 3 cycles to complete, critical
-+# path length for CBC encrypt would be 3+4*rounds, or in other words
-+# it should process one byte in at least (3+4*rounds)/16 cycles. This
-+# estimate doesn't account for "collateral" instructions, such as
-+# fetching input from memory, xor-ing it with zero-round key and
-+# storing the result. Yet, *measured* performance [for data aligned
-+# at 64-bit boundary!] deviates from this equation by less than 0.5%:
-+#
-+# 128-bit key 192- 256-
-+# CBC encrypt 2.70/2.90(*) 3.20/3.40 3.70/3.90
-+# (*) numbers after slash are for
-+# misaligned data.
-+#
-+# Out-of-order execution logic managed to fully overlap "collateral"
-+# instructions with those on critical path. Amazing!
-+#
-+# As with Intel AES-NI, question is if it's possible to improve
-+# performance of parallelizeable modes by interleaving round
-+# instructions. Provided round instruction latency and throughput
-+# optimal interleave factor is 2. But can we expect 2x performance
-+# improvement? Well, as round instructions can be issued one per
-+# cycle, they don't saturate the 2-way issue pipeline and therefore
-+# there is room for "collateral" calculations... Yet, 2x speed-up
-+# over CBC encrypt remains unattaintable:
-+#
-+# 128-bit key 192- 256-
-+# CBC decrypt 1.64/2.11 1.89/2.37 2.23/2.61
-+# CTR 1.64/2.08(*) 1.89/2.33 2.23/2.61
-+# (*) numbers after slash are for
-+# misaligned data.
-+#
-+# Estimates based on amount of instructions under assumption that
-+# round instructions are not pairable with any other instruction
-+# suggest that latter is the actual case and pipeline runs
-+# underutilized. It should be noted that T4 out-of-order execution
-+# logic is so capable that performance gain from 2x interleave is
-+# not even impressive, ~7-13% over non-interleaved code, largest
-+# for 256-bit keys.
-+
-+# To anchor to something else, software implementation processes
-+# one byte in 29 cycles with 128-bit key on same processor. Intel
-+# Sandy Bridge encrypts byte in 5.07 cycles in CBC mode and decrypts
-+# in 0.93, naturally with AES-NI.
-+
-+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-+push(@INC,"${dir}","${dir}../../perlasm");
-+require "sparcv9_modes.pl";
-+
-+&asm_init(@ARGV);
-+
-+$::evp=1; # if $evp is set to 0, script generates module with
-+# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
-+# points. These however are not fully compatible with openssl/aes.h,
-+# because they expect AES_KEY to be aligned at 64-bit boundary. When
-+# used through EVP, alignment is arranged at EVP layer. Second thing
-+# that is arranged by EVP is at least 32-bit alignment of IV.
-+
-+######################################################################
-+# single-round subroutines
-+#
-+{
-+my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
-+
-+$code=<<___;
-+.text
-+
-+.globl aes_t4_encrypt
-+.align 32
-+aes_t4_encrypt:
-+ andcc $inp, 7, %g1 ! is input aligned?
-+ andn $inp, 7, $inp
-+
-+ ldx [$key + 0], %g4
-+ ldx [$key + 8], %g5
-+
-+ ldx [$inp + 0], %o4
-+ bz,pt %icc, 1f
-+ ldx [$inp + 8], %o5
-+ ldx [$inp + 16], $inp
-+ sll %g1, 3, %g1
-+ sub %g0, %g1, %o3
-+ sllx %o4, %g1, %o4
-+ sllx %o5, %g1, %g1
-+ srlx %o5, %o3, %o5
-+ srlx $inp, %o3, %o3
-+ or %o5, %o4, %o4
-+ or %o3, %g1, %o5
-+1:
-+ ld [$key + 240], $rounds
-+ ldd [$key + 16], %f12
-+ ldd [$key + 24], %f14
-+ xor %g4, %o4, %o4
-+ xor %g5, %o5, %o5
-+ movxtod %o4, %f0
-+ movxtod %o5, %f2
-+ srl $rounds, 1, $rounds
-+ ldd [$key + 32], %f16
-+ sub $rounds, 1, $rounds
-+ ldd [$key + 40], %f18
-+ add $key, 48, $key
-+
-+.Lenc:
-+ aes_eround01 %f12, %f0, %f2, %f4
-+ aes_eround23 %f14, %f0, %f2, %f2
-+ ldd [$key + 0], %f12
-+ ldd [$key + 8], %f14
-+ sub $rounds,1,$rounds
-+ aes_eround01 %f16, %f4, %f2, %f0
-+ aes_eround23 %f18, %f4, %f2, %f2
-+ ldd [$key + 16], %f16
-+ ldd [$key + 24], %f18
-+ brnz,pt $rounds, .Lenc
-+ add $key, 32, $key
-+
-+ andcc $out, 7, $tmp ! is output aligned?
-+ aes_eround01 %f12, %f0, %f2, %f4
-+ aes_eround23 %f14, %f0, %f2, %f2
-+ aes_eround01_l %f16, %f4, %f2, %f0
-+ aes_eround23_l %f18, %f4, %f2, %f2
-+
-+ bnz,pn %icc, 2f
-+ nop
-+
-+ std %f0, [$out + 0]
-+ retl
-+ std %f2, [$out + 8]
-+
-+2: alignaddrl $out, %g0, $out
-+ mov 0xff, $mask
-+ srl $mask, $tmp, $mask
-+
-+ faligndata %f0, %f0, %f4
-+ faligndata %f0, %f2, %f6
-+ faligndata %f2, %f2, %f8
-+
-+ stda %f4, [$out + $mask]0xc0 ! partial store
-+ std %f6, [$out + 8]
-+ add $out, 16, $out
-+ orn %g0, $mask, $mask
-+ retl
-+ stda %f8, [$out + $mask]0xc0 ! partial store
-+.type aes_t4_encrypt,#function
-+.size aes_t4_encrypt,.-aes_t4_encrypt
-+
-+.globl aes_t4_decrypt
-+.align 32
-+aes_t4_decrypt:
-+ andcc $inp, 7, %g1 ! is input aligned?
-+ andn $inp, 7, $inp
-+
-+ ldx [$key + 0], %g4
-+ ldx [$key + 8], %g5
-+
-+ ldx [$inp + 0], %o4
-+ bz,pt %icc, 1f
-+ ldx [$inp + 8], %o5
-+ ldx [$inp + 16], $inp
-+ sll %g1, 3, %g1
-+ sub %g0, %g1, %o3
-+ sllx %o4, %g1, %o4
-+ sllx %o5, %g1, %g1
-+ srlx %o5, %o3, %o5
-+ srlx $inp, %o3, %o3
-+ or %o5, %o4, %o4
-+ or %o3, %g1, %o5
-+1:
-+ ld [$key + 240], $rounds
-+ ldd [$key + 16], %f12
-+ ldd [$key + 24], %f14
-+ xor %g4, %o4, %o4
-+ xor %g5, %o5, %o5
-+ movxtod %o4, %f0
-+ movxtod %o5, %f2
-+ srl $rounds, 1, $rounds
-+ ldd [$key + 32], %f16
-+ sub $rounds, 1, $rounds
-+ ldd [$key + 40], %f18
-+ add $key, 48, $key
-+
-+.Ldec:
-+ aes_dround01 %f12, %f0, %f2, %f4
-+ aes_dround23 %f14, %f0, %f2, %f2
-+ ldd [$key + 0], %f12
-+ ldd [$key + 8], %f14
-+ sub $rounds,1,$rounds
-+ aes_dround01 %f16, %f4, %f2, %f0
-+ aes_dround23 %f18, %f4, %f2, %f2
-+ ldd [$key + 16], %f16
-+ ldd [$key + 24], %f18
-+ brnz,pt $rounds, .Ldec
-+ add $key, 32, $key
-+
-+ andcc $out, 7, $tmp ! is output aligned?
-+ aes_dround01 %f12, %f0, %f2, %f4
-+ aes_dround23 %f14, %f0, %f2, %f2
-+ aes_dround01_l %f16, %f4, %f2, %f0
-+ aes_dround23_l %f18, %f4, %f2, %f2
-+
-+ bnz,pn %icc, 2f
-+ nop
-+
-+ std %f0, [$out + 0]
-+ retl
-+ std %f2, [$out + 8]
-+
-+2: alignaddrl $out, %g0, $out
-+ mov 0xff, $mask
-+ srl $mask, $tmp, $mask
-+
-+ faligndata %f0, %f0, %f4
-+ faligndata %f0, %f2, %f6
-+ faligndata %f2, %f2, %f8
-+
-+ stda %f4, [$out + $mask]0xc0 ! partial store
-+ std %f6, [$out + 8]
-+ add $out, 16, $out
-+ orn %g0, $mask, $mask
-+ retl
-+ stda %f8, [$out + $mask]0xc0 ! partial store
-+.type aes_t4_decrypt,#function
-+.size aes_t4_decrypt,.-aes_t4_decrypt
-+___
-+}
-+
-+######################################################################
-+# key setup subroutines
-+#
-+{
-+my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
-+$code.=<<___;
-+.globl aes_t4_set_encrypt_key
-+.align 32
-+aes_t4_set_encrypt_key:
-+.Lset_encrypt_key:
-+ and $inp, 7, $tmp
-+ alignaddr $inp, %g0, $inp
-+ cmp $bits, 192
-+ ldd [$inp + 0], %f0
-+ bl,pt %icc,.L128
-+ ldd [$inp + 8], %f2
-+
-+ be,pt %icc,.L192
-+ ldd [$inp + 16], %f4
-+ brz,pt $tmp, .L256aligned
-+ ldd [$inp + 24], %f6
-+
-+ ldd [$inp + 32], %f8
-+ faligndata %f0, %f2, %f0
-+ faligndata %f2, %f4, %f2
-+ faligndata %f4, %f6, %f4
-+ faligndata %f6, %f8, %f6
-+.L256aligned:
-+___
-+for ($i=0; $i<6; $i++) {
-+ $code.=<<___;
-+ std %f0, [$out + `32*$i+0`]
-+ aes_kexpand1 %f0, %f6, $i, %f0
-+ std %f2, [$out + `32*$i+8`]
-+ aes_kexpand2 %f2, %f0, %f2
-+ std %f4, [$out + `32*$i+16`]
-+ aes_kexpand0 %f4, %f2, %f4
-+ std %f6, [$out + `32*$i+24`]
-+ aes_kexpand2 %f6, %f4, %f6
-+___
-+}
-+$code.=<<___;
-+ std %f0, [$out + `32*$i+0`]
-+ aes_kexpand1 %f0, %f6, $i, %f0
-+ std %f2, [$out + `32*$i+8`]
-+ aes_kexpand2 %f2, %f0, %f2
-+ std %f4, [$out + `32*$i+16`]
-+ std %f6, [$out + `32*$i+24`]
-+ std %f0, [$out + `32*$i+32`]
-+ std %f2, [$out + `32*$i+40`]
-+
-+ mov 14, $tmp
-+ st $tmp, [$out + 240]
-+ retl
-+ xor %o0, %o0, %o0
-+
-+.align 16
-+.L192:
-+ brz,pt $tmp, .L192aligned
-+ nop
-+
-+ ldd [$inp + 24], %f6
-+ faligndata %f0, %f2, %f0
-+ faligndata %f2, %f4, %f2
-+ faligndata %f4, %f6, %f4
-+.L192aligned:
-+___
-+for ($i=0; $i<7; $i++) {
-+ $code.=<<___;
-+ std %f0, [$out + `24*$i+0`]
-+ aes_kexpand1 %f0, %f4, $i, %f0
-+ std %f2, [$out + `24*$i+8`]
-+ aes_kexpand2 %f2, %f0, %f2
-+ std %f4, [$out + `24*$i+16`]
-+ aes_kexpand2 %f4, %f2, %f4
-+___
-+}
-+$code.=<<___;
-+ std %f0, [$out + `24*$i+0`]
-+ aes_kexpand1 %f0, %f4, $i, %f0
-+ std %f2, [$out + `24*$i+8`]
-+ aes_kexpand2 %f2, %f0, %f2
-+ std %f4, [$out + `24*$i+16`]
-+ std %f0, [$out + `24*$i+24`]
-+ std %f2, [$out + `24*$i+32`]
-+
-+ mov 12, $tmp
-+ st $tmp, [$out + 240]
-+ retl
-+ xor %o0, %o0, %o0
-+
-+.align 16
-+.L128:
-+ brz,pt $tmp, .L128aligned
-+ nop
-+
-+ ldd [$inp + 16], %f4
-+ faligndata %f0, %f2, %f0
-+ faligndata %f2, %f4, %f2
-+.L128aligned:
-+___
-+for ($i=0; $i<10; $i++) {
-+ $code.=<<___;
-+ std %f0, [$out + `16*$i+0`]
-+ aes_kexpand1 %f0, %f2, $i, %f0
-+ std %f2, [$out + `16*$i+8`]
-+ aes_kexpand2 %f2, %f0, %f2
-+___
-+}
-+$code.=<<___;
-+ std %f0, [$out + `16*$i+0`]
-+ std %f2, [$out + `16*$i+8`]
-+
-+ mov 10, $tmp
-+ st $tmp, [$out + 240]
-+ retl
-+ xor %o0, %o0, %o0
-+.type aes_t4_set_encrypt_key,#function
-+.size aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key
-+
-+.globl aes_t4_set_decrypt_key
-+.align 32
-+aes_t4_set_decrypt_key:
-+ mov %o7, %o5
-+ call .Lset_encrypt_key
-+ nop
-+
-+ mov %o5, %o7
-+ sll $tmp, 4, $inp ! $tmp is number of rounds
-+ add $tmp, 2, $tmp
-+ add $out, $inp, $inp ! $inp=$out+16*rounds
-+ srl $tmp, 2, $tmp ! $tmp=(rounds+2)/4
-+
-+.Lkey_flip:
-+ ldd [$out + 0], %f0
-+ ldd [$out + 8], %f2
-+ ldd [$out + 16], %f4
-+ ldd [$out + 24], %f6
-+ ldd [$inp + 0], %f8
-+ ldd [$inp + 8], %f10
-+ ldd [$inp - 16], %f12
-+ ldd [$inp - 8], %f14
-+ sub $tmp, 1, $tmp
-+ std %f0, [$inp + 0]
-+ std %f2, [$inp + 8]
-+ std %f4, [$inp - 16]
-+ std %f6, [$inp - 8]
-+ std %f8, [$out + 0]
-+ std %f10, [$out + 8]
-+ std %f12, [$out + 16]
-+ std %f14, [$out + 24]
-+ add $out, 32, $out
-+ brnz $tmp, .Lkey_flip
-+ sub $inp, 32, $inp
-+
-+ retl
-+ xor %o0, %o0, %o0
-+.type aes_t4_set_decrypt_key,#function
-+.size aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
-+___
-+}
-+
-+{{{
-+my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
-+my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
-+
-+$code.=<<___;
-+.align 32
-+_aes128_loadkey:
-+ ldx [$key + 0], %g4
-+ ldx [$key + 8], %g5
-+___
-+for ($i=2; $i<22;$i++) { # load key schedule
-+ $code.=<<___;
-+ ldd [$key + `8*$i`], %f`12+2*$i`
-+___
-+}
-+$code.=<<___;
-+ retl
-+ nop
-+.type _aes128_loadkey,#function
-+.size _aes128_loadkey,.-_aes128_loadkey
-+_aes128_load_enckey=_aes128_loadkey
-+_aes128_load_deckey=_aes128_loadkey
-+
-+.align 32
-+_aes128_encrypt_1x:
-+___
-+for ($i=0; $i<4; $i++) {
-+ $code.=<<___;
-+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
-+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
-+ aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
-+ aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
-+___
-+}
-+$code.=<<___;
-+ aes_eround01 %f48, %f0, %f2, %f4
-+ aes_eround23 %f50, %f0, %f2, %f2
-+ aes_eround01_l %f52, %f4, %f2, %f0
-+ retl
-+ aes_eround23_l %f54, %f4, %f2, %f2
-+.type _aes128_encrypt_1x,#function
-+.size _aes128_encrypt_1x,.-_aes128_encrypt_1x
-+
-+.align 32
-+_aes128_encrypt_2x:
-+___
-+for ($i=0; $i<4; $i++) {
-+ $code.=<<___;
-+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
-+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
-+ aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
-+ aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
-+ aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
-+ aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
-+ aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
-+ aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
-+___
-+}
-+$code.=<<___;
-+ aes_eround01 %f48, %f0, %f2, %f8
-+ aes_eround23 %f50, %f0, %f2, %f2
-+ aes_eround01 %f48, %f4, %f6, %f10
-+ aes_eround23 %f50, %f4, %f6, %f6
-+ aes_eround01_l %f52, %f8, %f2, %f0
-+ aes_eround23_l %f54, %f8, %f2, %f2
-+ aes_eround01_l %f52, %f10, %f6, %f4
-+ retl
-+ aes_eround23_l %f54, %f10, %f6, %f6
-+.type _aes128_encrypt_2x,#function
-+.size _aes128_encrypt_2x,.-_aes128_encrypt_2x
-+
-+.align 32
-+_aes128_decrypt_1x:
-+___
-+for ($i=0; $i<4; $i++) {
-+ $code.=<<___;
-+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
-+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
-+ aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
-+ aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
-+___
-+}
-+$code.=<<___;
-+ aes_dround01 %f48, %f0, %f2, %f4
-+ aes_dround23 %f50, %f0, %f2, %f2
-+ aes_dround01_l %f52, %f4, %f2, %f0
-+ retl
-+ aes_dround23_l %f54, %f4, %f2, %f2
-+.type _aes128_decrypt_1x,#function
-+.size _aes128_decrypt_1x,.-_aes128_decrypt_1x
-+
-+.align 32
-+_aes128_decrypt_2x:
-+___
-+for ($i=0; $i<4; $i++) {
-+ $code.=<<___;
-+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
-+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
-+ aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
-+ aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
-+ aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
-+ aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
-+ aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
-+ aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
-+___
-+}
-+$code.=<<___;
-+ aes_dround01 %f48, %f0, %f2, %f8
-+ aes_dround23 %f50, %f0, %f2, %f2
-+ aes_dround01 %f48, %f4, %f6, %f10
-+ aes_dround23 %f50, %f4, %f6, %f6
-+ aes_dround01_l %f52, %f8, %f2, %f0
-+ aes_dround23_l %f54, %f8, %f2, %f2
-+ aes_dround01_l %f52, %f10, %f6, %f4
-+ retl
-+ aes_dround23_l %f54, %f10, %f6, %f6
-+.type _aes128_decrypt_2x,#function
-+.size _aes128_decrypt_2x,.-_aes128_decrypt_2x
-+
-+.align 32
-+_aes192_loadkey:
-+_aes256_loadkey:
-+ ldx [$key + 0], %g4
-+ ldx [$key + 8], %g5
-+___
-+for ($i=2; $i<26;$i++) { # load key schedule
-+ $code.=<<___;
-+ ldd [$key + `8*$i`], %f`12+2*$i`
-+___
-+}
-+$code.=<<___;
-+ retl
-+ nop
-+.type _aes192_loadkey,#function
-+.size _aes192_loadkey,.-_aes192_loadkey
-+_aes192_load_enckey=_aes192_loadkey
-+_aes192_load_deckey=_aes192_loadkey
-+_aes256_load_enckey=_aes192_loadkey
-+_aes256_load_deckey=_aes192_loadkey
-+
-+.align 32
-+_aes192_encrypt_1x:
-+___
-+for ($i=0; $i<5; $i++) {
-+ $code.=<<___;
-+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
-+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
-+ aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
-+ aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
-+___
-+}
-+$code.=<<___;
-+ aes_eround01 %f56, %f0, %f2, %f4
-+ aes_eround23 %f58, %f0, %f2, %f2
-+ aes_eround01_l %f60, %f4, %f2, %f0
-+ retl
-+ aes_eround23_l %f62, %f4, %f2, %f2
-+.type _aes192_encrypt_1x,#function
-+.size _aes192_encrypt_1x,.-_aes192_encrypt_1x
-+
-+.align 32
-+_aes192_encrypt_2x:
-+___
-+for ($i=0; $i<5; $i++) {
-+ $code.=<<___;
-+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
-+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
-+ aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
-+ aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
-+ aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
-+ aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
-+ aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
-+ aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
-+___
-+}
-+$code.=<<___;
-+ aes_eround01 %f56, %f0, %f2, %f8
-+ aes_eround23 %f58, %f0, %f2, %f2
-+ aes_eround01 %f56, %f4, %f6, %f10
-+ aes_eround23 %f58, %f4, %f6, %f6
-+ aes_eround01_l %f60, %f8, %f2, %f0
-+ aes_eround23_l %f62, %f8, %f2, %f2
-+ aes_eround01_l %f60, %f10, %f6, %f4
-+ retl
-+ aes_eround23_l %f62, %f10, %f6, %f6
-+.type _aes192_encrypt_2x,#function
-+.size _aes192_encrypt_2x,.-_aes192_encrypt_2x
-+
-+.align 32
-+_aes192_decrypt_1x:
-+___
-+for ($i=0; $i<5; $i++) {
-+ $code.=<<___;
-+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
-+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
-+ aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
-+ aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
-+___
-+}
-+$code.=<<___;
-+ aes_dround01 %f56, %f0, %f2, %f4
-+ aes_dround23 %f58, %f0, %f2, %f2
-+ aes_dround01_l %f60, %f4, %f2, %f0
-+ retl
-+ aes_dround23_l %f62, %f4, %f2, %f2
-+.type _aes192_decrypt_1x,#function
-+.size _aes192_decrypt_1x,.-_aes192_decrypt_1x
-+
-+.align 32
-+_aes192_decrypt_2x:
-+___
-+for ($i=0; $i<5; $i++) {
-+ $code.=<<___;
-+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
-+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
-+ aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
-+ aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
-+ aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
-+ aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
-+ aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
-+ aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
-+___
-+}
-+$code.=<<___;
-+ aes_dround01 %f56, %f0, %f2, %f8
-+ aes_dround23 %f58, %f0, %f2, %f2
-+ aes_dround01 %f56, %f4, %f6, %f10
-+ aes_dround23 %f58, %f4, %f6, %f6
-+ aes_dround01_l %f60, %f8, %f2, %f0
-+ aes_dround23_l %f62, %f8, %f2, %f2
-+ aes_dround01_l %f60, %f10, %f6, %f4
-+ retl
-+ aes_dround23_l %f62, %f10, %f6, %f6
-+.type _aes192_decrypt_2x,#function
-+.size _aes192_decrypt_2x,.-_aes192_decrypt_2x
-+
-+.align 32
-+_aes256_encrypt_1x:
-+ aes_eround01 %f16, %f0, %f2, %f4
-+ aes_eround23 %f18, %f0, %f2, %f2
-+ ldd [$key + 208], %f16
-+ ldd [$key + 216], %f18
-+ aes_eround01 %f20, %f4, %f2, %f0
-+ aes_eround23 %f22, %f4, %f2, %f2
-+ ldd [$key + 224], %f20
-+ ldd [$key + 232], %f22
-+___
-+for ($i=1; $i<6; $i++) {
-+ $code.=<<___;
-+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
-+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
-+ aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
-+ aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
-+___
-+}
-+$code.=<<___;
-+ aes_eround01 %f16, %f0, %f2, %f4
-+ aes_eround23 %f18, %f0, %f2, %f2
-+ ldd [$key + 16], %f16
-+ ldd [$key + 24], %f18
-+ aes_eround01_l %f20, %f4, %f2, %f0
-+ aes_eround23_l %f22, %f4, %f2, %f2
-+ ldd [$key + 32], %f20
-+ retl
-+ ldd [$key + 40], %f22
-+.type _aes256_encrypt_1x,#function
-+.size _aes256_encrypt_1x,.-_aes256_encrypt_1x
-+
-+.align 32
-+_aes256_encrypt_2x:
-+ aes_eround01 %f16, %f0, %f2, %f8
-+ aes_eround23 %f18, %f0, %f2, %f2
-+ aes_eround01 %f16, %f4, %f6, %f10
-+ aes_eround23 %f18, %f4, %f6, %f6
-+ ldd [$key + 208], %f16
-+ ldd [$key + 216], %f18
-+ aes_eround01 %f20, %f8, %f2, %f0
-+ aes_eround23 %f22, %f8, %f2, %f2
-+ aes_eround01 %f20, %f10, %f6, %f4
-+ aes_eround23 %f22, %f10, %f6, %f6
-+ ldd [$key + 224], %f20
-+ ldd [$key + 232], %f22
-+___
-+for ($i=1; $i<6; $i++) {
-+ $code.=<<___;
-+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
-+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
-+ aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
-+ aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
-+ aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
-+ aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
-+ aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
-+ aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
-+___
-+}
-+$code.=<<___;
-+ aes_eround01 %f16, %f0, %f2, %f8
-+ aes_eround23 %f18, %f0, %f2, %f2
-+ aes_eround01 %f16, %f4, %f6, %f10
-+ aes_eround23 %f18, %f4, %f6, %f6
-+ ldd [$key + 16], %f16
-+ ldd [$key + 24], %f18
-+ aes_eround01_l %f20, %f8, %f2, %f0
-+ aes_eround23_l %f22, %f8, %f2, %f2
-+ aes_eround01_l %f20, %f10, %f6, %f4
-+ aes_eround23_l %f22, %f10, %f6, %f6
-+ ldd [$key + 32], %f20
-+ retl
-+ ldd [$key + 40], %f22
-+.type _aes256_encrypt_2x,#function
-+.size _aes256_encrypt_2x,.-_aes256_encrypt_2x
-+
-+.align 32
-+_aes256_decrypt_1x:
-+ aes_dround01 %f16, %f0, %f2, %f4
-+ aes_dround23 %f18, %f0, %f2, %f2
-+ ldd [$key + 208], %f16
-+ ldd [$key + 216], %f18
-+ aes_dround01 %f20, %f4, %f2, %f0
-+ aes_dround23 %f22, %f4, %f2, %f2
-+ ldd [$key + 224], %f20
-+ ldd [$key + 232], %f22
-+___
-+for ($i=1; $i<6; $i++) {
-+ $code.=<<___;
-+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
-+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
-+ aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
-+ aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
-+___
-+}
-+$code.=<<___;
-+ aes_dround01 %f16, %f0, %f2, %f4
-+ aes_dround23 %f18, %f0, %f2, %f2
-+ ldd [$key + 16], %f16
-+ ldd [$key + 24], %f18
-+ aes_dround01_l %f20, %f4, %f2, %f0
-+ aes_dround23_l %f22, %f4, %f2, %f2
-+ ldd [$key + 32], %f20
-+ retl
-+ ldd [$key + 40], %f22
-+.type _aes256_decrypt_1x,#function
-+.size _aes256_decrypt_1x,.-_aes256_decrypt_1x
-+
-+.align 32
-+_aes256_decrypt_2x:
-+ aes_dround01 %f16, %f0, %f2, %f8
-+ aes_dround23 %f18, %f0, %f2, %f2
-+ aes_dround01 %f16, %f4, %f6, %f10
-+ aes_dround23 %f18, %f4, %f6, %f6
-+ ldd [$key + 208], %f16
-+ ldd [$key + 216], %f18
-+ aes_dround01 %f20, %f8, %f2, %f0
-+ aes_dround23 %f22, %f8, %f2, %f2
-+ aes_dround01 %f20, %f10, %f6, %f4
-+ aes_dround23 %f22, %f10, %f6, %f6
-+ ldd [$key + 224], %f20
-+ ldd [$key + 232], %f22
-+___
-+for ($i=1; $i<6; $i++) {
-+ $code.=<<___;
-+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
-+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
-+ aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
-+ aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
-+ aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
-+ aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
-+ aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
-+ aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
-+___
-+}
-+$code.=<<___;
-+ aes_dround01 %f16, %f0, %f2, %f8
-+ aes_dround23 %f18, %f0, %f2, %f2
-+ aes_dround01 %f16, %f4, %f6, %f10
-+ aes_dround23 %f18, %f4, %f6, %f6
-+ ldd [$key + 16], %f16
-+ ldd [$key + 24], %f18
-+ aes_dround01_l %f20, %f8, %f2, %f0
-+ aes_dround23_l %f22, %f8, %f2, %f2
-+ aes_dround01_l %f20, %f10, %f6, %f4
-+ aes_dround23_l %f22, %f10, %f6, %f6
-+ ldd [$key + 32], %f20
-+ retl
-+ ldd [$key + 40], %f22
-+.type _aes256_decrypt_2x,#function
-+.size _aes256_decrypt_2x,.-_aes256_decrypt_2x
-+___
-+
-+&alg_cbc_encrypt_implement("aes",128);
-+&alg_cbc_encrypt_implement("aes",192);
-+&alg_cbc_encrypt_implement("aes",256);
-+
-+&alg_cbc_decrypt_implement("aes",128);
-+&alg_cbc_decrypt_implement("aes",192);
-+&alg_cbc_decrypt_implement("aes",256);
-+
-+if ($::evp) {
-+ &alg_ctr32_implement("aes",128);
-+ &alg_ctr32_implement("aes",192);
-+ &alg_ctr32_implement("aes",256);
-+}
-+}}}
-+
-+if (!$::evp) {
-+$code.=<<___;
-+.global AES_encrypt
-+AES_encrypt=aes_t4_encrypt
-+.global AES_decrypt
-+AES_decrypt=aes_t4_decrypt
-+.global AES_set_encrypt_key
-+.align 32
-+AES_set_encrypt_key:
-+ andcc %o2, 7, %g0 ! check alignment
-+ bnz,a,pn %icc, 1f
-+ mov -1, %o0
-+ brz,a,pn %o0, 1f
-+ mov -1, %o0
-+ brz,a,pn %o2, 1f
-+ mov -1, %o0
-+ andncc %o1, 0x1c0, %g0
-+ bnz,a,pn %icc, 1f
-+ mov -2, %o0
-+ cmp %o1, 128
-+ bl,a,pn %icc, 1f
-+ mov -2, %o0
-+ b aes_t4_set_encrypt_key
-+ nop
-+1: retl
-+ nop
-+.type AES_set_encrypt_key,#function
-+.size AES_set_encrypt_key,.-AES_set_encrypt_key
-+
-+.global AES_set_decrypt_key
-+.align 32
-+AES_set_decrypt_key:
-+ andcc %o2, 7, %g0 ! check alignment
-+ bnz,a,pn %icc, 1f
-+ mov -1, %o0
-+ brz,a,pn %o0, 1f
-+ mov -1, %o0
-+ brz,a,pn %o2, 1f
-+ mov -1, %o0
-+ andncc %o1, 0x1c0, %g0
-+ bnz,a,pn %icc, 1f
-+ mov -2, %o0
-+ cmp %o1, 128
-+ bl,a,pn %icc, 1f
-+ mov -2, %o0
-+ b aes_t4_set_decrypt_key
-+ nop
-+1: retl
-+ nop
-+.type AES_set_decrypt_key,#function
-+.size AES_set_decrypt_key,.-AES_set_decrypt_key
-+___
-+
-+my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
-+
-+$code.=<<___;
-+.globl AES_cbc_encrypt
-+.align 32
-+AES_cbc_encrypt:
-+ ld [$key + 240], %g1
-+ nop
-+ brz $enc, .Lcbc_decrypt
-+ cmp %g1, 12
-+
-+ bl,pt %icc, aes128_t4_cbc_encrypt
-+ nop
-+ be,pn %icc, aes192_t4_cbc_encrypt
-+ nop
-+ ba aes256_t4_cbc_encrypt
-+ nop
-+
-+.Lcbc_decrypt:
-+ bl,pt %icc, aes128_t4_cbc_decrypt
-+ nop
-+ be,pn %icc, aes192_t4_cbc_decrypt
-+ nop
-+ ba aes256_t4_cbc_decrypt
-+ nop
-+.type AES_cbc_encrypt,#function
-+.size AES_cbc_encrypt,.-AES_cbc_encrypt
-+___
-+}
-+$code.=<<___;
-+.asciz "AES for SPARC T4, David S. Miller, Andy Polyakov"
-+.align 4
-+___
-+
-+&emit_assembler();
-+
-+close STDOUT;
-Index: crypto/des/asm/dest4-sparcv9.pl
-===================================================================
-diff -uNr openssl-1.0.1m/crypto/des/asm/dest4-sparcv9.pl openssl-1.0.1m/crypto/des/asm/dest4-sparcv9.pl
---- openssl-1.0.1m/crypto/des/asm/dest4-sparcv9.pl 1970-01-01 01:00:00.000000000 +0100
-+++ openssl-1.0.1m/crypto/des/asm/dest4-sparcv9.pl 2015-03-21 16:27:38.578043100 +0100
-@@ -0,0 +1,602 @@
-+#!/usr/bin/env perl
-+
-+# ====================================================================
-+# Written by David S. Miller <davem at devemloft.net> and Andy Polyakov
-+# <appro at openssl.org>. The module is licensed under 2-clause BSD
-+# license. March 2013. All rights reserved.
-+# ====================================================================
-+
-+######################################################################
-+# DES for SPARC T4.
-+#
-+# As with other hardware-assisted ciphers CBC encrypt results [for
-+# aligned data] are virtually identical to critical path lengths:
-+#
-+# DES Triple-DES
-+# CBC encrypt 4.14/4.15(*) 11.7/11.7
-+# CBC decrypt 1.77/4.11(**) 6.42/7.47
-+#
-+# (*) numbers after slash are for
-+# misaligned data;
-+# (**) this is result for largest
-+# block size, unlike all other
-+# cases smaller blocks results
-+# are better[?];
-+
-+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-+push(@INC,"${dir}","${dir}../../perlasm");
-+require "sparcv9_modes.pl";
-+
-+&asm_init(@ARGV);
-+
-+$code.=<<___ if ($::abibits==64);
-+.register %g2,#scratch
-+.register %g3,#scratch
-+___
-+
-+$code.=<<___;
-+.text
-+___
-+
-+{ my ($inp,$out)=("%o0","%o1");
-+
-+$code.=<<___;
-+.align 32
-+.globl des_t4_key_expand
-+.type des_t4_key_expand,#function
-+des_t4_key_expand:
-+ andcc $inp, 0x7, %g0
-+ alignaddr $inp, %g0, $inp
-+ bz,pt %icc, 1f
-+ ldd [$inp + 0x00], %f0
-+ ldd [$inp + 0x08], %f2
-+ faligndata %f0, %f2, %f0
-+1: des_kexpand %f0, 0, %f0
-+ des_kexpand %f0, 1, %f2
-+ std %f0, [$out + 0x00]
-+ des_kexpand %f2, 3, %f6
-+ std %f2, [$out + 0x08]
-+ des_kexpand %f2, 2, %f4
-+ des_kexpand %f6, 3, %f10
-+ std %f6, [$out + 0x18]
-+ des_kexpand %f6, 2, %f8
-+ std %f4, [$out + 0x10]
-+ des_kexpand %f10, 3, %f14
-+ std %f10, [$out + 0x28]
-+ des_kexpand %f10, 2, %f12
-+ std %f8, [$out + 0x20]
-+ des_kexpand %f14, 1, %f16
-+ std %f14, [$out + 0x38]
-+ des_kexpand %f16, 3, %f20
-+ std %f12, [$out + 0x30]
-+ des_kexpand %f16, 2, %f18
-+ std %f16, [$out + 0x40]
-+ des_kexpand %f20, 3, %f24
-+ std %f20, [$out + 0x50]
-+ des_kexpand %f20, 2, %f22
-+ std %f18, [$out + 0x48]
-+ des_kexpand %f24, 3, %f28
-+ std %f24, [$out + 0x60]
-+ des_kexpand %f24, 2, %f26
-+ std %f22, [$out + 0x58]
-+ des_kexpand %f28, 1, %f30
-+ std %f28, [$out + 0x70]
-+ std %f26, [$out + 0x68]
-+ retl
-+ std %f30, [$out + 0x78]
-+.size des_t4_key_expand,.-des_t4_key_expand
-+___
-+}
-+{ my ($inp,$out,$len,$key,$ivec) = map("%o$_",(0..4));
-+ my ($ileft,$iright,$omask) = map("%g$_",(1..3));
-+
-+$code.=<<___;
-+.globl des_t4_cbc_encrypt
-+.align 32
-+des_t4_cbc_encrypt:
-+ ld [$ivec + 0], %f0 ! load ivec
-+ ld [$ivec + 4], %f1
-+
-+ and $inp, 7, $ileft
-+ andn $inp, 7, $inp
-+ sll $ileft, 3, $ileft
-+ mov 0xff, $omask
-+ prefetch [$inp], 20
-+ prefetch [$inp + 63], 20
-+ sub %g0, $ileft, $iright
-+ and $out, 7, %g4
-+ alignaddrl $out, %g0, $out
-+ srl $omask, %g4, $omask
-+ srlx $len, 3, $len
-+ movrz %g4, 0, $omask
-+ prefetch [$out], 22
-+
-+ ldd [$key + 0x00], %f4 ! load key schedule
-+ ldd [$key + 0x08], %f6
-+ ldd [$key + 0x10], %f8
-+ ldd [$key + 0x18], %f10
-+ ldd [$key + 0x20], %f12
-+ ldd [$key + 0x28], %f14
-+ ldd [$key + 0x30], %f16
-+ ldd [$key + 0x38], %f18
-+ ldd [$key + 0x40], %f20
-+ ldd [$key + 0x48], %f22
-+ ldd [$key + 0x50], %f24
-+ ldd [$key + 0x58], %f26
-+ ldd [$key + 0x60], %f28
-+ ldd [$key + 0x68], %f30
-+ ldd [$key + 0x70], %f32
-+ ldd [$key + 0x78], %f34
-+
-+.Ldes_cbc_enc_loop:
-+ ldx [$inp + 0], %g4
-+ brz,pt $ileft, 4f
-+ nop
-+
-+ ldx [$inp + 8], %g5
-+ sllx %g4, $ileft, %g4
-+ srlx %g5, $iright, %g5
-+ or %g5, %g4, %g4
-+4:
-+ movxtod %g4, %f2
-+ prefetch [$inp + 8+63], 20
-+ add $inp, 8, $inp
-+ fxor %f2, %f0, %f0 ! ^= ivec
-+ prefetch [$out + 63], 22
-+
-+ des_ip %f0, %f0
-+ des_round %f4, %f6, %f0, %f0
-+ des_round %f8, %f10, %f0, %f0
-+ des_round %f12, %f14, %f0, %f0
-+ des_round %f16, %f18, %f0, %f0
-+ des_round %f20, %f22, %f0, %f0
-+ des_round %f24, %f26, %f0, %f0
-+ des_round %f28, %f30, %f0, %f0
-+ des_round %f32, %f34, %f0, %f0
-+ des_iip %f0, %f0
-+
-+ brnz,pn $omask, 2f
-+ sub $len, 1, $len
-+
-+ std %f0, [$out + 0]
-+ brnz,pt $len, .Ldes_cbc_enc_loop
-+ add $out, 8, $out
-+
-+ st %f0, [$ivec + 0] ! write out ivec
-+ retl
-+ st %f1, [$ivec + 4]
-+
-+.align 16
-+2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
-+ ! and ~4x deterioration
-+ ! in inp==out case
-+ faligndata %f0, %f0, %f2 ! handle unaligned output
-+
-+ stda %f2, [$out + $omask]0xc0 ! partial store
-+ add $out, 8, $out
-+ orn %g0, $omask, $omask
-+ stda %f2, [$out + $omask]0xc0 ! partial store
-+
-+ brnz,pt $len, .Ldes_cbc_enc_loop+4
-+ orn %g0, $omask, $omask
-+
-+ st %f0, [$ivec + 0] ! write out ivec
-+ retl
-+ st %f1, [$ivec + 4]
-+.type des_t4_cbc_encrypt,#function
-+.size des_t4_cbc_encrypt,.-des_t4_cbc_encrypt
-+
-+.globl des_t4_cbc_decrypt
-+.align 32
-+des_t4_cbc_decrypt:
-+ ld [$ivec + 0], %f2 ! load ivec
-+ ld [$ivec + 4], %f3
-+
-+ and $inp, 7, $ileft
-+ andn $inp, 7, $inp
-+ sll $ileft, 3, $ileft
-+ mov 0xff, $omask
-+ prefetch [$inp], 20
-+ prefetch [$inp + 63], 20
-+ sub %g0, $ileft, $iright
-+ and $out, 7, %g4
-+ alignaddrl $out, %g0, $out
-+ srl $omask, %g4, $omask
-+ srlx $len, 3, $len
-+ movrz %g4, 0, $omask
-+ prefetch [$out], 22
-+
-+ ldd [$key + 0x78], %f4 ! load key schedule
-+ ldd [$key + 0x70], %f6
-+ ldd [$key + 0x68], %f8
-+ ldd [$key + 0x60], %f10
-+ ldd [$key + 0x58], %f12
-+ ldd [$key + 0x50], %f14
-+ ldd [$key + 0x48], %f16
-+ ldd [$key + 0x40], %f18
-+ ldd [$key + 0x38], %f20
-+ ldd [$key + 0x30], %f22
-+ ldd [$key + 0x28], %f24
-+ ldd [$key + 0x20], %f26
-+ ldd [$key + 0x18], %f28
-+ ldd [$key + 0x10], %f30
-+ ldd [$key + 0x08], %f32
-+ ldd [$key + 0x00], %f34
-+
-+.Ldes_cbc_dec_loop:
-+ ldx [$inp + 0], %g4
-+ brz,pt $ileft, 4f
-+ nop
-+
-+ ldx [$inp + 8], %g5
-+ sllx %g4, $ileft, %g4
-+ srlx %g5, $iright, %g5
-+ or %g5, %g4, %g4
-+4:
-+ movxtod %g4, %f0
-+ prefetch [$inp + 8+63], 20
-+ add $inp, 8, $inp
-+ prefetch [$out + 63], 22
-+
-+ des_ip %f0, %f0
-+ des_round %f4, %f6, %f0, %f0
-+ des_round %f8, %f10, %f0, %f0
-+ des_round %f12, %f14, %f0, %f0
-+ des_round %f16, %f18, %f0, %f0
-+ des_round %f20, %f22, %f0, %f0
-+ des_round %f24, %f26, %f0, %f0
-+ des_round %f28, %f30, %f0, %f0
-+ des_round %f32, %f34, %f0, %f0
-+ des_iip %f0, %f0
-+
-+ fxor %f2, %f0, %f0 ! ^= ivec
-+ movxtod %g4, %f2
-+
-+ brnz,pn $omask, 2f
-+ sub $len, 1, $len
-+
-+ std %f0, [$out + 0]
-+ brnz,pt $len, .Ldes_cbc_dec_loop
-+ add $out, 8, $out
-+
-+ st %f2, [$ivec + 0] ! write out ivec
-+ retl
-+ st %f3, [$ivec + 4]
-+
-+.align 16
-+2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
-+ ! and ~4x deterioration
-+ ! in inp==out case
-+ faligndata %f0, %f0, %f0 ! handle unaligned output
-+
-+ stda %f0, [$out + $omask]0xc0 ! partial store
-+ add $out, 8, $out
-+ orn %g0, $omask, $omask
-+ stda %f0, [$out + $omask]0xc0 ! partial store
-+
-+ brnz,pt $len, .Ldes_cbc_dec_loop+4
-+ orn %g0, $omask, $omask
-+
-+ st %f2, [$ivec + 0] ! write out ivec
-+ retl
-+ st %f3, [$ivec + 4]
-+.type des_t4_cbc_decrypt,#function
-+.size des_t4_cbc_decrypt,.-des_t4_cbc_decrypt
-+___
-+
-+# One might wonder why does one have back-to-back des_iip/des_ip
-+# pairs between EDE passes. Indeed, aren't they inverse of each other?
-+# They almost are. Outcome of the pair is 32-bit words being swapped
-+# in target register. Consider pair of des_iip/des_ip as a way to
-+# perform the due swap, it's actually fastest way in this case.
-+
-+$code.=<<___;
-+.globl des_t4_ede3_cbc_encrypt
-+.align 32
-+des_t4_ede3_cbc_encrypt:
-+ ld [$ivec + 0], %f0 ! load ivec
-+ ld [$ivec + 4], %f1
-+
-+ and $inp, 7, $ileft
-+ andn $inp, 7, $inp
-+ sll $ileft, 3, $ileft
-+ mov 0xff, $omask
-+ prefetch [$inp], 20
-+ prefetch [$inp + 63], 20
-+ sub %g0, $ileft, $iright
-+ and $out, 7, %g4
-+ alignaddrl $out, %g0, $out
-+ srl $omask, %g4, $omask
-+ srlx $len, 3, $len
-+ movrz %g4, 0, $omask
-+ prefetch [$out], 22
-+
-+ ldd [$key + 0x00], %f4 ! load key schedule
-+ ldd [$key + 0x08], %f6
-+ ldd [$key + 0x10], %f8
-+ ldd [$key + 0x18], %f10
-+ ldd [$key + 0x20], %f12
-+ ldd [$key + 0x28], %f14
-+ ldd [$key + 0x30], %f16
-+ ldd [$key + 0x38], %f18
-+ ldd [$key + 0x40], %f20
-+ ldd [$key + 0x48], %f22
-+ ldd [$key + 0x50], %f24
-+ ldd [$key + 0x58], %f26
-+ ldd [$key + 0x60], %f28
-+ ldd [$key + 0x68], %f30
-+ ldd [$key + 0x70], %f32
-+ ldd [$key + 0x78], %f34
-+
-+.Ldes_ede3_cbc_enc_loop:
-+ ldx [$inp + 0], %g4
-+ brz,pt $ileft, 4f
-+ nop
-+
-+ ldx [$inp + 8], %g5
-+ sllx %g4, $ileft, %g4
-+ srlx %g5, $iright, %g5
-+ or %g5, %g4, %g4
-+4:
-+ movxtod %g4, %f2
-+ prefetch [$inp + 8+63], 20
-+ add $inp, 8, $inp
-+ fxor %f2, %f0, %f0 ! ^= ivec
-+ prefetch [$out + 63], 22
-+
-+ des_ip %f0, %f0
-+ des_round %f4, %f6, %f0, %f0
-+ des_round %f8, %f10, %f0, %f0
-+ des_round %f12, %f14, %f0, %f0
-+ des_round %f16, %f18, %f0, %f0
-+ ldd [$key + 0x100-0x08], %f36
-+ ldd [$key + 0x100-0x10], %f38
-+ des_round %f20, %f22, %f0, %f0
-+ ldd [$key + 0x100-0x18], %f40
-+ ldd [$key + 0x100-0x20], %f42
-+ des_round %f24, %f26, %f0, %f0
-+ ldd [$key + 0x100-0x28], %f44
-+ ldd [$key + 0x100-0x30], %f46
-+ des_round %f28, %f30, %f0, %f0
-+ ldd [$key + 0x100-0x38], %f48
-+ ldd [$key + 0x100-0x40], %f50
-+ des_round %f32, %f34, %f0, %f0
-+ ldd [$key + 0x100-0x48], %f52
-+ ldd [$key + 0x100-0x50], %f54
-+ des_iip %f0, %f0
-+
-+ ldd [$key + 0x100-0x58], %f56
-+ ldd [$key + 0x100-0x60], %f58
-+ des_ip %f0, %f0
-+ ldd [$key + 0x100-0x68], %f60
-+ ldd [$key + 0x100-0x70], %f62
-+ des_round %f36, %f38, %f0, %f0
-+ ldd [$key + 0x100-0x78], %f36
-+ ldd [$key + 0x100-0x80], %f38
-+ des_round %f40, %f42, %f0, %f0
-+ des_round %f44, %f46, %f0, %f0
-+ des_round %f48, %f50, %f0, %f0
-+ ldd [$key + 0x100+0x00], %f40
-+ ldd [$key + 0x100+0x08], %f42
-+ des_round %f52, %f54, %f0, %f0
-+ ldd [$key + 0x100+0x10], %f44
-+ ldd [$key + 0x100+0x18], %f46
-+ des_round %f56, %f58, %f0, %f0
-+ ldd [$key + 0x100+0x20], %f48
-+ ldd [$key + 0x100+0x28], %f50
-+ des_round %f60, %f62, %f0, %f0
-+ ldd [$key + 0x100+0x30], %f52
-+ ldd [$key + 0x100+0x38], %f54
-+ des_round %f36, %f38, %f0, %f0
-+ ldd [$key + 0x100+0x40], %f56
-+ ldd [$key + 0x100+0x48], %f58
-+ des_iip %f0, %f0
-+
-+ ldd [$key + 0x100+0x50], %f60
-+ ldd [$key + 0x100+0x58], %f62
-+ des_ip %f0, %f0
-+ ldd [$key + 0x100+0x60], %f36
-+ ldd [$key + 0x100+0x68], %f38
-+ des_round %f40, %f42, %f0, %f0
-+ ldd [$key + 0x100+0x70], %f40
-+ ldd [$key + 0x100+0x78], %f42
-+ des_round %f44, %f46, %f0, %f0
-+ des_round %f48, %f50, %f0, %f0
-+ des_round %f52, %f54, %f0, %f0
-+ des_round %f56, %f58, %f0, %f0
-+ des_round %f60, %f62, %f0, %f0
-+ des_round %f36, %f38, %f0, %f0
-+ des_round %f40, %f42, %f0, %f0
-+ des_iip %f0, %f0
-+
-+ brnz,pn $omask, 2f
-+ sub $len, 1, $len
-+
-+ std %f0, [$out + 0]
-+ brnz,pt $len, .Ldes_ede3_cbc_enc_loop
-+ add $out, 8, $out
-+
-+ st %f0, [$ivec + 0] ! write out ivec
-+ retl
-+ st %f1, [$ivec + 4]
-+
-+.align 16
-+2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
-+ ! and ~2x deterioration
-+ ! in inp==out case
-+ faligndata %f0, %f0, %f2 ! handle unaligned output
-+
-+ stda %f2, [$out + $omask]0xc0 ! partial store
-+ add $out, 8, $out
-+ orn %g0, $omask, $omask
-+ stda %f2, [$out + $omask]0xc0 ! partial store
-+
-+ brnz,pt $len, .Ldes_ede3_cbc_enc_loop+4
-+ orn %g0, $omask, $omask
-+
-+ st %f0, [$ivec + 0] ! write out ivec
-+ retl
-+ st %f1, [$ivec + 4]
-+.type des_t4_ede3_cbc_encrypt,#function
-+.size des_t4_ede3_cbc_encrypt,.-des_t4_ede3_cbc_encrypt
-+
-+.globl des_t4_ede3_cbc_decrypt
-+.align 32
-+des_t4_ede3_cbc_decrypt:
-+ ld [$ivec + 0], %f2 ! load ivec
-+ ld [$ivec + 4], %f3
-+
-+ and $inp, 7, $ileft
-+ andn $inp, 7, $inp
-+ sll $ileft, 3, $ileft
-+ mov 0xff, $omask
-+ prefetch [$inp], 20
-+ prefetch [$inp + 63], 20
-+ sub %g0, $ileft, $iright
-+ and $out, 7, %g4
-+ alignaddrl $out, %g0, $out
-+ srl $omask, %g4, $omask
-+ srlx $len, 3, $len
-+ movrz %g4, 0, $omask
-+ prefetch [$out], 22
-+
-+ ldd [$key + 0x100+0x78], %f4 ! load key schedule
-+ ldd [$key + 0x100+0x70], %f6
-+ ldd [$key + 0x100+0x68], %f8
-+ ldd [$key + 0x100+0x60], %f10
-+ ldd [$key + 0x100+0x58], %f12
-+ ldd [$key + 0x100+0x50], %f14
-+ ldd [$key + 0x100+0x48], %f16
-+ ldd [$key + 0x100+0x40], %f18
-+ ldd [$key + 0x100+0x38], %f20
-+ ldd [$key + 0x100+0x30], %f22
-+ ldd [$key + 0x100+0x28], %f24
-+ ldd [$key + 0x100+0x20], %f26
-+ ldd [$key + 0x100+0x18], %f28
-+ ldd [$key + 0x100+0x10], %f30
-+ ldd [$key + 0x100+0x08], %f32
-+ ldd [$key + 0x100+0x00], %f34
-+
-+.Ldes_ede3_cbc_dec_loop:
-+ ldx [$inp + 0], %g4
-+ brz,pt $ileft, 4f
-+ nop
-+
-+ ldx [$inp + 8], %g5
-+ sllx %g4, $ileft, %g4
-+ srlx %g5, $iright, %g5
-+ or %g5, %g4, %g4
-+4:
-+ movxtod %g4, %f0
-+ prefetch [$inp + 8+63], 20
-+ add $inp, 8, $inp
-+ prefetch [$out + 63], 22
-+
-+ des_ip %f0, %f0
-+ des_round %f4, %f6, %f0, %f0
-+ des_round %f8, %f10, %f0, %f0
-+ des_round %f12, %f14, %f0, %f0
-+ des_round %f16, %f18, %f0, %f0
-+ ldd [$key + 0x80+0x00], %f36
-+ ldd [$key + 0x80+0x08], %f38
-+ des_round %f20, %f22, %f0, %f0
-+ ldd [$key + 0x80+0x10], %f40
-+ ldd [$key + 0x80+0x18], %f42
-+ des_round %f24, %f26, %f0, %f0
-+ ldd [$key + 0x80+0x20], %f44
-+ ldd [$key + 0x80+0x28], %f46
-+ des_round %f28, %f30, %f0, %f0
-+ ldd [$key + 0x80+0x30], %f48
-+ ldd [$key + 0x80+0x38], %f50
-+ des_round %f32, %f34, %f0, %f0
-+ ldd [$key + 0x80+0x40], %f52
-+ ldd [$key + 0x80+0x48], %f54
-+ des_iip %f0, %f0
-+
-+ ldd [$key + 0x80+0x50], %f56
-+ ldd [$key + 0x80+0x58], %f58
-+ des_ip %f0, %f0
-+ ldd [$key + 0x80+0x60], %f60
-+ ldd [$key + 0x80+0x68], %f62
-+ des_round %f36, %f38, %f0, %f0
-+ ldd [$key + 0x80+0x70], %f36
-+ ldd [$key + 0x80+0x78], %f38
-+ des_round %f40, %f42, %f0, %f0
-+ des_round %f44, %f46, %f0, %f0
-+ des_round %f48, %f50, %f0, %f0
-+ ldd [$key + 0x80-0x08], %f40
-+ ldd [$key + 0x80-0x10], %f42
-+ des_round %f52, %f54, %f0, %f0
-+ ldd [$key + 0x80-0x18], %f44
-+ ldd [$key + 0x80-0x20], %f46
-+ des_round %f56, %f58, %f0, %f0
-+ ldd [$key + 0x80-0x28], %f48
-+ ldd [$key + 0x80-0x30], %f50
-+ des_round %f60, %f62, %f0, %f0
-+ ldd [$key + 0x80-0x38], %f52
-+ ldd [$key + 0x80-0x40], %f54
-+ des_round %f36, %f38, %f0, %f0
-+ ldd [$key + 0x80-0x48], %f56
-+ ldd [$key + 0x80-0x50], %f58
-+ des_iip %f0, %f0
-+
-+ ldd [$key + 0x80-0x58], %f60
-+ ldd [$key + 0x80-0x60], %f62
-+ des_ip %f0, %f0
-+ ldd [$key + 0x80-0x68], %f36
-+ ldd [$key + 0x80-0x70], %f38
-+ des_round %f40, %f42, %f0, %f0
-+ ldd [$key + 0x80-0x78], %f40
-+ ldd [$key + 0x80-0x80], %f42
-+ des_round %f44, %f46, %f0, %f0
-+ des_round %f48, %f50, %f0, %f0
-+ des_round %f52, %f54, %f0, %f0
-+ des_round %f56, %f58, %f0, %f0
-+ des_round %f60, %f62, %f0, %f0
-+ des_round %f36, %f38, %f0, %f0
-+ des_round %f40, %f42, %f0, %f0
-+ des_iip %f0, %f0
-+
-+ fxor %f2, %f0, %f0 ! ^= ivec
-+ movxtod %g4, %f2
-+
-+ brnz,pn $omask, 2f
-+ sub $len, 1, $len
-+
-+ std %f0, [$out + 0]
-+ brnz,pt $len, .Ldes_ede3_cbc_dec_loop
-+ add $out, 8, $out
-+
-+ st %f2, [$ivec + 0] ! write out ivec
-+ retl
-+ st %f3, [$ivec + 4]
-+
-+.align 16
-+2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
-+ ! and ~3x deterioration
-+ ! in inp==out case
-+ faligndata %f0, %f0, %f0 ! handle unaligned output
-+
-+ stda %f0, [$out + $omask]0xc0 ! partial store
-+ add $out, 8, $out
-+ orn %g0, $omask, $omask
-+ stda %f0, [$out + $omask]0xc0 ! partial store
-+
-+ brnz,pt $len, .Ldes_ede3_cbc_dec_loop+4
-+ orn %g0, $omask, $omask
-+
-+ st %f2, [$ivec + 0] ! write out ivec
-+ retl
-+ st %f3, [$ivec + 4]
-+.type des_t4_ede3_cbc_decrypt,#function
-+.size des_t4_ede3_cbc_decrypt,.-des_t4_ede3_cbc_decrypt
-+___
-+}
-+$code.=<<___;
-+.asciz "DES for SPARC T4, David S. Miller, Andy Polyakov"
-+.align 4
-+___
-+
-+&emit_assembler();
-+
-+close STDOUT;
-Index: crypto/perlasm/sparcv9_modes.pl
-===================================================================
-diff -uNr openssl-1.0.1m/crypto/perlasm/sparcv9_modes.pl openssl-1.0.1m/crypto/perlasm/sparcv9_modes.pl
---- openssl-1.0.1m/crypto/perlasm/sparcv9_modes.pl 1970-01-01 01:00:00.000000000 +0100
-+++ openssl-1.0.1m/crypto/perlasm/sparcv9_modes.pl 2015-03-21 16:27:38.578043100 +0100
-@@ -0,0 +1,1680 @@
-+#!/usr/bin/env perl
-+
-+# Mode-specific implementations for SPARC Architecture 2011. There
-+# is a T4 dependency, though: an ASI value that is not specified in
-+# the Architecture Manual. But as the SPARC universe is rather
-+# monocultural, we assume that any processor capable of executing the
-+# crypto instructions can handle the ASI in question as well. This
-+# means that we ought to keep our eyes open when new processors
-+# emerge...
-+#
-+# As for the above-mentioned ASI: it's the so-called "block
-+# initializing store", which cancels the "read" in "read-update-write"
-+# on cache lines. This is a "cooperative" optimization, as it reduces
-+# overall pressure on the memory interface. The benefits can't be
-+# observed or quantified with the usual benchmarks; on the contrary,
-+# you may notice that single-thread performance for parallelizable
-+# modes is ~1.5% worse for the largest block sizes [though a few
-+# percent better for shorter ones]. All this is based on suggestions
-+# from David Miller.
-+
-+sub asm_init { # to be called with @ARGV as argument
-+ for (@_) { $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
-+ if ($::abibits==64) { $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
-+ else { $::bias=0; $::frame=112; $::size_t_cc="%icc"; }
-+}
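A usage sketch, per the sub's own comment that it is to be called with @ARGV:

    asm_init(@ARGV);   # e.g. "-m64" or "-xarch=v9" selects bias 2047, frame 192, %xcc
    # 32-bit defaults otherwise: bias 0, frame 112, %icc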
-+
-+# unified interface
-+my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
-+# local variables
-+my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));
-+
-+sub alg_cbc_encrypt_implement {
-+my ($alg,$bits) = @_;
-+
-+$::code.=<<___;
-+.globl ${alg}${bits}_t4_cbc_encrypt
-+.align 32
-+${alg}${bits}_t4_cbc_encrypt:
-+ save %sp, -$::frame, %sp
-+ sub $inp, $out, $blk_init ! $inp!=$out
-+___
-+$::code.=<<___ if (!$::evp);
-+ andcc $ivec, 7, $ivoff
-+ alignaddr $ivec, %g0, $ivec
-+
-+ ldd [$ivec + 0], %f0 ! load ivec
-+ bz,pt %icc, 1f
-+ ldd [$ivec + 8], %f2
-+ ldd [$ivec + 16], %f4
-+ faligndata %f0, %f2, %f0
-+ faligndata %f2, %f4, %f2
-+1:
-+___
-+$::code.=<<___ if ($::evp);
-+ ld [$ivec + 0], %f0
-+ ld [$ivec + 4], %f1
-+ ld [$ivec + 8], %f2
-+ ld [$ivec + 12], %f3
-+___
-+$::code.=<<___;
-+ prefetch [$inp], 20
-+ prefetch [$inp + 63], 20
-+ call _${alg}${bits}_load_enckey
-+ and $inp, 7, $ileft
-+ andn $inp, 7, $inp
-+ sll $ileft, 3, $ileft
-+ mov 64, $iright
-+ mov 0xff, $omask
-+ sub $iright, $ileft, $iright
-+ and $out, 7, $ooff
-+ cmp $len, 127
-+ movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
-+ movleu $::size_t_cc, 0, $blk_init ! $len<128 ||
-+ brnz,pn $blk_init, .L${bits}cbc_enc_blk ! $inp==$out)
-+ srl $omask, $ooff, $omask
-+
-+ alignaddrl $out, %g0, $out
-+ srlx $len, 4, $len
-+ prefetch [$out], 22
-+
-+.L${bits}_cbc_enc_loop:
-+ ldx [$inp + 0], %o0
-+ brz,pt $ileft, 4f
-+ ldx [$inp + 8], %o1
-+
-+ ldx [$inp + 16], %o2
-+ sllx %o0, $ileft, %o0
-+ srlx %o1, $iright, %g1
-+ sllx %o1, $ileft, %o1
-+ or %g1, %o0, %o0
-+ srlx %o2, $iright, %o2
-+ or %o2, %o1, %o1
-+4:
-+ xor %g4, %o0, %o0 ! ^= rk[0]
-+ xor %g5, %o1, %o1
-+ movxtod %o0, %f12
-+ movxtod %o1, %f14
-+
-+ fxor %f12, %f0, %f0 ! ^= ivec
-+ fxor %f14, %f2, %f2
-+ prefetch [$out + 63], 22
-+ prefetch [$inp + 16+63], 20
-+ call _${alg}${bits}_encrypt_1x
-+ add $inp, 16, $inp
-+
-+ brnz,pn $ooff, 2f
-+ sub $len, 1, $len
-+
-+ std %f0, [$out + 0]
-+ std %f2, [$out + 8]
-+ brnz,pt $len, .L${bits}_cbc_enc_loop
-+ add $out, 16, $out
-+___
-+$::code.=<<___ if ($::evp);
-+ st %f0, [$ivec + 0]
-+ st %f1, [$ivec + 4]
-+ st %f2, [$ivec + 8]
-+ st %f3, [$ivec + 12]
-+___
-+$::code.=<<___ if (!$::evp);
-+ brnz,pn $ivoff, 3f
-+ nop
-+
-+ std %f0, [$ivec + 0] ! write out ivec
-+ std %f2, [$ivec + 8]
-+___
-+$::code.=<<___;
-+ ret
-+ restore
-+
-+.align 16
-+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
-+ ! and ~3x deterioration
-+ ! in inp==out case
-+ faligndata %f0, %f0, %f4 ! handle unaligned output
-+ faligndata %f0, %f2, %f6
-+ faligndata %f2, %f2, %f8
-+
-+ stda %f4, [$out + $omask]0xc0 ! partial store
-+ std %f6, [$out + 8]
-+ add $out, 16, $out
-+ orn %g0, $omask, $omask
-+ stda %f8, [$out + $omask]0xc0 ! partial store
-+
-+ brnz,pt $len, .L${bits}_cbc_enc_loop+4
-+ orn %g0, $omask, $omask
-+___
-+$::code.=<<___ if ($::evp);
-+ st %f0, [$ivec + 0]
-+ st %f1, [$ivec + 4]
-+ st %f2, [$ivec + 8]
-+ st %f3, [$ivec + 12]
-+___
-+$::code.=<<___ if (!$::evp);
-+ brnz,pn $ivoff, 3f
-+ nop
-+
-+ std %f0, [$ivec + 0] ! write out ivec
-+ std %f2, [$ivec + 8]
-+ ret
-+ restore
-+
-+.align 16
-+3: alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
-+ mov 0xff, $omask
-+ srl $omask, $ivoff, $omask
-+ faligndata %f0, %f0, %f4
-+ faligndata %f0, %f2, %f6
-+ faligndata %f2, %f2, %f8
-+ stda %f4, [$ivec + $omask]0xc0
-+ std %f6, [$ivec + 8]
-+ add $ivec, 16, $ivec
-+ orn %g0, $omask, $omask
-+ stda %f8, [$ivec + $omask]0xc0
-+___
-+$::code.=<<___;
-+ ret
-+ restore
-+
-+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+.align 32
-+.L${bits}cbc_enc_blk:
-+ add $out, $len, $blk_init
-+ and $blk_init, 63, $blk_init ! tail
-+ sub $len, $blk_init, $len
-+ add $blk_init, 15, $blk_init ! round up to 16n
-+ srlx $len, 4, $len
-+ srl $blk_init, 4, $blk_init
-+
-+.L${bits}_cbc_enc_blk_loop:
-+ ldx [$inp + 0], %o0
-+ brz,pt $ileft, 5f
-+ ldx [$inp + 8], %o1
-+
-+ ldx [$inp + 16], %o2
-+ sllx %o0, $ileft, %o0
-+ srlx %o1, $iright, %g1
-+ sllx %o1, $ileft, %o1
-+ or %g1, %o0, %o0
-+ srlx %o2, $iright, %o2
-+ or %o2, %o1, %o1
-+5:
-+ xor %g4, %o0, %o0 ! ^= rk[0]
-+ xor %g5, %o1, %o1
-+ movxtod %o0, %f12
-+ movxtod %o1, %f14
-+
-+ fxor %f12, %f0, %f0 ! ^= ivec
-+ fxor %f14, %f2, %f2
-+ prefetch [$inp + 16+63], 20
-+ call _${alg}${bits}_encrypt_1x
-+ add $inp, 16, $inp
-+ sub $len, 1, $len
-+
-+ stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ add $out, 8, $out
-+ stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ brnz,pt $len, .L${bits}_cbc_enc_blk_loop
-+ add $out, 8, $out
-+
-+ membar #StoreLoad|#StoreStore
-+ brnz,pt $blk_init, .L${bits}_cbc_enc_loop
-+ mov $blk_init, $len
-+___
-+$::code.=<<___ if ($::evp);
-+ st %f0, [$ivec + 0]
-+ st %f1, [$ivec + 4]
-+ st %f2, [$ivec + 8]
-+ st %f3, [$ivec + 12]
-+___
-+$::code.=<<___ if (!$::evp);
-+ brnz,pn $ivoff, 3b
-+ nop
-+
-+ std %f0, [$ivec + 0] ! write out ivec
-+ std %f2, [$ivec + 8]
-+___
-+$::code.=<<___;
-+ ret
-+ restore
-+.type ${alg}${bits}_t4_cbc_encrypt,#function
-+.size ${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
-+___
-+}
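The call sites live in the cipher-specific generator scripts, which this diff does not show; a hypothetical invocation producing the AES CBC encrypt entry points would look like:

    alg_cbc_encrypt_implement("aes", 128);   # would emit aes128_t4_cbc_encrypt
    alg_cbc_encrypt_implement("aes", 256);   # would emit aes256_t4_cbc_encrypt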
-+
-+sub alg_cbc_decrypt_implement {
-+my ($alg,$bits) = @_;
-+
-+$::code.=<<___;
-+.globl ${alg}${bits}_t4_cbc_decrypt
-+.align 32
-+${alg}${bits}_t4_cbc_decrypt:
-+ save %sp, -$::frame, %sp
-+ sub $inp, $out, $blk_init ! $inp!=$out
-+___
-+$::code.=<<___ if (!$::evp);
-+ andcc $ivec, 7, $ivoff
-+ alignaddr $ivec, %g0, $ivec
-+
-+ ldd [$ivec + 0], %f12 ! load ivec
-+ bz,pt %icc, 1f
-+ ldd [$ivec + 8], %f14
-+ ldd [$ivec + 16], %f0
-+ faligndata %f12, %f14, %f12
-+ faligndata %f14, %f0, %f14
-+1:
-+___
-+$::code.=<<___ if ($::evp);
-+ ld [$ivec + 0], %f12 ! load ivec
-+ ld [$ivec + 4], %f13
-+ ld [$ivec + 8], %f14
-+ ld [$ivec + 12], %f15
-+___
-+$::code.=<<___;
-+ prefetch [$inp], 20
-+ prefetch [$inp + 63], 20
-+ call _${alg}${bits}_load_deckey
-+ and $inp, 7, $ileft
-+ andn $inp, 7, $inp
-+ sll $ileft, 3, $ileft
-+ mov 64, $iright
-+ mov 0xff, $omask
-+ sub $iright, $ileft, $iright
-+ and $out, 7, $ooff
-+ cmp $len, 255
-+ movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
-+ movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
-+ brnz,pn $blk_init, .L${bits}cbc_dec_blk ! $inp==$out)
-+ srl $omask, $ooff, $omask
-+
-+ andcc $len, 16, %g0 ! is number of blocks even?
-+ srlx $len, 4, $len
-+ alignaddrl $out, %g0, $out
-+ bz %icc, .L${bits}_cbc_dec_loop2x
-+ prefetch [$out], 22
-+.L${bits}_cbc_dec_loop:
-+ ldx [$inp + 0], %o0
-+ brz,pt $ileft, 4f
-+ ldx [$inp + 8], %o1
-+
-+ ldx [$inp + 16], %o2
-+ sllx %o0, $ileft, %o0
-+ srlx %o1, $iright, %g1
-+ sllx %o1, $ileft, %o1
-+ or %g1, %o0, %o0
-+ srlx %o2, $iright, %o2
-+ or %o2, %o1, %o1
-+4:
-+ xor %g4, %o0, %o2 ! ^= rk[0]
-+ xor %g5, %o1, %o3
-+ movxtod %o2, %f0
-+ movxtod %o3, %f2
-+
-+ prefetch [$out + 63], 22
-+ prefetch [$inp + 16+63], 20
-+ call _${alg}${bits}_decrypt_1x
-+ add $inp, 16, $inp
-+
-+ fxor %f12, %f0, %f0 ! ^= ivec
-+ fxor %f14, %f2, %f2
-+ movxtod %o0, %f12
-+ movxtod %o1, %f14
-+
-+ brnz,pn $ooff, 2f
-+ sub $len, 1, $len
-+
-+ std %f0, [$out + 0]
-+ std %f2, [$out + 8]
-+ brnz,pt $len, .L${bits}_cbc_dec_loop2x
-+ add $out, 16, $out
-+___
-+$::code.=<<___ if ($::evp);
-+ st %f12, [$ivec + 0]
-+ st %f13, [$ivec + 4]
-+ st %f14, [$ivec + 8]
-+ st %f15, [$ivec + 12]
-+___
-+$::code.=<<___ if (!$::evp);
-+ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
-+ nop
-+
-+ std %f12, [$ivec + 0] ! write out ivec
-+ std %f14, [$ivec + 8]
-+___
-+$::code.=<<___;
-+ ret
-+ restore
-+
-+.align 16
-+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
-+ ! and ~3x deterioration
-+ ! in inp==out case
-+ faligndata %f0, %f0, %f4 ! handle unaligned output
-+ faligndata %f0, %f2, %f6
-+ faligndata %f2, %f2, %f8
-+
-+ stda %f4, [$out + $omask]0xc0 ! partial store
-+ std %f6, [$out + 8]
-+ add $out, 16, $out
-+ orn %g0, $omask, $omask
-+ stda %f8, [$out + $omask]0xc0 ! partial store
-+
-+ brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
-+ orn %g0, $omask, $omask
-+___
-+$::code.=<<___ if ($::evp);
-+ st %f12, [$ivec + 0]
-+ st %f13, [$ivec + 4]
-+ st %f14, [$ivec + 8]
-+ st %f15, [$ivec + 12]
-+___
-+$::code.=<<___ if (!$::evp);
-+ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
-+ nop
-+
-+ std %f12, [$ivec + 0] ! write out ivec
-+ std %f14, [$ivec + 8]
-+___
-+$::code.=<<___;
-+ ret
-+ restore
-+
-+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+.align 32
-+.L${bits}_cbc_dec_loop2x:
-+ ldx [$inp + 0], %o0
-+ ldx [$inp + 8], %o1
-+ ldx [$inp + 16], %o2
-+ brz,pt $ileft, 4f
-+ ldx [$inp + 24], %o3
-+
-+ ldx [$inp + 32], %o4
-+ sllx %o0, $ileft, %o0
-+ srlx %o1, $iright, %g1
-+ or %g1, %o0, %o0
-+ sllx %o1, $ileft, %o1
-+ srlx %o2, $iright, %g1
-+ or %g1, %o1, %o1
-+ sllx %o2, $ileft, %o2
-+ srlx %o3, $iright, %g1
-+ or %g1, %o2, %o2
-+ sllx %o3, $ileft, %o3
-+ srlx %o4, $iright, %o4
-+ or %o4, %o3, %o3
-+4:
-+ xor %g4, %o0, %o4 ! ^= rk[0]
-+ xor %g5, %o1, %o5
-+ movxtod %o4, %f0
-+ movxtod %o5, %f2
-+ xor %g4, %o2, %o4
-+ xor %g5, %o3, %o5
-+ movxtod %o4, %f4
-+ movxtod %o5, %f6
-+
-+ prefetch [$out + 63], 22
-+ prefetch [$inp + 32+63], 20
-+ call _${alg}${bits}_decrypt_2x
-+ add $inp, 32, $inp
-+
-+ movxtod %o0, %f8
-+ movxtod %o1, %f10
-+ fxor %f12, %f0, %f0 ! ^= ivec
-+ fxor %f14, %f2, %f2
-+ movxtod %o2, %f12
-+ movxtod %o3, %f14
-+ fxor %f8, %f4, %f4
-+ fxor %f10, %f6, %f6
-+
-+ brnz,pn $ooff, 2f
-+ sub $len, 2, $len
-+
-+ std %f0, [$out + 0]
-+ std %f2, [$out + 8]
-+ std %f4, [$out + 16]
-+ std %f6, [$out + 24]
-+ brnz,pt $len, .L${bits}_cbc_dec_loop2x
-+ add $out, 32, $out
-+___
-+$::code.=<<___ if ($::evp);
-+ st %f12, [$ivec + 0]
-+ st %f13, [$ivec + 4]
-+ st %f14, [$ivec + 8]
-+ st %f15, [$ivec + 12]
-+___
-+$::code.=<<___ if (!$::evp);
-+ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
-+ nop
-+
-+ std %f12, [$ivec + 0] ! write out ivec
-+ std %f14, [$ivec + 8]
-+___
-+$::code.=<<___;
-+ ret
-+ restore
-+
-+.align 16
-+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
-+ ! and ~3x deterioration
-+ ! in inp==out case
-+ faligndata %f0, %f0, %f8 ! handle unaligned output
-+ faligndata %f0, %f2, %f0
-+ faligndata %f2, %f4, %f2
-+ faligndata %f4, %f6, %f4
-+ faligndata %f6, %f6, %f6
-+ stda %f8, [$out + $omask]0xc0 ! partial store
-+ std %f0, [$out + 8]
-+ std %f2, [$out + 16]
-+ std %f4, [$out + 24]
-+ add $out, 32, $out
-+ orn %g0, $omask, $omask
-+ stda %f6, [$out + $omask]0xc0 ! partial store
-+
-+ brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
-+ orn %g0, $omask, $omask
-+___
-+$::code.=<<___ if ($::evp);
-+ st %f12, [$ivec + 0]
-+ st %f13, [$ivec + 4]
-+ st %f14, [$ivec + 8]
-+ st %f15, [$ivec + 12]
-+___
-+$::code.=<<___ if (!$::evp);
-+ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
-+ nop
-+
-+ std %f12, [$ivec + 0] ! write out ivec
-+ std %f14, [$ivec + 8]
-+ ret
-+ restore
-+
-+.align 16
-+.L${bits}_cbc_dec_unaligned_ivec:
-+ alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
-+ mov 0xff, $omask
-+ srl $omask, $ivoff, $omask
-+ faligndata %f12, %f12, %f0
-+ faligndata %f12, %f14, %f2
-+ faligndata %f14, %f14, %f4
-+ stda %f0, [$ivec + $omask]0xc0
-+ std %f2, [$ivec + 8]
-+ add $ivec, 16, $ivec
-+ orn %g0, $omask, $omask
-+ stda %f4, [$ivec + $omask]0xc0
-+___
-+$::code.=<<___;
-+ ret
-+ restore
-+
-+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+.align 32
-+.L${bits}cbc_dec_blk:
-+ add $out, $len, $blk_init
-+ and $blk_init, 63, $blk_init ! tail
-+ sub $len, $blk_init, $len
-+ add $blk_init, 15, $blk_init ! round up to 16n
-+ srlx $len, 4, $len
-+ srl $blk_init, 4, $blk_init
-+ sub $len, 1, $len
-+ add $blk_init, 1, $blk_init
-+
-+.L${bits}_cbc_dec_blk_loop2x:
-+ ldx [$inp + 0], %o0
-+ ldx [$inp + 8], %o1
-+ ldx [$inp + 16], %o2
-+ brz,pt $ileft, 5f
-+ ldx [$inp + 24], %o3
-+
-+ ldx [$inp + 32], %o4
-+ sllx %o0, $ileft, %o0
-+ srlx %o1, $iright, %g1
-+ or %g1, %o0, %o0
-+ sllx %o1, $ileft, %o1
-+ srlx %o2, $iright, %g1
-+ or %g1, %o1, %o1
-+ sllx %o2, $ileft, %o2
-+ srlx %o3, $iright, %g1
-+ or %g1, %o2, %o2
-+ sllx %o3, $ileft, %o3
-+ srlx %o4, $iright, %o4
-+ or %o4, %o3, %o3
-+5:
-+ xor %g4, %o0, %o4 ! ^= rk[0]
-+ xor %g5, %o1, %o5
-+ movxtod %o4, %f0
-+ movxtod %o5, %f2
-+ xor %g4, %o2, %o4
-+ xor %g5, %o3, %o5
-+ movxtod %o4, %f4
-+ movxtod %o5, %f6
-+
-+ prefetch [$inp + 32+63], 20
-+ call _${alg}${bits}_decrypt_2x
-+ add $inp, 32, $inp
-+ subcc $len, 2, $len
-+
-+ movxtod %o0, %f8
-+ movxtod %o1, %f10
-+ fxor %f12, %f0, %f0 ! ^= ivec
-+ fxor %f14, %f2, %f2
-+ movxtod %o2, %f12
-+ movxtod %o3, %f14
-+ fxor %f8, %f4, %f4
-+ fxor %f10, %f6, %f6
-+
-+ stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ add $out, 8, $out
-+ stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ add $out, 8, $out
-+ stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ add $out, 8, $out
-+ stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ bgu,pt $::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
-+ add $out, 8, $out
-+
-+ add $blk_init, $len, $len
-+ andcc $len, 1, %g0 ! is number of blocks even?
-+ membar #StoreLoad|#StoreStore
-+ bnz,pt %icc, .L${bits}_cbc_dec_loop
-+ srl $len, 0, $len
-+ brnz,pn $len, .L${bits}_cbc_dec_loop2x
-+ nop
-+___
-+$::code.=<<___ if ($::evp);
-+ st %f12, [$ivec + 0] ! write out ivec
-+ st %f13, [$ivec + 4]
-+ st %f14, [$ivec + 8]
-+ st %f15, [$ivec + 12]
-+___
-+$::code.=<<___ if (!$::evp);
-+ brnz,pn $ivoff, 3b
-+ nop
-+
-+ std %f12, [$ivec + 0] ! write out ivec
-+ std %f14, [$ivec + 8]
-+___
-+$::code.=<<___;
-+ ret
-+ restore
-+.type ${alg}${bits}_t4_cbc_decrypt,#function
-+.size ${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
-+___
-+}
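For orientation, a reference model of the decrypt dataflow above, operating on 16-byte strings with Perl's string xor; block_decrypt() is a hypothetical stand-in for the _${alg}${bits}_decrypt_1x core:

    sub block_decrypt { return $_[0] }          # stub, for illustration only
    sub cbc_decrypt_ref {
        my ($iv, @ct) = @_;                     # IV and ciphertext blocks
        my @pt;
        for my $c (@ct) {
            push @pt, block_decrypt($c) ^ $iv;  # fxor %f12/%f14 with the result
            $iv = $c;                           # ciphertext becomes the next IV
        }
        return @pt;
    }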
-+
-+sub alg_ctr32_implement {
-+my ($alg,$bits) = @_;
-+
-+$::code.=<<___;
-+.globl ${alg}${bits}_t4_ctr32_encrypt
-+.align 32
-+${alg}${bits}_t4_ctr32_encrypt:
-+ save %sp, -$::frame, %sp
-+
-+ prefetch [$inp], 20
-+ prefetch [$inp + 63], 20
-+ call _${alg}${bits}_load_enckey
-+ sllx $len, 4, $len
-+
-+ ld [$ivec + 0], %l4 ! counter
-+ ld [$ivec + 4], %l5
-+ ld [$ivec + 8], %l6
-+ ld [$ivec + 12], %l7
-+
-+ sllx %l4, 32, %o5
-+ or %l5, %o5, %o5
-+ sllx %l6, 32, %g1
-+ xor %o5, %g4, %g4 ! ^= rk[0]
-+ xor %g1, %g5, %g5
-+ movxtod %g4, %f14 ! most significant 64 bits
-+
-+ sub $inp, $out, $blk_init ! $inp!=$out
-+ and $inp, 7, $ileft
-+ andn $inp, 7, $inp
-+ sll $ileft, 3, $ileft
-+ mov 64, $iright
-+ mov 0xff, $omask
-+ sub $iright, $ileft, $iright
-+ and $out, 7, $ooff
-+ cmp $len, 255
-+ movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
-+ movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
-+ brnz,pn $blk_init, .L${bits}_ctr32_blk ! $inp==$out)
-+ srl $omask, $ooff, $omask
-+
-+ andcc $len, 16, %g0 ! is number of blocks even?
-+ alignaddrl $out, %g0, $out
-+ bz %icc, .L${bits}_ctr32_loop2x
-+ srlx $len, 4, $len
-+.L${bits}_ctr32_loop:
-+ ldx [$inp + 0], %o0
-+ brz,pt $ileft, 4f
-+ ldx [$inp + 8], %o1
-+
-+ ldx [$inp + 16], %o2
-+ sllx %o0, $ileft, %o0
-+ srlx %o1, $iright, %g1
-+ sllx %o1, $ileft, %o1
-+ or %g1, %o0, %o0
-+ srlx %o2, $iright, %o2
-+ or %o2, %o1, %o1
-+4:
-+ xor %g5, %l7, %g1 ! ^= rk[0]
-+ add %l7, 1, %l7
-+ movxtod %g1, %f2
-+ srl %l7, 0, %l7 ! clruw
-+ prefetch [$out + 63], 22
-+ prefetch [$inp + 16+63], 20
-+___
-+$::code.=<<___ if ($alg eq "aes");
-+ aes_eround01 %f16, %f14, %f2, %f4
-+ aes_eround23 %f18, %f14, %f2, %f2
-+___
-+$::code.=<<___ if ($alg eq "cmll");
-+ camellia_f %f16, %f2, %f14, %f2
-+ camellia_f %f18, %f14, %f2, %f0
-+___
-+$::code.=<<___;
-+ call _${alg}${bits}_encrypt_1x+8
-+ add $inp, 16, $inp
-+
-+ movxtod %o0, %f10
-+ movxtod %o1, %f12
-+ fxor %f10, %f0, %f0 ! ^= inp
-+ fxor %f12, %f2, %f2
-+
-+ brnz,pn $ooff, 2f
-+ sub $len, 1, $len
-+
-+ std %f0, [$out + 0]
-+ std %f2, [$out + 8]
-+ brnz,pt $len, .L${bits}_ctr32_loop2x
-+ add $out, 16, $out
-+
-+ ret
-+ restore
-+
-+.align 16
-+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
-+ ! and ~3x deterioration
-+ ! in inp==out case
-+ faligndata %f0, %f0, %f4 ! handle unaligned output
-+ faligndata %f0, %f2, %f6
-+ faligndata %f2, %f2, %f8
-+ stda %f4, [$out + $omask]0xc0 ! partial store
-+ std %f6, [$out + 8]
-+ add $out, 16, $out
-+ orn %g0, $omask, $omask
-+ stda %f8, [$out + $omask]0xc0 ! partial store
-+
-+ brnz,pt $len, .L${bits}_ctr32_loop2x+4
-+ orn %g0, $omask, $omask
-+
-+ ret
-+ restore
-+
-+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+.align 32
-+.L${bits}_ctr32_loop2x:
-+ ldx [$inp + 0], %o0
-+ ldx [$inp + 8], %o1
-+ ldx [$inp + 16], %o2
-+ brz,pt $ileft, 4f
-+ ldx [$inp + 24], %o3
-+
-+ ldx [$inp + 32], %o4
-+ sllx %o0, $ileft, %o0
-+ srlx %o1, $iright, %g1
-+ or %g1, %o0, %o0
-+ sllx %o1, $ileft, %o1
-+ srlx %o2, $iright, %g1
-+ or %g1, %o1, %o1
-+ sllx %o2, $ileft, %o2
-+ srlx %o3, $iright, %g1
-+ or %g1, %o2, %o2
-+ sllx %o3, $ileft, %o3
-+ srlx %o4, $iright, %o4
-+ or %o4, %o3, %o3
-+4:
-+ xor %g5, %l7, %g1 ! ^= rk[0]
-+ add %l7, 1, %l7
-+ movxtod %g1, %f2
-+ srl %l7, 0, %l7 ! clruw
-+ xor %g5, %l7, %g1
-+ add %l7, 1, %l7
-+ movxtod %g1, %f6
-+ srl %l7, 0, %l7 ! clruw
-+ prefetch [$out + 63], 22
-+ prefetch [$inp + 32+63], 20
-+___
-+$::code.=<<___ if ($alg eq "aes");
-+ aes_eround01 %f16, %f14, %f2, %f8
-+ aes_eround23 %f18, %f14, %f2, %f2
-+ aes_eround01 %f16, %f14, %f6, %f10
-+ aes_eround23 %f18, %f14, %f6, %f6
-+___
-+$::code.=<<___ if ($alg eq "cmll");
-+ camellia_f %f16, %f2, %f14, %f2
-+ camellia_f %f16, %f6, %f14, %f6
-+ camellia_f %f18, %f14, %f2, %f0
-+ camellia_f %f18, %f14, %f6, %f4
-+___
-+$::code.=<<___;
-+ call _${alg}${bits}_encrypt_2x+16
-+ add $inp, 32, $inp
-+
-+ movxtod %o0, %f8
-+ movxtod %o1, %f10
-+ movxtod %o2, %f12
-+ fxor %f8, %f0, %f0 ! ^= inp
-+ movxtod %o3, %f8
-+ fxor %f10, %f2, %f2
-+ fxor %f12, %f4, %f4
-+ fxor %f8, %f6, %f6
-+
-+ brnz,pn $ooff, 2f
-+ sub $len, 2, $len
-+
-+ std %f0, [$out + 0]
-+ std %f2, [$out + 8]
-+ std %f4, [$out + 16]
-+ std %f6, [$out + 24]
-+ brnz,pt $len, .L${bits}_ctr32_loop2x
-+ add $out, 32, $out
-+
-+ ret
-+ restore
-+
-+.align 16
-+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
-+ ! and ~3x deterioration
-+ ! in inp==out case
-+ faligndata %f0, %f0, %f8 ! handle unaligned output
-+ faligndata %f0, %f2, %f0
-+ faligndata %f2, %f4, %f2
-+ faligndata %f4, %f6, %f4
-+ faligndata %f6, %f6, %f6
-+
-+ stda %f8, [$out + $omask]0xc0 ! partial store
-+ std %f0, [$out + 8]
-+ std %f2, [$out + 16]
-+ std %f4, [$out + 24]
-+ add $out, 32, $out
-+ orn %g0, $omask, $omask
-+ stda %f6, [$out + $omask]0xc0 ! partial store
-+
-+ brnz,pt $len, .L${bits}_ctr32_loop2x+4
-+ orn %g0, $omask, $omask
-+
-+ ret
-+ restore
-+
-+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+.align 32
-+.L${bits}_ctr32_blk:
-+ add $out, $len, $blk_init
-+ and $blk_init, 63, $blk_init ! tail
-+ sub $len, $blk_init, $len
-+ add $blk_init, 15, $blk_init ! round up to 16n
-+ srlx $len, 4, $len
-+ srl $blk_init, 4, $blk_init
-+ sub $len, 1, $len
-+ add $blk_init, 1, $blk_init
-+
-+.L${bits}_ctr32_blk_loop2x:
-+ ldx [$inp + 0], %o0
-+ ldx [$inp + 8], %o1
-+ ldx [$inp + 16], %o2
-+ brz,pt $ileft, 5f
-+ ldx [$inp + 24], %o3
-+
-+ ldx [$inp + 32], %o4
-+ sllx %o0, $ileft, %o0
-+ srlx %o1, $iright, %g1
-+ or %g1, %o0, %o0
-+ sllx %o1, $ileft, %o1
-+ srlx %o2, $iright, %g1
-+ or %g1, %o1, %o1
-+ sllx %o2, $ileft, %o2
-+ srlx %o3, $iright, %g1
-+ or %g1, %o2, %o2
-+ sllx %o3, $ileft, %o3
-+ srlx %o4, $iright, %o4
-+ or %o4, %o3, %o3
-+5:
-+ xor %g5, %l7, %g1 ! ^= rk[0]
-+ add %l7, 1, %l7
-+ movxtod %g1, %f2
-+ srl %l7, 0, %l7 ! clruw
-+ xor %g5, %l7, %g1
-+ add %l7, 1, %l7
-+ movxtod %g1, %f6
-+ srl %l7, 0, %l7 ! clruw
-+ prefetch [$inp + 32+63], 20
-+___
-+$::code.=<<___ if ($alg eq "aes");
-+ aes_eround01 %f16, %f14, %f2, %f8
-+ aes_eround23 %f18, %f14, %f2, %f2
-+ aes_eround01 %f16, %f14, %f6, %f10
-+ aes_eround23 %f18, %f14, %f6, %f6
-+___
-+$::code.=<<___ if ($alg eq "cmll");
-+ camellia_f %f16, %f2, %f14, %f2
-+ camellia_f %f16, %f6, %f14, %f6
-+ camellia_f %f18, %f14, %f2, %f0
-+ camellia_f %f18, %f14, %f6, %f4
-+___
-+$::code.=<<___;
-+ call _${alg}${bits}_encrypt_2x+16
-+ add $inp, 32, $inp
-+ subcc $len, 2, $len
-+
-+ movxtod %o0, %f8
-+ movxtod %o1, %f10
-+ movxtod %o2, %f12
-+ fxor %f8, %f0, %f0 ! ^= inp
-+ movxtod %o3, %f8
-+ fxor %f10, %f2, %f2
-+ fxor %f12, %f4, %f4
-+ fxor %f8, %f6, %f6
-+
-+ stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ add $out, 8, $out
-+ stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ add $out, 8, $out
-+ stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ add $out, 8, $out
-+ stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ bgu,pt $::size_t_cc, .L${bits}_ctr32_blk_loop2x
-+ add $out, 8, $out
-+
-+ add $blk_init, $len, $len
-+ andcc $len, 1, %g0 ! is number of blocks even?
-+ membar #StoreLoad|#StoreStore
-+ bnz,pt %icc, .L${bits}_ctr32_loop
-+ srl $len, 0, $len
-+ brnz,pn $len, .L${bits}_ctr32_loop2x
-+ nop
-+
-+ ret
-+ restore
-+.type ${alg}${bits}_t4_ctr32_encrypt,#function
-+.size ${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
-+___
-+}
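The "srl %l7, 0, %l7 ! clruw" after each increment confines the counter to the low 32 bits of the IV; in scalar form:

    my $ctr = 0xffffffff;
    $ctr = ($ctr + 1) & 0xffffffff;   # add %l7, 1, %l7 ; srl %l7, 0, %l7
    # $ctr is now 0; the upper 96 bits of the IV are never carried into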
-+
-+sub alg_xts_implement {
-+my ($alg,$bits,$dir) = @_;
-+my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
-+my $rem=$ivec;
-+
-+$::code.=<<___;
-+.globl ${alg}${bits}_t4_xts_${dir}crypt
-+.align 32
-+${alg}${bits}_t4_xts_${dir}crypt:
-+ save %sp, -$::frame-16, %sp
-+
-+ mov $ivec, %o0
-+ add %fp, $::bias-16, %o1
-+ call ${alg}_t4_encrypt
-+ mov $key2, %o2
-+
-+ add %fp, $::bias-16, %l7
-+ ldxa [%l7]0x88, %g2
-+ add %fp, $::bias-8, %l7
-+ ldxa [%l7]0x88, %g3 ! %g3:%g2 is tweak
-+
-+ sethi %hi(0x76543210), %l7
-+ or %l7, %lo(0x76543210), %l7
-+ bmask %l7, %g0, %g0 ! byte swap mask
-+
-+ prefetch [$inp], 20
-+ prefetch [$inp + 63], 20
-+ call _${alg}${bits}_load_${dir}ckey
-+ and $len, 15, $rem
-+ and $len, -16, $len
-+___
-+$code.=<<___ if ($dir eq "de");
-+ mov 0, %l7
-+ movrnz $rem, 16, %l7
-+ sub $len, %l7, $len
-+___
-+$code.=<<___;
-+
-+ sub $inp, $out, $blk_init ! $inp!=$out
-+ and $inp, 7, $ileft
-+ andn $inp, 7, $inp
-+ sll $ileft, 3, $ileft
-+ mov 64, $iright
-+ mov 0xff, $omask
-+ sub $iright, $ileft, $iright
-+ and $out, 7, $ooff
-+ cmp $len, 255
-+ movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
-+ movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
-+ brnz,pn $blk_init, .L${bits}_xts_${dir}blk ! $inp==$out)
-+ srl $omask, $ooff, $omask
-+
-+ andcc $len, 16, %g0 ! is number of blocks even?
-+___
-+$code.=<<___ if ($dir eq "de");
-+ brz,pn $len, .L${bits}_xts_${dir}steal
-+___
-+$code.=<<___;
-+ alignaddrl $out, %g0, $out
-+ bz %icc, .L${bits}_xts_${dir}loop2x
-+ srlx $len, 4, $len
-+.L${bits}_xts_${dir}loop:
-+ ldx [$inp + 0], %o0
-+ brz,pt $ileft, 4f
-+ ldx [$inp + 8], %o1
-+
-+ ldx [$inp + 16], %o2
-+ sllx %o0, $ileft, %o0
-+ srlx %o1, $iright, %g1
-+ sllx %o1, $ileft, %o1
-+ or %g1, %o0, %o0
-+ srlx %o2, $iright, %o2
-+ or %o2, %o1, %o1
-+4:
-+ movxtod %g2, %f12
-+ movxtod %g3, %f14
-+ bshuffle %f12, %f12, %f12
-+ bshuffle %f14, %f14, %f14
-+
-+ xor %g4, %o0, %o0 ! ^= rk[0]
-+ xor %g5, %o1, %o1
-+ movxtod %o0, %f0
-+ movxtod %o1, %f2
-+
-+ fxor %f12, %f0, %f0 ! ^= tweak[0]
-+ fxor %f14, %f2, %f2
-+
-+ prefetch [$out + 63], 22
-+ prefetch [$inp + 16+63], 20
-+ call _${alg}${bits}_${dir}crypt_1x
-+ add $inp, 16, $inp
-+
-+ fxor %f12, %f0, %f0 ! ^= tweak[0]
-+ fxor %f14, %f2, %f2
-+
-+ srax %g3, 63, %l7 ! next tweak value
-+ addcc %g2, %g2, %g2
-+ and %l7, 0x87, %l7
-+ addxc %g3, %g3, %g3
-+ xor %l7, %g2, %g2
-+
-+ brnz,pn $ooff, 2f
-+ sub $len, 1, $len
-+
-+ std %f0, [$out + 0]
-+ std %f2, [$out + 8]
-+ brnz,pt $len, .L${bits}_xts_${dir}loop2x
-+ add $out, 16, $out
-+
-+ brnz,pn $rem, .L${bits}_xts_${dir}steal
-+ nop
-+
-+ ret
-+ restore
-+
-+.align 16
-+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
-+ ! and ~3x deterioration
-+ ! in inp==out case
-+ faligndata %f0, %f0, %f4 ! handle unaligned output
-+ faligndata %f0, %f2, %f6
-+ faligndata %f2, %f2, %f8
-+ stda %f4, [$out + $omask]0xc0 ! partial store
-+ std %f6, [$out + 8]
-+ add $out, 16, $out
-+ orn %g0, $omask, $omask
-+ stda %f8, [$out + $omask]0xc0 ! partial store
-+
-+ brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
-+ orn %g0, $omask, $omask
-+
-+ brnz,pn $rem, .L${bits}_xts_${dir}steal
-+ nop
-+
-+ ret
-+ restore
-+
-+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+.align 32
-+.L${bits}_xts_${dir}loop2x:
-+ ldx [$inp + 0], %o0
-+ ldx [$inp + 8], %o1
-+ ldx [$inp + 16], %o2
-+ brz,pt $ileft, 4f
-+ ldx [$inp + 24], %o3
-+
-+ ldx [$inp + 32], %o4
-+ sllx %o0, $ileft, %o0
-+ srlx %o1, $iright, %g1
-+ or %g1, %o0, %o0
-+ sllx %o1, $ileft, %o1
-+ srlx %o2, $iright, %g1
-+ or %g1, %o1, %o1
-+ sllx %o2, $ileft, %o2
-+ srlx %o3, $iright, %g1
-+ or %g1, %o2, %o2
-+ sllx %o3, $ileft, %o3
-+ srlx %o4, $iright, %o4
-+ or %o4, %o3, %o3
-+4:
-+ movxtod %g2, %f12
-+ movxtod %g3, %f14
-+ bshuffle %f12, %f12, %f12
-+ bshuffle %f14, %f14, %f14
-+
-+ srax %g3, 63, %l7 ! next tweak value
-+ addcc %g2, %g2, %g2
-+ and %l7, 0x87, %l7
-+ addxc %g3, %g3, %g3
-+ xor %l7, %g2, %g2
-+
-+ movxtod %g2, %f8
-+ movxtod %g3, %f10
-+ bshuffle %f8, %f8, %f8
-+ bshuffle %f10, %f10, %f10
-+
-+ xor %g4, %o0, %o0 ! ^= rk[0]
-+ xor %g5, %o1, %o1
-+ xor %g4, %o2, %o2 ! ^= rk[0]
-+ xor %g5, %o3, %o3
-+ movxtod %o0, %f0
-+ movxtod %o1, %f2
-+ movxtod %o2, %f4
-+ movxtod %o3, %f6
-+
-+ fxor %f12, %f0, %f0 ! ^= tweak[0]
-+ fxor %f14, %f2, %f2
-+ fxor %f8, %f4, %f4 ! ^= tweak[0]
-+ fxor %f10, %f6, %f6
-+
-+ prefetch [$out + 63], 22
-+ prefetch [$inp + 32+63], 20
-+ call _${alg}${bits}_${dir}crypt_2x
-+ add $inp, 32, $inp
-+
-+ movxtod %g2, %f8
-+ movxtod %g3, %f10
-+
-+ srax %g3, 63, %l7 ! next tweak value
-+ addcc %g2, %g2, %g2
-+ and %l7, 0x87, %l7
-+ addxc %g3, %g3, %g3
-+ xor %l7, %g2, %g2
-+
-+ bshuffle %f8, %f8, %f8
-+ bshuffle %f10, %f10, %f10
-+
-+ fxor %f12, %f0, %f0 ! ^= tweak[0]
-+ fxor %f14, %f2, %f2
-+ fxor %f8, %f4, %f4
-+ fxor %f10, %f6, %f6
-+
-+ brnz,pn $ooff, 2f
-+ sub $len, 2, $len
-+
-+ std %f0, [$out + 0]
-+ std %f2, [$out + 8]
-+ std %f4, [$out + 16]
-+ std %f6, [$out + 24]
-+ brnz,pt $len, .L${bits}_xts_${dir}loop2x
-+ add $out, 32, $out
-+
-+ fsrc2 %f4, %f0
-+ fsrc2 %f6, %f2
-+ brnz,pn $rem, .L${bits}_xts_${dir}steal
-+ nop
-+
-+ ret
-+ restore
-+
-+.align 16
-+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
-+ ! and ~3x deterioration
-+ ! in inp==out case
-+ faligndata %f0, %f0, %f8 ! handle unaligned output
-+ faligndata %f0, %f2, %f10
-+ faligndata %f2, %f4, %f12
-+ faligndata %f4, %f6, %f14
-+ faligndata %f6, %f6, %f0
-+
-+ stda %f8, [$out + $omask]0xc0 ! partial store
-+ std %f10, [$out + 8]
-+ std %f12, [$out + 16]
-+ std %f14, [$out + 24]
-+ add $out, 32, $out
-+ orn %g0, $omask, $omask
-+ stda %f0, [$out + $omask]0xc0 ! partial store
-+
-+ brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
-+ orn %g0, $omask, $omask
-+
-+ fsrc2 %f4, %f0
-+ fsrc2 %f6, %f2
-+ brnz,pn $rem, .L${bits}_xts_${dir}steal
-+ nop
-+
-+ ret
-+ restore
-+
-+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+.align 32
-+.L${bits}_xts_${dir}blk:
-+ add $out, $len, $blk_init
-+ and $blk_init, 63, $blk_init ! tail
-+ sub $len, $blk_init, $len
-+ add $blk_init, 15, $blk_init ! round up to 16n
-+ srlx $len, 4, $len
-+ srl $blk_init, 4, $blk_init
-+ sub $len, 1, $len
-+ add $blk_init, 1, $blk_init
-+
-+.L${bits}_xts_${dir}blk2x:
-+ ldx [$inp + 0], %o0
-+ ldx [$inp + 8], %o1
-+ ldx [$inp + 16], %o2
-+ brz,pt $ileft, 5f
-+ ldx [$inp + 24], %o3
-+
-+ ldx [$inp + 32], %o4
-+ sllx %o0, $ileft, %o0
-+ srlx %o1, $iright, %g1
-+ or %g1, %o0, %o0
-+ sllx %o1, $ileft, %o1
-+ srlx %o2, $iright, %g1
-+ or %g1, %o1, %o1
-+ sllx %o2, $ileft, %o2
-+ srlx %o3, $iright, %g1
-+ or %g1, %o2, %o2
-+ sllx %o3, $ileft, %o3
-+ srlx %o4, $iright, %o4
-+ or %o4, %o3, %o3
-+5:
-+ movxtod %g2, %f12
-+ movxtod %g3, %f14
-+ bshuffle %f12, %f12, %f12
-+ bshuffle %f14, %f14, %f14
-+
-+ srax %g3, 63, %l7 ! next tweak value
-+ addcc %g2, %g2, %g2
-+ and %l7, 0x87, %l7
-+ addxc %g3, %g3, %g3
-+ xor %l7, %g2, %g2
-+
-+ movxtod %g2, %f8
-+ movxtod %g3, %f10
-+ bshuffle %f8, %f8, %f8
-+ bshuffle %f10, %f10, %f10
-+
-+ xor %g4, %o0, %o0 ! ^= rk[0]
-+ xor %g5, %o1, %o1
-+ xor %g4, %o2, %o2 ! ^= rk[0]
-+ xor %g5, %o3, %o3
-+ movxtod %o0, %f0
-+ movxtod %o1, %f2
-+ movxtod %o2, %f4
-+ movxtod %o3, %f6
-+
-+ fxor %f12, %f0, %f0 ! ^= tweak[0]
-+ fxor %f14, %f2, %f2
-+ fxor %f8, %f4, %f4 ! ^= tweak[0]
-+ fxor %f10, %f6, %f6
-+
-+ prefetch [$inp + 32+63], 20
-+ call _${alg}${bits}_${dir}crypt_2x
-+ add $inp, 32, $inp
-+
-+ movxtod %g2, %f8
-+ movxtod %g3, %f10
-+
-+ srax %g3, 63, %l7 ! next tweak value
-+ addcc %g2, %g2, %g2
-+ and %l7, 0x87, %l7
-+ addxc %g3, %g3, %g3
-+ xor %l7, %g2, %g2
-+
-+ bshuffle %f8, %f8, %f8
-+ bshuffle %f10, %f10, %f10
-+
-+ fxor %f12, %f0, %f0 ! ^= tweak[0]
-+ fxor %f14, %f2, %f2
-+ fxor %f8, %f4, %f4
-+ fxor %f10, %f6, %f6
-+
-+ stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ add $out, 8, $out
-+ stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ add $out, 8, $out
-+ stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ add $out, 8, $out
-+ stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ bgu,pt $::size_t_cc, .L${bits}_xts_${dir}blk2x
-+ add $out, 8, $out
-+
-+ add $blk_init, $len, $len
-+ andcc $len, 1, %g0 ! is number of blocks even?
-+ membar #StoreLoad|#StoreStore
-+ bnz,pt %icc, .L${bits}_xts_${dir}loop
-+ srl $len, 0, $len
-+ brnz,pn $len, .L${bits}_xts_${dir}loop2x
-+ nop
-+
-+ fsrc2 %f4, %f0
-+ fsrc2 %f6, %f2
-+ brnz,pn $rem, .L${bits}_xts_${dir}steal
-+ nop
-+
-+ ret
-+ restore
-+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+___
-+$code.=<<___ if ($dir eq "en");
-+.align 32
-+.L${bits}_xts_${dir}steal:
-+ std %f0, [%fp + $::bias-16] ! copy of output
-+ std %f2, [%fp + $::bias-8]
-+
-+ srl $ileft, 3, $ileft
-+ add %fp, $::bias-16, %l7
-+ add $inp, $ileft, $inp ! original $inp+$len&-15
-+ add $out, $ooff, $out ! original $out+$len&-15
-+ mov 0, $ileft
-+ nop ! align
-+
-+.L${bits}_xts_${dir}stealing:
-+ ldub [$inp + $ileft], %o0
-+ ldub [%l7 + $ileft], %o1
-+ dec $rem
-+ stb %o0, [%l7 + $ileft]
-+ stb %o1, [$out + $ileft]
-+ brnz $rem, .L${bits}_xts_${dir}stealing
-+ inc $ileft
-+
-+ mov %l7, $inp
-+ sub $out, 16, $out
-+ mov 0, $ileft
-+ sub $out, $ooff, $out
-+ ba .L${bits}_xts_${dir}loop ! one more time
-+ mov 1, $len ! $rem is 0
-+___
-+$code.=<<___ if ($dir eq "de");
-+.align 32
-+.L${bits}_xts_${dir}steal:
-+ ldx [$inp + 0], %o0
-+ brz,pt $ileft, 8f
-+ ldx [$inp + 8], %o1
-+
-+ ldx [$inp + 16], %o2
-+ sllx %o0, $ileft, %o0
-+ srlx %o1, $iright, %g1
-+ sllx %o1, $ileft, %o1
-+ or %g1, %o0, %o0
-+ srlx %o2, $iright, %o2
-+ or %o2, %o1, %o1
-+8:
-+ srax %g3, 63, %l7 ! next tweak value
-+ addcc %g2, %g2, %o2
-+ and %l7, 0x87, %l7
-+ addxc %g3, %g3, %o3
-+ xor %l7, %o2, %o2
-+
-+ movxtod %o2, %f12
-+ movxtod %o3, %f14
-+ bshuffle %f12, %f12, %f12
-+ bshuffle %f14, %f14, %f14
-+
-+ xor %g4, %o0, %o0 ! ^= rk[0]
-+ xor %g5, %o1, %o1
-+ movxtod %o0, %f0
-+ movxtod %o1, %f2
-+
-+ fxor %f12, %f0, %f0 ! ^= tweak[0]
-+ fxor %f14, %f2, %f2
-+
-+ call _${alg}${bits}_${dir}crypt_1x
-+ add $inp, 16, $inp
-+
-+ fxor %f12, %f0, %f0 ! ^= tweak[0]
-+ fxor %f14, %f2, %f2
-+
-+ std %f0, [%fp + $::bias-16]
-+ std %f2, [%fp + $::bias-8]
-+
-+ srl $ileft, 3, $ileft
-+ add %fp, $::bias-16, %l7
-+ add $inp, $ileft, $inp ! original $inp+$len&-15
-+ add $out, $ooff, $out ! original $out+$len&-15
-+ mov 0, $ileft
-+ add $out, 16, $out
-+ nop ! align
-+
-+.L${bits}_xts_${dir}stealing:
-+ ldub [$inp + $ileft], %o0
-+ ldub [%l7 + $ileft], %o1
-+ dec $rem
-+ stb %o0, [%l7 + $ileft]
-+ stb %o1, [$out + $ileft]
-+ brnz $rem, .L${bits}_xts_${dir}stealing
-+ inc $ileft
-+
-+ mov %l7, $inp
-+ sub $out, 16, $out
-+ mov 0, $ileft
-+ sub $out, $ooff, $out
-+ ba .L${bits}_xts_${dir}loop ! one more time
-+ mov 1, $len ! $rem is 0
-+___
-+$code.=<<___;
-+ ret
-+ restore
-+.type ${alg}${bits}_t4_xts_${dir}crypt,#function
-+.size ${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
-+___
-+}
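The srax/addcc/addxc/xor sequence that computes the next tweak value is multiplication by x in GF(2^128) with the XTS feedback constant 0x87. A scalar sketch over two 64-bit halves, %g2 low and %g3 high (needs a 64-bit perl):

    sub xts_next_tweak {
        my ($lo, $hi) = @_;                               # %g2, %g3
        my $fb    = (($hi >> 63) & 1) * 0x87;             # srax %g3, 63 ; and ..., 0x87
        my $carry = ($lo >> 63) & 1;                      # addcc %g2, %g2, %g2 (carry out)
        $lo = (($lo << 1) & 0xffffffffffffffff) ^ $fb;    # xor %l7, %g2, %g2
        $hi = (($hi << 1) | $carry) & 0xffffffffffffffff; # addxc %g3, %g3, %g3
        return ($lo, $hi);
    }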
-+
-+# The purpose of these subroutines is to explicitly encode VIS
-+# instructions, so that the module can be compiled without having to
-+# specify VIS extensions on the compiler command line, e.g. -xarch=v9
-+# vs. -xarch=v9a. The idea is to keep open the option of producing a
-+# "universal" binary and let the programmer detect at run-time whether
-+# the current CPU is VIS-capable.
-+sub unvis {
-+my ($mnemonic,$rs1,$rs2,$rd)=@_;
-+my ($ref,$opf);
-+my %visopf = ( "faligndata" => 0x048,
-+ "bshuffle" => 0x04c,
-+ "fnot2" => 0x066,
-+ "fxor" => 0x06c,
-+ "fsrc2" => 0x078 );
-+
-+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
-+
-+ if ($opf=$visopf{$mnemonic}) {
-+ foreach ($rs1,$rs2,$rd) {
-+ return $ref if (!/%f([0-9]{1,2})/);
-+ $_=$1;
-+ if ($1>=32) {
-+ return $ref if ($1&1);
-+ # re-encode for upper double register addressing
-+ $_=($1|$1>>5)&31;
-+ }
-+ }
-+
-+ return sprintf ".word\t0x%08x !%s",
-+ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
-+ $ref;
-+ } else {
-+ return $ref;
-+ }
-+}
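A worked example, computed from the encoding above (fxor has opf 0x06c; rs1=0, rs2=2, rd=4):

    print unvis("fxor", "%f0", "%f2", "%f4"), "\n";
    # .word 0x89b00d82 !fxor %f0,%f2,%f4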
-+
-+sub unvis3 {
-+my ($mnemonic,$rs1,$rs2,$rd)=@_;
-+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
-+my ($ref,$opf);
-+my %visopf = ( "addxc" => 0x011,
-+ "addxccc" => 0x013,
-+ "umulxhi" => 0x016,
-+ "alignaddr" => 0x018,
-+ "bmask" => 0x019,
-+ "alignaddrl" => 0x01a );
-+
-+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
-+
-+ if ($opf=$visopf{$mnemonic}) {
-+ foreach ($rs1,$rs2,$rd) {
-+ return $ref if (!/%([goli])([0-9])/);
-+ $_=$bias{$1}+$2;
-+ }
-+
-+ return sprintf ".word\t0x%08x !%s",
-+ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
-+ $ref;
-+ } else {
-+ return $ref;
-+ }
-+}
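And for the VIS3 helper, with integer registers mapped through %bias (e.g. %g2 -> 2; addxc has opf 0x011):

    print unvis3("addxc", "%g2", "%g2", "%g2"), "\n";
    # .word 0x85b08222 !addxc %g2,%g2,%g2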
-+
-+sub unaes_round { # 4-argument instructions
-+my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
-+my ($ref,$opf);
-+my %aesopf = ( "aes_eround01" => 0,
-+ "aes_eround23" => 1,
-+ "aes_dround01" => 2,
-+ "aes_dround23" => 3,
-+ "aes_eround01_l"=> 4,
-+ "aes_eround23_l"=> 5,
-+ "aes_dround01_l"=> 6,
-+ "aes_dround23_l"=> 7,
-+ "aes_kexpand1" => 8 );
-+
-+ $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
-+
-+ if (defined($opf=$aesopf{$mnemonic})) {
-+ $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
-+ foreach ($rs1,$rs2,$rd) {
-+ return $ref if (!/%f([0-9]{1,2})/);
-+ $_=$1;
-+ if ($1>=32) {
-+ return $ref if ($1&1);
-+ # re-encode for upper double register addressing
-+ $_=($1|$1>>5)&31;
-+ }
-+ }
-+
-+ return sprintf ".word\t0x%08x !%s",
-+ 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
-+ $ref;
-+ } else {
-+ return $ref;
-+ }
-+}
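Likewise for the 4-argument AES rounds, again computed from the encoding above (aes_eround01 has opf 0; rs3 must be an even register):

    print unaes_round("aes_eround01", "%f16", "%f14", "%f2", "%f4"), "\n";
    # .word 0x88cc040e !aes_eround01 %f16,%f14,%f2,%f4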
-+
-+sub unaes_kexpand { # 3-argument instructions
-+my ($mnemonic,$rs1,$rs2,$rd)=@_;
-+my ($ref,$opf);
-+my %aesopf = ( "aes_kexpand0" => 0x130,
-+ "aes_kexpand2" => 0x131 );
-+
-+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
-+
-+ if (defined($opf=$aesopf{$mnemonic})) {
-+ foreach ($rs1,$rs2,$rd) {
-+ return $ref if (!/%f([0-9]{1,2})/);
-+ $_=$1;
-+ if ($1>=32) {
-+ return $ref if ($1&1);
-+ # re-encode for upper double register addressing
-+ $_=($1|$1>>5)&31;
-+ }
-+ }
-+
-+ return sprintf ".word\t0x%08x !%s",
-+ 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
-+ $ref;
-+ } else {
-+ return $ref;
-+ }
-+}
-+
-+sub uncamellia_f { # 4-argument instructions
-+my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
-+my ($ref,$opf);
-+
-+ $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
-+
-+ if (1) {
-+ $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
-+ foreach ($rs1,$rs2,$rd) {
-+ return $ref if (!/%f([0-9]{1,2})/);
-+ $_=$1;
@@ Diff output truncated at 100000 characters. @@