SF.net SVN: gar:[25050] csw/mgar/pkg/openssl1/trunk/files
janholzh at users.sourceforge.net
Tue Jun 2 09:41:27 CEST 2015
Revision: 25050
http://sourceforge.net/p/gar/code/25050
Author: janholzh
Date: 2015-06-02 07:41:26 +0000 (Tue, 02 Jun 2015)
Log Message:
-----------
openssl1/trunk: update more patches
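
The substantive change is in the sparcv9cap.c hunks below: the updated
t4-engine patch drops the old sigsetjmp/signal-trap capability probes
(_sparcv9_vis1_probe and friends) in favour of querying the Solaris
getisax(2) aux vector. A minimal sketch of that detection style, assuming
the Solaris AV_SPARC_* constants and the SPARCV9_* bit values from the
sparc_arch.h hunk further down — illustrative only, not part of the patch:

    #include <sys/types.h>
    #include <sys/auxv.h>

    /* Hypothetical stand-in for OPENSSL_sparcv9cap_P[2]; bit values
     * taken from the sparc_arch.h hunk in this patch. */
    static unsigned int cap[2];

    #define SPARCV9_VIS1 (1 << 2)
    #define SPARCV9_VIS2 (1 << 3)
    #define SPARCV9_VIS3 (1 << 6)

    void probe_sparc_caps(void)
    {
        uint_t ui = 0;

        (void) getisax(&ui, 1);   /* first word of AV_SPARC_* bits */

        /* Simplified: the real hunk also times _sparcv9_vis1_instrument()
         * before trusting VIS1, and only sets VIS2 beneath VIS1. */
        if (ui & AV_SPARC_VIS)
            cap[0] |= SPARCV9_VIS1;
        if (ui & AV_SPARC_VIS2)
            cap[0] |= SPARCV9_VIS2;
        if (ui & AV_SPARC_VIS3)
            cap[0] |= SPARCV9_VIS3;
    }

The actual hunk goes further and gates the _sparcv9_rdcfr() read of the
T4 compatibility feature register behind both SPARCV9_VIS3 and the
AV_T4_MECHS mask, as shown in the diff below.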
Modified Paths:
--------------
csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-t4-engine.sparc.5.11.patch
csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-wanboot.patch
Modified: csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-t4-engine.sparc.5.11.patch
===================================================================
--- csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-t4-engine.sparc.5.11.patch 2015-06-02 06:06:13 UTC (rev 25049)
+++ csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-t4-engine.sparc.5.11.patch 2015-06-02 07:41:26 UTC (rev 25050)
@@ -13,9 +13,9 @@
my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void";
-my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
+my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o:des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
+ my $fips_sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void";
my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void";
- my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::";
Index: crypto/sparccpuid.S
===================================================================
diff -ru openssl-1.0.1e/crypto/sparccpuid.S openssl-1.0.1e/crypto/sparccpuid.S
@@ -29,20 +29,7 @@
#if defined(__SUNPRO_C) && defined(__sparcv9)
# define ABI64 /* They've said -xarch=v9 at command line */
#elif defined(__GNUC__) && defined(__arch64__)
-@@ -235,10 +239,10 @@
- .global _sparcv9_vis1_probe
- .align 8
- _sparcv9_vis1_probe:
-+ .word 0x81b00d80 !fxor %f0,%f0,%f0
- add %sp,BIAS+2,%o1
-- .word 0xc19a5a40 !ldda [%o1]ASI_FP16_P,%f0
- retl
-- .word 0x81b00d80 !fxor %f0,%f0,%f0
-+ .word 0xc19a5a40 !ldda [%o1]ASI_FP16_P,%f0
- .type _sparcv9_vis1_probe,#function
- .size _sparcv9_vis1_probe,.-_sparcv9_vis1_probe
-
-@@ -251,7 +255,12 @@
+@@ -241,7 +245,12 @@
! UltraSPARC IIe 7
! UltraSPARC III 7
! UltraSPARC T1 24
@@ -55,7 +42,7 @@
! Numbers for T2 and SPARC64 V-VII are more than welcomed.
!
! It would be possible to detect specifically US-T1 by instrumenting
-@@ -260,6 +269,8 @@
+@@ -250,6 +259,8 @@
.global _sparcv9_vis1_instrument
.align 8
_sparcv9_vis1_instrument:
@@ -64,9 +51,9 @@
.word 0x91410000 !rd %tick,%o0
.word 0x81b00d80 !fxor %f0,%f0,%f0
.word 0x85b08d82 !fxor %f2,%f2,%f2
-@@ -314,6 +325,30 @@
- .type _sparcv9_fmadd_probe,#function
- .size _sparcv9_fmadd_probe,.-_sparcv9_fmadd_probe
+@@ -286,6 +297,30 @@
+ .type _sparcv9_vis1_instrument,#function
+ .size _sparcv9_vis1_instrument,.-_sparcv9_vis1_instrument
+.global _sparcv9_rdcfr
+.align 8
@@ -95,7 +82,7 @@
.global OPENSSL_cleanse
.align 32
OPENSSL_cleanse:
-@@ -398,6 +433,102 @@
+@@ -370,6 +405,102 @@
.size OPENSSL_cleanse,.-OPENSSL_cleanse
#ifndef _BOOT
@@ -203,18 +190,20 @@
diff -ru openssl-1.0.1e/crypto/sparcv9cap.c openssl-1.0.1e/crypto/sparcv9cap.c
--- openssl-1.0.1e/crypto/sparcv9cap.c 2011-05-24 17:02:24.000000000 -0700
+++ openssl-1.0.1e/crypto/sparcv9cap.c 2011-07-27 10:48:17.817470000 -0700
-@@ -4,34 +4,58 @@
+@@ -3,36 +3,59 @@
+ #include <string.h>
#include <setjmp.h>
- #include <signal.h>
#include <sys/time.h>
+#include <unistd.h>
#include <openssl/bn.h>
+ #include <sys/auxv.h>
-#define SPARCV9_TICK_PRIVILEGED (1<<0)
-#define SPARCV9_PREFER_FPU (1<<1)
-#define SPARCV9_VIS1 (1<<2)
-#define SPARCV9_VIS2 (1<<3) /* reserved */
-#define SPARCV9_FMADD (1<<4) /* reserved for SPARC64 V */
+-#define SPARCV9_BLK (1<<5)
+#include "sparc_arch.h"
+#if defined(__GNUC__) && defined(__linux)
@@ -275,13 +264,11 @@
}
unsigned long _sparcv9_rdtick(void);
-@@ -39,11 +63,18 @@
+@@ -37,11 +60,16 @@
+
+ unsigned long _sparcv9_rdtick(void);
unsigned long _sparcv9_vis1_instrument(void);
- void _sparcv9_vis2_probe(void);
- void _sparcv9_fmadd_probe(void);
+unsigned long _sparcv9_rdcfr(void);
-+void _sparcv9_vis3_probe(void);
-+unsigned long _sparcv9_random(void);
+#ifndef _BOOT
+size_t _sparcv9_vis1_instrument_bus(unsigned int *,size_t);
+size_t _sparcv9_vis1_instrument_bus2(unsigned int *,size_t,size_t);
@@ -295,7 +282,7 @@
#if defined(__sun) && defined(__SVR4)
return gethrtime();
#else
-@@ -52,6 +83,24 @@
+@@ -50,6 +80,24 @@
else
return _sparcv9_rdtick();
}
@@ -320,7 +307,7 @@
#endif
#if defined(_BOOT)
-@@ -61,7 +110,7 @@
+@@ -59,7 +107,7 @@
*/
void OPENSSL_cpuid_setup(void)
{
@@ -329,7 +316,7 @@
}
#elif 0 && defined(__sun) && defined(__SVR4)
-@@ -90,11 +139,11 @@
+@@ -88,11 +136,11 @@
if (!strcmp(name, "SUNW,UltraSPARC") ||
/* covers II,III,IV */
!strncmp(name, "SUNW,UltraSPARC-I", 17)) {
@@ -343,7 +330,7 @@
return DI_WALK_TERMINATE;
}
-@@ -100,7 +149,7 @@
+@@ -98,7 +146,7 @@
}
/* This is expected to catch remaining UltraSPARCs, such as T1 */
else if (!strncmp(name, "SUNW,UltraSPARC", 15)) {
@@ -352,7 +339,7 @@
return DI_WALK_TERMINATE;
}
-@@ -119,7 +168,7 @@
+@@ -117,7 +165,7 @@
trigger = 1;
if ((e = getenv("OPENSSL_sparcv9cap"))) {
@@ -361,7 +348,7 @@
return;
}
-@@ -126,15 +175,15 @@
+@@ -124,15 +172,15 @@
if (sysinfo(SI_MACHINE, si, sizeof(si)) > 0) {
if (strcmp(si, "sun4v"))
/* FPU is preferred for all CPUs, but US-T1/2 */
@@ -381,7 +368,7 @@
return;
}
}
-@@ -204,12 +253,14 @@
+@@ -195,7 +241,9 @@
trigger = 1;
if ((e = getenv("OPENSSL_sparcv9cap"))) {
@@ -392,73 +379,48 @@
return;
}
+@@ -202,21 +250,48 @@
+ (void) getisax(&ui, 1);
+
/* Initial value, fits UltraSPARC-I&II... */
-- OPENSSL_sparcv9cap_P = SPARCV9_PREFER_FPU | SPARCV9_TICK_PRIVILEGED;
-+ OPENSSL_sparcv9cap_P[0] = SPARCV9_PREFER_FPU | SPARCV9_TICK_PRIVILEGED;
+- OPENSSL_sparcv9cap_P = SPARCV9_BLK;
++ OPENSSL_sparcv9cap_P[0] = SPARCV9_BLK;
- sigfillset(&all_masked);
- sigdelset(&all_masked, SIGILL);
-@@ -232,18 +283,18 @@
-
- if (sigsetjmp(common_jmp, 1) == 0) {
- _sparcv9_rdtick();
-- OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED;
-+ OPENSSL_sparcv9cap_P[0] &= ~SPARCV9_TICK_PRIVILEGED;
+ if (ui & AV_SPARC_VIS) {
+- /* detect UltraSPARC-Tx, see sparccpuid.S for details... */
++ /* detect UltraSPARC-Tx, see sparccpud.S for details... */
+ if (_sparcv9_vis1_instrument() < 7)
+- OPENSSL_sparcv9cap_P |= SPARCV9_TICK_PRIVILEGED;
++ OPENSSL_sparcv9cap_P[0] |= SPARCV9_TICK_PRIVILEGED;
+ if (_sparcv9_vis1_instrument() < 12) {
+- OPENSSL_sparcv9cap_P |= SPARCV9_VIS1|SPARCV9_PREFER_FPU;
++ OPENSSL_sparcv9cap_P[0] |= (SPARCV9_VIS1 | SPARCV9_PREFER_FPU);
+ if (ui & AV_SPARC_VIS2)
+- OPENSSL_sparcv9cap_P |= SPARCV9_VIS2;
+- }
++ OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS2;
++ }
}
- if (sigsetjmp(common_jmp, 1) == 0) {
- _sparcv9_vis1_probe();
-- OPENSSL_sparcv9cap_P |= SPARCV9_VIS1;
-+ OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS1 | SPARCV9_BLK;
- /* detect UltraSPARC-Tx, see sparccpud.S for details... */
- if (_sparcv9_vis1_instrument() >= 12)
-- OPENSSL_sparcv9cap_P &= ~(SPARCV9_VIS1 | SPARCV9_PREFER_FPU);
-+ OPENSSL_sparcv9cap_P[0] &= ~(SPARCV9_VIS1 | SPARCV9_PREFER_FPU);
- else {
- _sparcv9_vis2_probe();
-- OPENSSL_sparcv9cap_P |= SPARCV9_VIS2;
-+ OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS2;
- }
- }
-
-@@ -249,13 +300,50 @@
-
- if (sigsetjmp(common_jmp, 1) == 0) {
- _sparcv9_fmadd_probe();
+ if (ui & AV_SPARC_FMAF)
- OPENSSL_sparcv9cap_P |= SPARCV9_FMADD;
+ OPENSSL_sparcv9cap_P[0] |= SPARCV9_FMADD;
- }
-
++
+ /*
+ * VIS3 flag is tested independently from VIS1, unlike VIS2 that is,
+ * because VIS3 defines even integer instructions.
+ */
-+ if (sigsetjmp(common_jmp,1) == 0) {
-+ _sparcv9_vis3_probe();
-+ OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS3;
-+ }
++ if (ui & AV_SPARC_VIS3)
++ OPENSSL_sparcv9cap_P[0] |= SPARCV9_VIS3;
+
-+ if (sigsetjmp(common_jmp,1) == 0) {
-+ (void)_sparcv9_random();
-+ OPENSSL_sparcv9cap_P[0] |= SPARCV9_RANDOM;
-+ }
++#define AV_T4_MECHS (AV_SPARC_AES | AV_SPARC_DES | AV_SPARC_KASUMI | \
++ AV_SPARC_CAMELLIA | AV_SPARC_MD5 | AV_SPARC_SHA1 | \
++ AV_SPARC_SHA256 | AV_SPARC_SHA512 | AV_SPARC_MPMUL | \
++ AV_SPARC_CRC32C)
+
-+ /*
-+ * In wait for better solution _sparcv9_rdcfr is masked by
-+ * VIS3 flag, because it goes to uninterruptable endless
-+ * loop on UltraSPARC II running Solaris. Things might be
-+ * different on Linux...
-+ */
-+ if ((OPENSSL_sparcv9cap_P[0]&SPARCV9_VIS3) &&
-+ sigsetjmp(common_jmp, 1) == 0) {
++ if ((OPENSSL_sparcv9cap_P[0]&SPARCV9_VIS3) && (ui & AV_T4_MECHS))
+ OPENSSL_sparcv9cap_P[1] = (unsigned int)_sparcv9_rdcfr();
-+ }
+
- sigaction(SIGBUS, &bus_oact, NULL);
- sigaction(SIGILL, &ill_oact, NULL);
-
- sigprocmask(SIG_SETMASK, &oset, NULL);
-+
+ if (sizeof(size_t) == 8)
+ OPENSSL_sparcv9cap_P[0] |= SPARCV9_64BIT_STACK;
+#ifdef __linux
@@ -2265,5563 +2227,3 @@
{ERR_FUNC(EVP_F_AES_XTS), "AES_XTS"},
{ERR_FUNC(EVP_F_AES_XTS_CIPHER), "AES_XTS_CIPHER"},
{ERR_FUNC(EVP_F_ALG_MODULE_INIT), "ALG_MODULE_INIT"},
-Index: crypto/sparc_arch.h
-===================================================================
-diff -uNr openssl-1.0.1m/crypto/sparc_arch.h openssl-1.0.1m/crypto/sparc_arch.h
---- openssl-1.0.1m/crypto/sparc_arch.h 1970-01-01 01:00:00.000000000 +0100
-+++ openssl-1.0.1m/crypto/sparc_arch.h 2015-03-21 16:27:38.578043100 +0100
-@@ -0,0 +1,101 @@
-+#ifndef __SPARC_ARCH_H__
-+#define __SPARC_ARCH_H__
-+
-+#define SPARCV9_TICK_PRIVILEGED (1<<0)
-+#define SPARCV9_PREFER_FPU (1<<1)
-+#define SPARCV9_VIS1 (1<<2)
-+#define SPARCV9_VIS2 (1<<3) /* reserved */
-+#define SPARCV9_FMADD (1<<4) /* reserved for SPARC64 V */
-+#define SPARCV9_BLK (1<<5) /* VIS1 block copy */
-+#define SPARCV9_VIS3 (1<<6)
-+#define SPARCV9_RANDOM (1<<7)
-+#define SPARCV9_64BIT_STACK (1<<8)
-+
-+/*
-+ * OPENSSL_sparcv9cap_P[1] is copy of Compatibility Feature Register,
-+ * %asr26, SPARC-T4 and later. There is no SPARCV9_CFR bit in
-+ * OPENSSL_sparcv9cap_P[0], as %cfr copy is sufficient...
-+ */
-+#define CFR_AES 0x00000001 /* Supports AES opcodes */
-+#define CFR_DES 0x00000002 /* Supports DES opcodes */
-+#define CFR_KASUMI 0x00000004 /* Supports KASUMI opcodes */
-+#define CFR_CAMELLIA 0x00000008 /* Supports CAMELLIA opcodes */
-+#define CFR_MD5 0x00000010 /* Supports MD5 opcodes */
-+#define CFR_SHA1 0x00000020 /* Supports SHA1 opcodes */
-+#define CFR_SHA256 0x00000040 /* Supports SHA256 opcodes */
-+#define CFR_SHA512 0x00000080 /* Supports SHA512 opcodes */
-+#define CFR_MPMUL 0x00000100 /* Supports MPMUL opcodes */
-+#define CFR_MONTMUL 0x00000200 /* Supports MONTMUL opcodes */
-+#define CFR_MONTSQR 0x00000400 /* Supports MONTSQR opcodes */
-+#define CFR_CRC32C 0x00000800 /* Supports CRC32C opcodes */
-+
-+#if defined(OPENSSL_PIC) && !defined(__PIC__)
-+#define __PIC__
-+#endif
-+
-+#if defined(__SUNPRO_C) && defined(__sparcv9) && !defined(__arch64__)
-+#define __arch64__
-+#endif
-+
-+#define SPARC_PIC_THUNK(reg) \
-+ .align 32; \
-+.Lpic_thunk: \
-+ jmp %o7 + 8; \
-+ add %o7, reg, reg;
-+
-+#define SPARC_PIC_THUNK_CALL(reg) \
-+ sethi %hi(_GLOBAL_OFFSET_TABLE_-4), reg; \
-+ call .Lpic_thunk; \
-+ or reg, %lo(_GLOBAL_OFFSET_TABLE_+4), reg;
-+
-+#if 1
-+#define SPARC_SETUP_GOT_REG(reg) SPARC_PIC_THUNK_CALL(reg)
-+#else
-+#define SPARC_SETUP_GOT_REG(reg) \
-+ sethi %hi(_GLOBAL_OFFSET_TABLE_-4), reg; \
-+ call .+8; \
-+ or reg, %lo(_GLOBAL_OFFSET_TABLE_+4), reg; \
-+ add %o7, reg, reg
-+#endif
-+
-+#if defined(__arch64__)
-+
-+#define SPARC_LOAD_ADDRESS(SYM, reg) \
-+ setx SYM, %o7, reg;
-+#define LDPTR ldx
-+#define SIZE_T_CC %xcc
-+#define STACK_FRAME 192
-+#define STACK_BIAS 2047
-+#define STACK_7thARG (STACK_BIAS+176)
-+
-+#else
-+
-+#define SPARC_LOAD_ADDRESS(SYM, reg) \
-+ set SYM, reg;
-+#define LDPTR ld
-+#define SIZE_T_CC %icc
-+#define STACK_FRAME 112
-+#define STACK_BIAS 0
-+#define STACK_7thARG 92
-+#define SPARC_LOAD_ADDRESS_LEAF(SYM, reg, tmp) SPARC_LOAD_ADDRESS(SYM, reg)
-+
-+#endif
-+
-+#ifdef __PIC__
-+#undef SPARC_LOAD_ADDRESS
-+#undef SPARC_LOAD_ADDRESS_LEAF
-+#define SPARC_LOAD_ADDRESS(SYM, reg) \
-+ SPARC_SETUP_GOT_REG(reg); \
-+ sethi %hi(SYM), %o7; \
-+ or %o7, %lo(SYM), %o7; \
-+ LDPTR [reg + %o7], reg;
-+#endif
-+
-+#ifndef SPARC_LOAD_ADDRESS_LEAF
-+#define SPARC_LOAD_ADDRESS_LEAF(SYM, reg, tmp) \
-+ mov %o7, tmp; \
-+ SPARC_LOAD_ADDRESS(SYM, reg) \
-+ mov tmp, %o7;
-+#endif
-+
-+#endif /* __SPARC_ARCH_H__ */
-Index: crypto/md5/asm/md5-sparcv9.pl
-===================================================================
-diff -uNr openssl-1.0.1m/crypto/md5/asm/md5-sparcv9.pl openssl-1.0.1m/crypto/md5/asm/md5-sparcv9.pl
---- openssl-1.0.1m/crypto/md5/asm/md5-sparcv9.pl 1970-01-01 01:00:00.000000000 +0100
-+++ openssl-1.0.1m/crypto/md5/asm/md5-sparcv9.pl 2015-03-21 16:27:38.578043100 +0100
-@@ -0,0 +1,434 @@
-+#!/usr/bin/env perl
-+
-+# ====================================================================
-+# Written by Andy Polyakov <appro at openssl.org> for the OpenSSL
-+# project. The module is, however, dual licensed under OpenSSL and
-+# CRYPTOGAMS licenses depending on where you obtain it. For further
-+# details see http://www.openssl.org/~appro/cryptogams/.
-+#
-+# Hardware SPARC T4 support by David S. Miller <davem at davemloft.net>.
-+# ====================================================================
-+
-+# MD5 for SPARCv9, 6.9 cycles per byte on UltraSPARC, >40% faster than
-+# code generated by Sun C 5.2.
-+
-+# SPARC T4 MD5 hardware achieves 3.20 cycles per byte, which is 2.1x
-+# faster than software. Multi-process benchmark saturates at 12x
-+# single-process result on 8-core processor, or ~11GBps per 2.85GHz
-+# socket.
-+
-+$bits=32;
-+for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
-+if ($bits==64) { $bias=2047; $frame=192; }
-+else { $bias=0; $frame=112; }
-+
-+$output=shift;
-+open STDOUT,">$output";
-+
-+use integer;
-+
-+($ctx,$inp,$len)=("%i0","%i1","%i2"); # input arguments
-+
-+# 64-bit values
-+@X=("%o0","%o1","%o2","%o3","%o4","%o5","%o7","%g1","%g2");
-+$tx="%g3";
-+($AB,$CD)=("%g4","%g5");
-+
-+# 32-bit values
-+@V=($A,$B,$C,$D)=map("%l$_",(0..3));
-+($t1,$t2,$t3,$saved_asi)=map("%l$_",(4..7));
-+($shr,$shl1,$shl2)=("%i3","%i4","%i5");
-+
-+my @K=( 0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
-+ 0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
-+ 0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
-+ 0x6b901122,0xfd987193,0xa679438e,0x49b40821,
-+
-+ 0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
-+ 0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
-+ 0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
-+ 0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,
-+
-+ 0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
-+ 0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
-+ 0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
-+ 0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,
-+
-+ 0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
-+ 0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
-+ 0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
-+ 0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391, 0 );
-+
-+sub R0 {
-+ my ($i,$a,$b,$c,$d) = @_;
-+ my $rot = (7,12,17,22)[$i%4];
-+ my $j = ($i+1)/2;
-+
-+ if ($i&1) {
-+ $code.=<<___;
-+ srlx @X[$j],$shr, at X[$j] ! align X[`$i+1`]
-+ and $b,$t1,$t1 ! round $i
-+ sllx @X[$j+1],$shl1,$tx
-+ add $t2,$a,$a
-+ sllx $tx,$shl2,$tx
-+ xor $d,$t1,$t1
-+ or $tx, at X[$j], at X[$j]
-+ sethi %hi(@K[$i+1]),$t2
-+ add $t1,$a,$a
-+ or $t2,%lo(@K[$i+1]),$t2
-+ sll $a,$rot,$t3
-+ add @X[$j],$t2,$t2 ! X[`$i+1`]+K[`$i+1`]
-+ srl $a,32-$rot,$a
-+ add $b,$t3,$t3
-+ xor $b,$c,$t1
-+ add $t3,$a,$a
-+___
-+ } else {
-+ $code.=<<___;
-+ srlx @X[$j],32,$tx ! extract X[`2*$j+1`]
-+ and $b,$t1,$t1 ! round $i
-+ add $t2,$a,$a
-+ xor $d,$t1,$t1
-+ sethi %hi(@K[$i+1]),$t2
-+ add $t1,$a,$a
-+ or $t2,%lo(@K[$i+1]),$t2
-+ sll $a,$rot,$t3
-+ add $tx,$t2,$t2 ! X[`2*$j+1`]+K[`$i+1`]
-+ srl $a,32-$rot,$a
-+ add $b,$t3,$t3
-+ xor $b,$c,$t1
-+ add $t3,$a,$a
-+___
-+ }
-+}
-+
-+sub R0_1 {
-+ my ($i,$a,$b,$c,$d) = @_;
-+ my $rot = (7,12,17,22)[$i%4];
-+
-+$code.=<<___;
-+ srlx @X[0],32,$tx ! extract X[1]
-+ and $b,$t1,$t1 ! round $i
-+ add $t2,$a,$a
-+ xor $d,$t1,$t1
-+ sethi %hi(@K[$i+1]),$t2
-+ add $t1,$a,$a
-+ or $t2,%lo(@K[$i+1]),$t2
-+ sll $a,$rot,$t3
-+ add $tx,$t2,$t2 ! X[1]+K[`$i+1`]
-+ srl $a,32-$rot,$a
-+ add $b,$t3,$t3
-+ andn $b,$c,$t1
-+ add $t3,$a,$a
-+___
-+}
-+
-+sub R1 {
-+ my ($i,$a,$b,$c,$d) = @_;
-+ my $rot = (5,9,14,20)[$i%4];
-+ my $j = $i<31 ? (1+5*($i+1))%16 : (5+3*($i+1))%16;
-+ my $xi = @X[$j/2];
-+
-+$code.=<<___ if ($j&1 && ($xi=$tx));
-+ srlx @X[$j/2],32,$xi ! extract X[$j]
-+___
-+$code.=<<___;
-+ and $b,$d,$t3 ! round $i
-+ add $t2,$a,$a
-+ or $t3,$t1,$t1
-+ sethi %hi(@K[$i+1]),$t2
-+ add $t1,$a,$a
-+ or $t2,%lo(@K[$i+1]),$t2
-+ sll $a,$rot,$t3
-+ add $xi,$t2,$t2 ! X[$j]+K[`$i+1`]
-+ srl $a,32-$rot,$a
-+ add $b,$t3,$t3
-+ `$i<31?"andn":"xor"` $b,$c,$t1
-+ add $t3,$a,$a
-+___
-+}
-+
-+sub R2 {
-+ my ($i,$a,$b,$c,$d) = @_;
-+ my $rot = (4,11,16,23)[$i%4];
-+ my $j = $i<47 ? (5+3*($i+1))%16 : (0+7*($i+1))%16;
-+ my $xi = @X[$j/2];
-+
-+$code.=<<___ if ($j&1 && ($xi=$tx));
-+ srlx @X[$j/2],32,$xi ! extract X[$j]
-+___
-+$code.=<<___;
-+ add $t2,$a,$a ! round $i
-+ xor $b,$t1,$t1
-+ sethi %hi(@K[$i+1]),$t2
-+ add $t1,$a,$a
-+ or $t2,%lo(@K[$i+1]),$t2
-+ sll $a,$rot,$t3
-+ add $xi,$t2,$t2 ! X[$j]+K[`$i+1`]
-+ srl $a,32-$rot,$a
-+ add $b,$t3,$t3
-+ xor $b,$c,$t1
-+ add $t3,$a,$a
-+___
-+}
-+
-+sub R3 {
-+ my ($i,$a,$b,$c,$d) = @_;
-+ my $rot = (6,10,15,21)[$i%4];
-+ my $j = (0+7*($i+1))%16;
-+ my $xi = @X[$j/2];
-+
-+$code.=<<___;
-+ add $t2,$a,$a ! round $i
-+___
-+$code.=<<___ if ($j&1 && ($xi=$tx));
-+ srlx @X[$j/2],32,$xi ! extract X[$j]
-+___
-+$code.=<<___;
-+ orn $b,$d,$t1
-+ sethi %hi(@K[$i+1]),$t2
-+ xor $c,$t1,$t1
-+ or $t2,%lo(@K[$i+1]),$t2
-+ add $t1,$a,$a
-+ sll $a,$rot,$t3
-+ add $xi,$t2,$t2 ! X[$j]+K[`$i+1`]
-+ srl $a,32-$rot,$a
-+ add $b,$t3,$t3
-+ add $t3,$a,$a
-+___
-+}
-+
-+$code.=<<___ if ($bits==64);
-+.register %g2,#scratch
-+.register %g3,#scratch
-+___
-+$code.=<<___;
-+#include "sparc_arch.h"
-+
-+.section ".text",#alloc,#execinstr
-+
-+#ifdef __PIC__
-+SPARC_PIC_THUNK(%g1)
-+#endif
-+
-+.globl md5_block_asm_data_order
-+.align 32
-+md5_block_asm_data_order:
-+ SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
-+ ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]
-+
-+ andcc %g1, CFR_MD5, %g0
-+ be .Lsoftware
-+ nop
-+
-+ mov 4, %g1
-+ andcc %o1, 0x7, %g0
-+ lda [%o0 + %g0]0x88, %f0 ! load context
-+ lda [%o0 + %g1]0x88, %f1
-+ add %o0, 8, %o0
-+ lda [%o0 + %g0]0x88, %f2
-+ lda [%o0 + %g1]0x88, %f3
-+ bne,pn %icc, .Lhwunaligned
-+ sub %o0, 8, %o0
-+
-+.Lhw_loop:
-+ ldd [%o1 + 0x00], %f8
-+ ldd [%o1 + 0x08], %f10
-+ ldd [%o1 + 0x10], %f12
-+ ldd [%o1 + 0x18], %f14
-+ ldd [%o1 + 0x20], %f16
-+ ldd [%o1 + 0x28], %f18
-+ ldd [%o1 + 0x30], %f20
-+ subcc %o2, 1, %o2 ! done yet?
-+ ldd [%o1 + 0x38], %f22
-+ add %o1, 0x40, %o1
-+ prefetch [%o1 + 63], 20
-+
-+ .word 0x81b02800 ! MD5
-+
-+ bne,pt `$bits==64?"%xcc":"%icc"`, .Lhw_loop
-+ nop
-+
-+.Lhwfinish:
-+ sta %f0, [%o0 + %g0]0x88 ! store context
-+ sta %f1, [%o0 + %g1]0x88
-+ add %o0, 8, %o0
-+ sta %f2, [%o0 + %g0]0x88
-+ sta %f3, [%o0 + %g1]0x88
-+ retl
-+ nop
-+
-+.align 8
-+.Lhwunaligned:
-+ alignaddr %o1, %g0, %o1
-+
-+ ldd [%o1 + 0x00], %f10
-+.Lhwunaligned_loop:
-+ ldd [%o1 + 0x08], %f12
-+ ldd [%o1 + 0x10], %f14
-+ ldd [%o1 + 0x18], %f16
-+ ldd [%o1 + 0x20], %f18
-+ ldd [%o1 + 0x28], %f20
-+ ldd [%o1 + 0x30], %f22
-+ ldd [%o1 + 0x38], %f24
-+ subcc %o2, 1, %o2 ! done yet?
-+ ldd [%o1 + 0x40], %f26
-+ add %o1, 0x40, %o1
-+ prefetch [%o1 + 63], 20
-+
-+ faligndata %f10, %f12, %f8
-+ faligndata %f12, %f14, %f10
-+ faligndata %f14, %f16, %f12
-+ faligndata %f16, %f18, %f14
-+ faligndata %f18, %f20, %f16
-+ faligndata %f20, %f22, %f18
-+ faligndata %f22, %f24, %f20
-+ faligndata %f24, %f26, %f22
-+
-+ .word 0x81b02800 ! MD5
-+
-+ bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
-+ for %f26, %f26, %f10 ! %f10=%f26
-+
-+ ba .Lhwfinish
-+ nop
-+
-+.align 16
-+.Lsoftware:
-+ save %sp,-$frame,%sp
-+
-+ rd %asi,$saved_asi
-+ wr %g0,0x88,%asi ! ASI_PRIMARY_LITTLE
-+ and $inp,7,$shr
-+ andn $inp,7,$inp
-+
-+ sll $shr,3,$shr ! *=8
-+ mov 56,$shl2
-+ ld [$ctx+0],$A
-+ sub $shl2,$shr,$shl2
-+ ld [$ctx+4],$B
-+ and $shl2,32,$shl1
-+ add $shl2,8,$shl2
-+ ld [$ctx+8],$C
-+ sub $shl2,$shl1,$shl2 ! shr+shl1+shl2==64
-+ ld [$ctx+12],$D
-+ nop
-+
-+.Loop:
-+ cmp $shr,0 ! was inp aligned?
-+ ldxa [$inp+0]%asi, at X[0] ! load little-endian input
-+ ldxa [$inp+8]%asi, at X[1]
-+ ldxa [$inp+16]%asi, at X[2]
-+ ldxa [$inp+24]%asi, at X[3]
-+ ldxa [$inp+32]%asi, at X[4]
-+ sllx $A,32,$AB ! pack A,B
-+ ldxa [$inp+40]%asi, at X[5]
-+ sllx $C,32,$CD ! pack C,D
-+ ldxa [$inp+48]%asi, at X[6]
-+ or $B,$AB,$AB
-+ ldxa [$inp+56]%asi, at X[7]
-+ or $D,$CD,$CD
-+ bnz,a,pn %icc,.+8
-+ ldxa [$inp+64]%asi, at X[8]
-+
-+ srlx @X[0],$shr, at X[0] ! align X[0]
-+ sllx @X[1],$shl1,$tx
-+ sethi %hi(@K[0]),$t2
-+ sllx $tx,$shl2,$tx
-+ or $t2,%lo(@K[0]),$t2
-+ or $tx, at X[0], at X[0]
-+ xor $C,$D,$t1
-+ add @X[0],$t2,$t2 ! X[0]+K[0]
-+___
-+ for ($i=0;$i<15;$i++) { &R0($i, at V); unshift(@V,pop(@V)); }
-+ for (;$i<16;$i++) { &R0_1($i, at V); unshift(@V,pop(@V)); }
-+ for (;$i<32;$i++) { &R1($i, at V); unshift(@V,pop(@V)); }
-+ for (;$i<48;$i++) { &R2($i, at V); unshift(@V,pop(@V)); }
-+ for (;$i<64;$i++) { &R3($i, at V); unshift(@V,pop(@V)); }
-+$code.=<<___;
-+ srlx $AB,32,$t1 ! unpack A,B,C,D and accumulate
-+ add $inp,64,$inp ! advance inp
-+ srlx $CD,32,$t2
-+ add $t1,$A,$A
-+ subcc $len,1,$len ! done yet?
-+ add $AB,$B,$B
-+ add $t2,$C,$C
-+ add $CD,$D,$D
-+ srl $B,0,$B ! clruw $B
-+ bne `$bits==64?"%xcc":"%icc"`,.Loop
-+ srl $D,0,$D ! clruw $D
-+
-+ st $A,[$ctx+0] ! write out ctx
-+ st $B,[$ctx+4]
-+ st $C,[$ctx+8]
-+ st $D,[$ctx+12]
-+
-+ wr %g0,$saved_asi,%asi
-+ ret
-+ restore
-+.type md5_block_asm_data_order,#function
-+.size md5_block_asm_data_order,(.-md5_block_asm_data_order)
-+
-+.asciz "MD5 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
-+.align 4
-+___
-+
-+# Purpose of these subroutines is to explicitly encode VIS instructions,
-+# so that one can compile the module without having to specify VIS
-+# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
-+# Idea is to reserve for option to produce "universal" binary and let
-+# programmer detect if current CPU is VIS capable at run-time.
-+sub unvis {
-+my ($mnemonic,$rs1,$rs2,$rd)=@_;
-+my $ref,$opf;
-+my %visopf = ( "faligndata" => 0x048,
-+ "for" => 0x07c );
-+
-+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
-+
-+ if ($opf=$visopf{$mnemonic}) {
-+ foreach ($rs1,$rs2,$rd) {
-+ return $ref if (!/%f([0-9]{1,2})/);
-+ $_=$1;
-+ if ($1>=32) {
-+ return $ref if ($1&1);
-+ # re-encode for upper double register addressing
-+ $_=($1|$1>>5)&31;
-+ }
-+ }
-+
-+ return sprintf ".word\t0x%08x !%s",
-+ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
-+ $ref;
-+ } else {
-+ return $ref;
-+ }
-+}
-+sub unalignaddr {
-+my ($mnemonic,$rs1,$rs2,$rd)=@_;
-+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
-+my $ref="$mnemonic\t$rs1,$rs2,$rd";
-+
-+ foreach ($rs1,$rs2,$rd) {
-+ if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
-+ else { return $ref; }
-+ }
-+ return sprintf ".word\t0x%08x !%s",
-+ 0x81b00300|$rd<<25|$rs1<<14|$rs2,
-+ $ref;
-+}
-+
-+foreach (split("\n",$code)) {
-+ s/\`([^\`]*)\`/eval $1/ge;
-+
-+ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
-+ &unvis($1,$2,$3,$4)
-+ /ge;
-+ s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
-+ &unalignaddr($1,$2,$3,$4)
-+ /ge;
-+
-+ print $_,"\n";
-+}
-+
-+close STDOUT;
-Index: crypto/aes/asm/aest4-sparcv9.pl
-===================================================================
-diff -uNr openssl-1.0.1m/crypto/aes/asm/aest4-sparcv9.pl openssl-1.0.1m/crypto/aes/asm/aest4-sparcv9.pl
---- openssl-1.0.1m/crypto/aes/asm/aest4-sparcv9.pl 1970-01-01 01:00:00.000000000 +0100
-+++ openssl-1.0.1m/crypto/aes/asm/aest4-sparcv9.pl 2015-03-21 16:27:38.578043100 +0100
-@@ -0,0 +1,902 @@
-+#!/usr/bin/env perl
-+
-+# ====================================================================
-+# Written by David S. Miller <davem at devemloft.net> and Andy Polyakov
-+# <appro at openssl.org>. The module is licensed under 2-clause BSD
-+# license. October 2012. All rights reserved.
-+# ====================================================================
-+
-+######################################################################
-+# AES for SPARC T4.
-+#
-+# AES round instructions complete in 3 cycles and can be issued every
-+# cycle. It means that round calculations should take 4*rounds cycles,
-+# because any given round instruction depends on result of *both*
-+# previous instructions:
-+#
-+# |0 |1 |2 |3 |4
-+# |01|01|01|
-+# |23|23|23|
-+# |01|01|...
-+# |23|...
-+#
-+# Provided that fxor [with IV] takes 3 cycles to complete, critical
-+# path length for CBC encrypt would be 3+4*rounds, or in other words
-+# it should process one byte in at least (3+4*rounds)/16 cycles. This
-+# estimate doesn't account for "collateral" instructions, such as
-+# fetching input from memory, xor-ing it with zero-round key and
-+# storing the result. Yet, *measured* performance [for data aligned
-+# at 64-bit boundary!] deviates from this equation by less than 0.5%:
-+#
-+# 128-bit key 192- 256-
-+# CBC encrypt 2.70/2.90(*) 3.20/3.40 3.70/3.90
-+# (*) numbers after slash are for
-+# misaligned data.
-+#
-+# Out-of-order execution logic managed to fully overlap "collateral"
-+# instructions with those on critical path. Amazing!
-+#
-+# As with Intel AES-NI, question is if it's possible to improve
-+# performance of parallelizeable modes by interleaving round
-+# instructions. Provided round instruction latency and throughput
-+# optimal interleave factor is 2. But can we expect 2x performance
-+# improvement? Well, as round instructions can be issued one per
-+# cycle, they don't saturate the 2-way issue pipeline and therefore
-+# there is room for "collateral" calculations... Yet, 2x speed-up
-+# over CBC encrypt remains unattaintable:
-+#
-+# 128-bit key 192- 256-
-+# CBC decrypt 1.64/2.11 1.89/2.37 2.23/2.61
-+# CTR 1.64/2.08(*) 1.89/2.33 2.23/2.61
-+# (*) numbers after slash are for
-+# misaligned data.
-+#
-+# Estimates based on amount of instructions under assumption that
-+# round instructions are not pairable with any other instruction
-+# suggest that latter is the actual case and pipeline runs
-+# underutilized. It should be noted that T4 out-of-order execution
-+# logic is so capable that performance gain from 2x interleave is
-+# not even impressive, ~7-13% over non-interleaved code, largest
-+# for 256-bit keys.
-+
-+# To anchor to something else, software implementation processes
-+# one byte in 29 cycles with 128-bit key on same processor. Intel
-+# Sandy Bridge encrypts byte in 5.07 cycles in CBC mode and decrypts
-+# in 0.93, naturally with AES-NI.
-+
-+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-+push(@INC,"${dir}","${dir}../../perlasm");
-+require "sparcv9_modes.pl";
-+
-+&asm_init(@ARGV);
-+
-+$::evp=1; # if $evp is set to 0, script generates module with
-+# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
-+# points. These however are not fully compatible with openssl/aes.h,
-+# because they expect AES_KEY to be aligned at 64-bit boundary. When
-+# used through EVP, alignment is arranged at EVP layer. Second thing
-+# that is arranged by EVP is at least 32-bit alignment of IV.
-+
-+######################################################################
-+# single-round subroutines
-+#
-+{
-+my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
-+
-+$code=<<___;
-+.text
-+
-+.globl aes_t4_encrypt
-+.align 32
-+aes_t4_encrypt:
-+ andcc $inp, 7, %g1 ! is input aligned?
-+ andn $inp, 7, $inp
-+
-+ ldx [$key + 0], %g4
-+ ldx [$key + 8], %g5
-+
-+ ldx [$inp + 0], %o4
-+ bz,pt %icc, 1f
-+ ldx [$inp + 8], %o5
-+ ldx [$inp + 16], $inp
-+ sll %g1, 3, %g1
-+ sub %g0, %g1, %o3
-+ sllx %o4, %g1, %o4
-+ sllx %o5, %g1, %g1
-+ srlx %o5, %o3, %o5
-+ srlx $inp, %o3, %o3
-+ or %o5, %o4, %o4
-+ or %o3, %g1, %o5
-+1:
-+ ld [$key + 240], $rounds
-+ ldd [$key + 16], %f12
-+ ldd [$key + 24], %f14
-+ xor %g4, %o4, %o4
-+ xor %g5, %o5, %o5
-+ movxtod %o4, %f0
-+ movxtod %o5, %f2
-+ srl $rounds, 1, $rounds
-+ ldd [$key + 32], %f16
-+ sub $rounds, 1, $rounds
-+ ldd [$key + 40], %f18
-+ add $key, 48, $key
-+
-+.Lenc:
-+ aes_eround01 %f12, %f0, %f2, %f4
-+ aes_eround23 %f14, %f0, %f2, %f2
-+ ldd [$key + 0], %f12
-+ ldd [$key + 8], %f14
-+ sub $rounds,1,$rounds
-+ aes_eround01 %f16, %f4, %f2, %f0
-+ aes_eround23 %f18, %f4, %f2, %f2
-+ ldd [$key + 16], %f16
-+ ldd [$key + 24], %f18
-+ brnz,pt $rounds, .Lenc
-+ add $key, 32, $key
-+
-+ andcc $out, 7, $tmp ! is output aligned?
-+ aes_eround01 %f12, %f0, %f2, %f4
-+ aes_eround23 %f14, %f0, %f2, %f2
-+ aes_eround01_l %f16, %f4, %f2, %f0
-+ aes_eround23_l %f18, %f4, %f2, %f2
-+
-+ bnz,pn %icc, 2f
-+ nop
-+
-+ std %f0, [$out + 0]
-+ retl
-+ std %f2, [$out + 8]
-+
-+2: alignaddrl $out, %g0, $out
-+ mov 0xff, $mask
-+ srl $mask, $tmp, $mask
-+
-+ faligndata %f0, %f0, %f4
-+ faligndata %f0, %f2, %f6
-+ faligndata %f2, %f2, %f8
-+
-+ stda %f4, [$out + $mask]0xc0 ! partial store
-+ std %f6, [$out + 8]
-+ add $out, 16, $out
-+ orn %g0, $mask, $mask
-+ retl
-+ stda %f8, [$out + $mask]0xc0 ! partial store
-+.type aes_t4_encrypt,#function
-+.size aes_t4_encrypt,.-aes_t4_encrypt
-+
-+.globl aes_t4_decrypt
-+.align 32
-+aes_t4_decrypt:
-+ andcc $inp, 7, %g1 ! is input aligned?
-+ andn $inp, 7, $inp
-+
-+ ldx [$key + 0], %g4
-+ ldx [$key + 8], %g5
-+
-+ ldx [$inp + 0], %o4
-+ bz,pt %icc, 1f
-+ ldx [$inp + 8], %o5
-+ ldx [$inp + 16], $inp
-+ sll %g1, 3, %g1
-+ sub %g0, %g1, %o3
-+ sllx %o4, %g1, %o4
-+ sllx %o5, %g1, %g1
-+ srlx %o5, %o3, %o5
-+ srlx $inp, %o3, %o3
-+ or %o5, %o4, %o4
-+ or %o3, %g1, %o5
-+1:
-+ ld [$key + 240], $rounds
-+ ldd [$key + 16], %f12
-+ ldd [$key + 24], %f14
-+ xor %g4, %o4, %o4
-+ xor %g5, %o5, %o5
-+ movxtod %o4, %f0
-+ movxtod %o5, %f2
-+ srl $rounds, 1, $rounds
-+ ldd [$key + 32], %f16
-+ sub $rounds, 1, $rounds
-+ ldd [$key + 40], %f18
-+ add $key, 48, $key
-+
-+.Ldec:
-+ aes_dround01 %f12, %f0, %f2, %f4
-+ aes_dround23 %f14, %f0, %f2, %f2
-+ ldd [$key + 0], %f12
-+ ldd [$key + 8], %f14
-+ sub $rounds,1,$rounds
-+ aes_dround01 %f16, %f4, %f2, %f0
-+ aes_dround23 %f18, %f4, %f2, %f2
-+ ldd [$key + 16], %f16
-+ ldd [$key + 24], %f18
-+ brnz,pt $rounds, .Ldec
-+ add $key, 32, $key
-+
-+ andcc $out, 7, $tmp ! is output aligned?
-+ aes_dround01 %f12, %f0, %f2, %f4
-+ aes_dround23 %f14, %f0, %f2, %f2
-+ aes_dround01_l %f16, %f4, %f2, %f0
-+ aes_dround23_l %f18, %f4, %f2, %f2
-+
-+ bnz,pn %icc, 2f
-+ nop
-+
-+ std %f0, [$out + 0]
-+ retl
-+ std %f2, [$out + 8]
-+
-+2: alignaddrl $out, %g0, $out
-+ mov 0xff, $mask
-+ srl $mask, $tmp, $mask
-+
-+ faligndata %f0, %f0, %f4
-+ faligndata %f0, %f2, %f6
-+ faligndata %f2, %f2, %f8
-+
-+ stda %f4, [$out + $mask]0xc0 ! partial store
-+ std %f6, [$out + 8]
-+ add $out, 16, $out
-+ orn %g0, $mask, $mask
-+ retl
-+ stda %f8, [$out + $mask]0xc0 ! partial store
-+.type aes_t4_decrypt,#function
-+.size aes_t4_decrypt,.-aes_t4_decrypt
-+___
-+}
-+
-+######################################################################
-+# key setup subroutines
-+#
-+{
-+my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
-+$code.=<<___;
-+.globl aes_t4_set_encrypt_key
-+.align 32
-+aes_t4_set_encrypt_key:
-+.Lset_encrypt_key:
-+ and $inp, 7, $tmp
-+ alignaddr $inp, %g0, $inp
-+ cmp $bits, 192
-+ ldd [$inp + 0], %f0
-+ bl,pt %icc,.L128
-+ ldd [$inp + 8], %f2
-+
-+ be,pt %icc,.L192
-+ ldd [$inp + 16], %f4
-+ brz,pt $tmp, .L256aligned
-+ ldd [$inp + 24], %f6
-+
-+ ldd [$inp + 32], %f8
-+ faligndata %f0, %f2, %f0
-+ faligndata %f2, %f4, %f2
-+ faligndata %f4, %f6, %f4
-+ faligndata %f6, %f8, %f6
-+.L256aligned:
-+___
-+for ($i=0; $i<6; $i++) {
-+ $code.=<<___;
-+ std %f0, [$out + `32*$i+0`]
-+ aes_kexpand1 %f0, %f6, $i, %f0
-+ std %f2, [$out + `32*$i+8`]
-+ aes_kexpand2 %f2, %f0, %f2
-+ std %f4, [$out + `32*$i+16`]
-+ aes_kexpand0 %f4, %f2, %f4
-+ std %f6, [$out + `32*$i+24`]
-+ aes_kexpand2 %f6, %f4, %f6
-+___
-+}
-+$code.=<<___;
-+ std %f0, [$out + `32*$i+0`]
-+ aes_kexpand1 %f0, %f6, $i, %f0
-+ std %f2, [$out + `32*$i+8`]
-+ aes_kexpand2 %f2, %f0, %f2
-+ std %f4, [$out + `32*$i+16`]
-+ std %f6, [$out + `32*$i+24`]
-+ std %f0, [$out + `32*$i+32`]
-+ std %f2, [$out + `32*$i+40`]
-+
-+ mov 14, $tmp
-+ st $tmp, [$out + 240]
-+ retl
-+ xor %o0, %o0, %o0
-+
-+.align 16
-+.L192:
-+ brz,pt $tmp, .L192aligned
-+ nop
-+
-+ ldd [$inp + 24], %f6
-+ faligndata %f0, %f2, %f0
-+ faligndata %f2, %f4, %f2
-+ faligndata %f4, %f6, %f4
-+.L192aligned:
-+___
-+for ($i=0; $i<7; $i++) {
-+ $code.=<<___;
-+ std %f0, [$out + `24*$i+0`]
-+ aes_kexpand1 %f0, %f4, $i, %f0
-+ std %f2, [$out + `24*$i+8`]
-+ aes_kexpand2 %f2, %f0, %f2
-+ std %f4, [$out + `24*$i+16`]
-+ aes_kexpand2 %f4, %f2, %f4
-+___
-+}
-+$code.=<<___;
-+ std %f0, [$out + `24*$i+0`]
-+ aes_kexpand1 %f0, %f4, $i, %f0
-+ std %f2, [$out + `24*$i+8`]
-+ aes_kexpand2 %f2, %f0, %f2
-+ std %f4, [$out + `24*$i+16`]
-+ std %f0, [$out + `24*$i+24`]
-+ std %f2, [$out + `24*$i+32`]
-+
-+ mov 12, $tmp
-+ st $tmp, [$out + 240]
-+ retl
-+ xor %o0, %o0, %o0
-+
-+.align 16
-+.L128:
-+ brz,pt $tmp, .L128aligned
-+ nop
-+
-+ ldd [$inp + 16], %f4
-+ faligndata %f0, %f2, %f0
-+ faligndata %f2, %f4, %f2
-+.L128aligned:
-+___
-+for ($i=0; $i<10; $i++) {
-+ $code.=<<___;
-+ std %f0, [$out + `16*$i+0`]
-+ aes_kexpand1 %f0, %f2, $i, %f0
-+ std %f2, [$out + `16*$i+8`]
-+ aes_kexpand2 %f2, %f0, %f2
-+___
-+}
-+$code.=<<___;
-+ std %f0, [$out + `16*$i+0`]
-+ std %f2, [$out + `16*$i+8`]
-+
-+ mov 10, $tmp
-+ st $tmp, [$out + 240]
-+ retl
-+ xor %o0, %o0, %o0
-+.type aes_t4_set_encrypt_key,#function
-+.size aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key
-+
-+.globl aes_t4_set_decrypt_key
-+.align 32
-+aes_t4_set_decrypt_key:
-+ mov %o7, %o5
-+ call .Lset_encrypt_key
-+ nop
-+
-+ mov %o5, %o7
-+ sll $tmp, 4, $inp ! $tmp is number of rounds
-+ add $tmp, 2, $tmp
-+ add $out, $inp, $inp ! $inp=$out+16*rounds
-+ srl $tmp, 2, $tmp ! $tmp=(rounds+2)/4
-+
-+.Lkey_flip:
-+ ldd [$out + 0], %f0
-+ ldd [$out + 8], %f2
-+ ldd [$out + 16], %f4
-+ ldd [$out + 24], %f6
-+ ldd [$inp + 0], %f8
-+ ldd [$inp + 8], %f10
-+ ldd [$inp - 16], %f12
-+ ldd [$inp - 8], %f14
-+ sub $tmp, 1, $tmp
-+ std %f0, [$inp + 0]
-+ std %f2, [$inp + 8]
-+ std %f4, [$inp - 16]
-+ std %f6, [$inp - 8]
-+ std %f8, [$out + 0]
-+ std %f10, [$out + 8]
-+ std %f12, [$out + 16]
-+ std %f14, [$out + 24]
-+ add $out, 32, $out
-+ brnz $tmp, .Lkey_flip
-+ sub $inp, 32, $inp
-+
-+ retl
-+ xor %o0, %o0, %o0
-+.type aes_t4_set_decrypt_key,#function
-+.size aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
-+___
-+}
-+
-+{{{
-+my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
-+my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
-+
-+$code.=<<___;
-+.align 32
-+_aes128_loadkey:
-+ ldx [$key + 0], %g4
-+ ldx [$key + 8], %g5
-+___
-+for ($i=2; $i<22;$i++) { # load key schedule
-+ $code.=<<___;
-+ ldd [$key + `8*$i`], %f`12+2*$i`
-+___
-+}
-+$code.=<<___;
-+ retl
-+ nop
-+.type _aes128_loadkey,#function
-+.size _aes128_loadkey,.-_aes128_loadkey
-+_aes128_load_enckey=_aes128_loadkey
-+_aes128_load_deckey=_aes128_loadkey
-+
-+.align 32
-+_aes128_encrypt_1x:
-+___
-+for ($i=0; $i<4; $i++) {
-+ $code.=<<___;
-+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
-+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
-+ aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
-+ aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
-+___
-+}
-+$code.=<<___;
-+ aes_eround01 %f48, %f0, %f2, %f4
-+ aes_eround23 %f50, %f0, %f2, %f2
-+ aes_eround01_l %f52, %f4, %f2, %f0
-+ retl
-+ aes_eround23_l %f54, %f4, %f2, %f2
-+.type _aes128_encrypt_1x,#function
-+.size _aes128_encrypt_1x,.-_aes128_encrypt_1x
-+
-+.align 32
-+_aes128_encrypt_2x:
-+___
-+for ($i=0; $i<4; $i++) {
-+ $code.=<<___;
-+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
-+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
-+ aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
-+ aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
-+ aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
-+ aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
-+ aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
-+ aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
-+___
-+}
-+$code.=<<___;
-+ aes_eround01 %f48, %f0, %f2, %f8
-+ aes_eround23 %f50, %f0, %f2, %f2
-+ aes_eround01 %f48, %f4, %f6, %f10
-+ aes_eround23 %f50, %f4, %f6, %f6
-+ aes_eround01_l %f52, %f8, %f2, %f0
-+ aes_eround23_l %f54, %f8, %f2, %f2
-+ aes_eround01_l %f52, %f10, %f6, %f4
-+ retl
-+ aes_eround23_l %f54, %f10, %f6, %f6
-+.type _aes128_encrypt_2x,#function
-+.size _aes128_encrypt_2x,.-_aes128_encrypt_2x
-+
-+.align 32
-+_aes128_decrypt_1x:
-+___
-+for ($i=0; $i<4; $i++) {
-+ $code.=<<___;
-+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
-+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
-+ aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
-+ aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
-+___
-+}
-+$code.=<<___;
-+ aes_dround01 %f48, %f0, %f2, %f4
-+ aes_dround23 %f50, %f0, %f2, %f2
-+ aes_dround01_l %f52, %f4, %f2, %f0
-+ retl
-+ aes_dround23_l %f54, %f4, %f2, %f2
-+.type _aes128_decrypt_1x,#function
-+.size _aes128_decrypt_1x,.-_aes128_decrypt_1x
-+
-+.align 32
-+_aes128_decrypt_2x:
-+___
-+for ($i=0; $i<4; $i++) {
-+ $code.=<<___;
-+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
-+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
-+ aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
-+ aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
-+ aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
-+ aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
-+ aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
-+ aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
-+___
-+}
-+$code.=<<___;
-+ aes_dround01 %f48, %f0, %f2, %f8
-+ aes_dround23 %f50, %f0, %f2, %f2
-+ aes_dround01 %f48, %f4, %f6, %f10
-+ aes_dround23 %f50, %f4, %f6, %f6
-+ aes_dround01_l %f52, %f8, %f2, %f0
-+ aes_dround23_l %f54, %f8, %f2, %f2
-+ aes_dround01_l %f52, %f10, %f6, %f4
-+ retl
-+ aes_dround23_l %f54, %f10, %f6, %f6
-+.type _aes128_decrypt_2x,#function
-+.size _aes128_decrypt_2x,.-_aes128_decrypt_2x
-+
-+.align 32
-+_aes192_loadkey:
-+_aes256_loadkey:
-+ ldx [$key + 0], %g4
-+ ldx [$key + 8], %g5
-+___
-+for ($i=2; $i<26;$i++) { # load key schedule
-+ $code.=<<___;
-+ ldd [$key + `8*$i`], %f`12+2*$i`
-+___
-+}
-+$code.=<<___;
-+ retl
-+ nop
-+.type _aes192_loadkey,#function
-+.size _aes192_loadkey,.-_aes192_loadkey
-+_aes192_load_enckey=_aes192_loadkey
-+_aes192_load_deckey=_aes192_loadkey
-+_aes256_load_enckey=_aes192_loadkey
-+_aes256_load_deckey=_aes192_loadkey
-+
-+.align 32
-+_aes192_encrypt_1x:
-+___
-+for ($i=0; $i<5; $i++) {
-+ $code.=<<___;
-+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
-+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
-+ aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
-+ aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
-+___
-+}
-+$code.=<<___;
-+ aes_eround01 %f56, %f0, %f2, %f4
-+ aes_eround23 %f58, %f0, %f2, %f2
-+ aes_eround01_l %f60, %f4, %f2, %f0
-+ retl
-+ aes_eround23_l %f62, %f4, %f2, %f2
-+.type _aes192_encrypt_1x,#function
-+.size _aes192_encrypt_1x,.-_aes192_encrypt_1x
-+
-+.align 32
-+_aes192_encrypt_2x:
-+___
-+for ($i=0; $i<5; $i++) {
-+ $code.=<<___;
-+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
-+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
-+ aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
-+ aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
-+ aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
-+ aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
-+ aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
-+ aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
-+___
-+}
-+$code.=<<___;
-+ aes_eround01 %f56, %f0, %f2, %f8
-+ aes_eround23 %f58, %f0, %f2, %f2
-+ aes_eround01 %f56, %f4, %f6, %f10
-+ aes_eround23 %f58, %f4, %f6, %f6
-+ aes_eround01_l %f60, %f8, %f2, %f0
-+ aes_eround23_l %f62, %f8, %f2, %f2
-+ aes_eround01_l %f60, %f10, %f6, %f4
-+ retl
-+ aes_eround23_l %f62, %f10, %f6, %f6
-+.type _aes192_encrypt_2x,#function
-+.size _aes192_encrypt_2x,.-_aes192_encrypt_2x
-+
-+.align 32
-+_aes192_decrypt_1x:
-+___
-+for ($i=0; $i<5; $i++) {
-+ $code.=<<___;
-+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
-+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
-+ aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
-+ aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
-+___
-+}
-+$code.=<<___;
-+ aes_dround01 %f56, %f0, %f2, %f4
-+ aes_dround23 %f58, %f0, %f2, %f2
-+ aes_dround01_l %f60, %f4, %f2, %f0
-+ retl
-+ aes_dround23_l %f62, %f4, %f2, %f2
-+.type _aes192_decrypt_1x,#function
-+.size _aes192_decrypt_1x,.-_aes192_decrypt_1x
-+
-+.align 32
-+_aes192_decrypt_2x:
-+___
-+for ($i=0; $i<5; $i++) {
-+ $code.=<<___;
-+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
-+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
-+ aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
-+ aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
-+ aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
-+ aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
-+ aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
-+ aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
-+___
-+}
-+$code.=<<___;
-+ aes_dround01 %f56, %f0, %f2, %f8
-+ aes_dround23 %f58, %f0, %f2, %f2
-+ aes_dround01 %f56, %f4, %f6, %f10
-+ aes_dround23 %f58, %f4, %f6, %f6
-+ aes_dround01_l %f60, %f8, %f2, %f0
-+ aes_dround23_l %f62, %f8, %f2, %f2
-+ aes_dround01_l %f60, %f10, %f6, %f4
-+ retl
-+ aes_dround23_l %f62, %f10, %f6, %f6
-+.type _aes192_decrypt_2x,#function
-+.size _aes192_decrypt_2x,.-_aes192_decrypt_2x
-+
-+.align 32
-+_aes256_encrypt_1x:
-+ aes_eround01 %f16, %f0, %f2, %f4
-+ aes_eround23 %f18, %f0, %f2, %f2
-+ ldd [$key + 208], %f16
-+ ldd [$key + 216], %f18
-+ aes_eround01 %f20, %f4, %f2, %f0
-+ aes_eround23 %f22, %f4, %f2, %f2
-+ ldd [$key + 224], %f20
-+ ldd [$key + 232], %f22
-+___
-+for ($i=1; $i<6; $i++) {
-+ $code.=<<___;
-+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
-+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
-+ aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
-+ aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
-+___
-+}
-+$code.=<<___;
-+ aes_eround01 %f16, %f0, %f2, %f4
-+ aes_eround23 %f18, %f0, %f2, %f2
-+ ldd [$key + 16], %f16
-+ ldd [$key + 24], %f18
-+ aes_eround01_l %f20, %f4, %f2, %f0
-+ aes_eround23_l %f22, %f4, %f2, %f2
-+ ldd [$key + 32], %f20
-+ retl
-+ ldd [$key + 40], %f22
-+.type _aes256_encrypt_1x,#function
-+.size _aes256_encrypt_1x,.-_aes256_encrypt_1x
-+
-+.align 32
-+_aes256_encrypt_2x:
-+ aes_eround01 %f16, %f0, %f2, %f8
-+ aes_eround23 %f18, %f0, %f2, %f2
-+ aes_eround01 %f16, %f4, %f6, %f10
-+ aes_eround23 %f18, %f4, %f6, %f6
-+ ldd [$key + 208], %f16
-+ ldd [$key + 216], %f18
-+ aes_eround01 %f20, %f8, %f2, %f0
-+ aes_eround23 %f22, %f8, %f2, %f2
-+ aes_eround01 %f20, %f10, %f6, %f4
-+ aes_eround23 %f22, %f10, %f6, %f6
-+ ldd [$key + 224], %f20
-+ ldd [$key + 232], %f22
-+___
-+for ($i=1; $i<6; $i++) {
-+ $code.=<<___;
-+ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
-+ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
-+ aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
-+ aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
-+ aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
-+ aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
-+ aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
-+ aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
-+___
-+}
-+$code.=<<___;
-+ aes_eround01 %f16, %f0, %f2, %f8
-+ aes_eround23 %f18, %f0, %f2, %f2
-+ aes_eround01 %f16, %f4, %f6, %f10
-+ aes_eround23 %f18, %f4, %f6, %f6
-+ ldd [$key + 16], %f16
-+ ldd [$key + 24], %f18
-+ aes_eround01_l %f20, %f8, %f2, %f0
-+ aes_eround23_l %f22, %f8, %f2, %f2
-+ aes_eround01_l %f20, %f10, %f6, %f4
-+ aes_eround23_l %f22, %f10, %f6, %f6
-+ ldd [$key + 32], %f20
-+ retl
-+ ldd [$key + 40], %f22
-+.type _aes256_encrypt_2x,#function
-+.size _aes256_encrypt_2x,.-_aes256_encrypt_2x
-+
-+.align 32
-+_aes256_decrypt_1x:
-+ aes_dround01 %f16, %f0, %f2, %f4
-+ aes_dround23 %f18, %f0, %f2, %f2
-+ ldd [$key + 208], %f16
-+ ldd [$key + 216], %f18
-+ aes_dround01 %f20, %f4, %f2, %f0
-+ aes_dround23 %f22, %f4, %f2, %f2
-+ ldd [$key + 224], %f20
-+ ldd [$key + 232], %f22
-+___
-+for ($i=1; $i<6; $i++) {
-+ $code.=<<___;
-+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
-+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
-+ aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
-+ aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
-+___
-+}
-+$code.=<<___;
-+ aes_dround01 %f16, %f0, %f2, %f4
-+ aes_dround23 %f18, %f0, %f2, %f2
-+ ldd [$key + 16], %f16
-+ ldd [$key + 24], %f18
-+ aes_dround01_l %f20, %f4, %f2, %f0
-+ aes_dround23_l %f22, %f4, %f2, %f2
-+ ldd [$key + 32], %f20
-+ retl
-+ ldd [$key + 40], %f22
-+.type _aes256_decrypt_1x,#function
-+.size _aes256_decrypt_1x,.-_aes256_decrypt_1x
-+
-+.align 32
-+_aes256_decrypt_2x:
-+ aes_dround01 %f16, %f0, %f2, %f8
-+ aes_dround23 %f18, %f0, %f2, %f2
-+ aes_dround01 %f16, %f4, %f6, %f10
-+ aes_dround23 %f18, %f4, %f6, %f6
-+ ldd [$key + 208], %f16
-+ ldd [$key + 216], %f18
-+ aes_dround01 %f20, %f8, %f2, %f0
-+ aes_dround23 %f22, %f8, %f2, %f2
-+ aes_dround01 %f20, %f10, %f6, %f4
-+ aes_dround23 %f22, %f10, %f6, %f6
-+ ldd [$key + 224], %f20
-+ ldd [$key + 232], %f22
-+___
-+for ($i=1; $i<6; $i++) {
-+ $code.=<<___;
-+ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
-+ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
-+ aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
-+ aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
-+ aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
-+ aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
-+ aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
-+ aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
-+___
-+}
-+$code.=<<___;
-+ aes_dround01 %f16, %f0, %f2, %f8
-+ aes_dround23 %f18, %f0, %f2, %f2
-+ aes_dround01 %f16, %f4, %f6, %f10
-+ aes_dround23 %f18, %f4, %f6, %f6
-+ ldd [$key + 16], %f16
-+ ldd [$key + 24], %f18
-+ aes_dround01_l %f20, %f8, %f2, %f0
-+ aes_dround23_l %f22, %f8, %f2, %f2
-+ aes_dround01_l %f20, %f10, %f6, %f4
-+ aes_dround23_l %f22, %f10, %f6, %f6
-+ ldd [$key + 32], %f20
-+ retl
-+ ldd [$key + 40], %f22
-+.type _aes256_decrypt_2x,#function
-+.size _aes256_decrypt_2x,.-_aes256_decrypt_2x
-+___
-+
-+&alg_cbc_encrypt_implement("aes",128);
-+&alg_cbc_encrypt_implement("aes",192);
-+&alg_cbc_encrypt_implement("aes",256);
-+
-+&alg_cbc_decrypt_implement("aes",128);
-+&alg_cbc_decrypt_implement("aes",192);
-+&alg_cbc_decrypt_implement("aes",256);
-+
-+if ($::evp) {
-+ &alg_ctr32_implement("aes",128);
-+ &alg_ctr32_implement("aes",192);
-+ &alg_ctr32_implement("aes",256);
-+}
-+}}}
-+
-+if (!$::evp) {
-+$code.=<<___;
-+.global AES_encrypt
-+AES_encrypt=aes_t4_encrypt
-+.global AES_decrypt
-+AES_decrypt=aes_t4_decrypt
-+.global AES_set_encrypt_key
-+.align 32
-+AES_set_encrypt_key:
-+ andcc %o2, 7, %g0 ! check alignment
-+ bnz,a,pn %icc, 1f
-+ mov -1, %o0
-+ brz,a,pn %o0, 1f
-+ mov -1, %o0
-+ brz,a,pn %o2, 1f
-+ mov -1, %o0
-+ andncc %o1, 0x1c0, %g0
-+ bnz,a,pn %icc, 1f
-+ mov -2, %o0
-+ cmp %o1, 128
-+ bl,a,pn %icc, 1f
-+ mov -2, %o0
-+ b aes_t4_set_encrypt_key
-+ nop
-+1: retl
-+ nop
-+.type AES_set_encrypt_key,#function
-+.size AES_set_encrypt_key,.-AES_set_encrypt_key
-+
-+.global AES_set_decrypt_key
-+.align 32
-+AES_set_decrypt_key:
-+ andcc %o2, 7, %g0 ! check alignment
-+ bnz,a,pn %icc, 1f
-+ mov -1, %o0
-+ brz,a,pn %o0, 1f
-+ mov -1, %o0
-+ brz,a,pn %o2, 1f
-+ mov -1, %o0
-+ andncc %o1, 0x1c0, %g0
-+ bnz,a,pn %icc, 1f
-+ mov -2, %o0
-+ cmp %o1, 128
-+ bl,a,pn %icc, 1f
-+ mov -2, %o0
-+ b aes_t4_set_decrypt_key
-+ nop
-+1: retl
-+ nop
-+.type AES_set_decrypt_key,#function
-+.size AES_set_decrypt_key,.-AES_set_decrypt_key
-+___
-+
-+my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
-+
-+$code.=<<___;
-+.globl AES_cbc_encrypt
-+.align 32
-+AES_cbc_encrypt:
-+ ld [$key + 240], %g1
-+ nop
-+ brz $enc, .Lcbc_decrypt
-+ cmp %g1, 12
-+
-+ bl,pt %icc, aes128_t4_cbc_encrypt
-+ nop
-+ be,pn %icc, aes192_t4_cbc_encrypt
-+ nop
-+ ba aes256_t4_cbc_encrypt
-+ nop
-+
-+.Lcbc_decrypt:
-+ bl,pt %icc, aes128_t4_cbc_decrypt
-+ nop
-+ be,pn %icc, aes192_t4_cbc_decrypt
-+ nop
-+ ba aes256_t4_cbc_decrypt
-+ nop
-+.type AES_cbc_encrypt,#function
-+.size AES_cbc_encrypt,.-AES_cbc_encrypt
-+___
-+}
-+$code.=<<___;
-+.asciz "AES for SPARC T4, David S. Miller, Andy Polyakov"
-+.align 4
-+___
-+
-+&emit_assembler();
-+
-+close STDOUT;
-Index: crypto/des/asm/dest4-sparcv9.pl
-===================================================================
-diff -uNr openssl-1.0.1m/crypto/des/asm/dest4-sparcv9.pl openssl-1.0.1m/crypto/des/asm/dest4-sparcv9.pl
---- openssl-1.0.1m/crypto/des/asm/dest4-sparcv9.pl 1970-01-01 01:00:00.000000000 +0100
-+++ openssl-1.0.1m/crypto/des/asm/dest4-sparcv9.pl 2015-03-21 16:27:38.578043100 +0100
-@@ -0,0 +1,602 @@
-+#!/usr/bin/env perl
-+
-+# ====================================================================
-+# Written by David S. Miller <davem at devemloft.net> and Andy Polyakov
-+# <appro at openssl.org>. The module is licensed under 2-clause BSD
-+# license. March 2013. All rights reserved.
-+# ====================================================================
-+
-+######################################################################
-+# DES for SPARC T4.
-+#
-+# As with other hardware-assisted ciphers CBC encrypt results [for
-+# aligned data] are virtually identical to critical path lengths:
-+#
-+# DES Triple-DES
-+# CBC encrypt 4.14/4.15(*) 11.7/11.7
-+# CBC decrypt 1.77/4.11(**) 6.42/7.47
-+#
-+# (*) numbers after slash are for
-+# misaligned data;
-+# (**) this is result for largest
-+# block size, unlike all other
-+# cases smaller blocks results
-+# are better[?];
-+
-+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-+push(@INC,"${dir}","${dir}../../perlasm");
-+require "sparcv9_modes.pl";
-+
-+&asm_init(@ARGV);
-+
-+$code.=<<___ if ($::abibits==64);
-+.register %g2,#scratch
-+.register %g3,#scratch
-+___
-+
-+$code.=<<___;
-+.text
-+___
-+
-+{ my ($inp,$out)=("%o0","%o1");
-+
-+$code.=<<___;
-+.align 32
-+.globl des_t4_key_expand
-+.type des_t4_key_expand,#function
-+des_t4_key_expand:
-+ andcc $inp, 0x7, %g0
-+ alignaddr $inp, %g0, $inp
-+ bz,pt %icc, 1f
-+ ldd [$inp + 0x00], %f0
-+ ldd [$inp + 0x08], %f2
-+ faligndata %f0, %f2, %f0
-+1: des_kexpand %f0, 0, %f0
-+ des_kexpand %f0, 1, %f2
-+ std %f0, [$out + 0x00]
-+ des_kexpand %f2, 3, %f6
-+ std %f2, [$out + 0x08]
-+ des_kexpand %f2, 2, %f4
-+ des_kexpand %f6, 3, %f10
-+ std %f6, [$out + 0x18]
-+ des_kexpand %f6, 2, %f8
-+ std %f4, [$out + 0x10]
-+ des_kexpand %f10, 3, %f14
-+ std %f10, [$out + 0x28]
-+ des_kexpand %f10, 2, %f12
-+ std %f8, [$out + 0x20]
-+ des_kexpand %f14, 1, %f16
-+ std %f14, [$out + 0x38]
-+ des_kexpand %f16, 3, %f20
-+ std %f12, [$out + 0x30]
-+ des_kexpand %f16, 2, %f18
-+ std %f16, [$out + 0x40]
-+ des_kexpand %f20, 3, %f24
-+ std %f20, [$out + 0x50]
-+ des_kexpand %f20, 2, %f22
-+ std %f18, [$out + 0x48]
-+ des_kexpand %f24, 3, %f28
-+ std %f24, [$out + 0x60]
-+ des_kexpand %f24, 2, %f26
-+ std %f22, [$out + 0x58]
-+ des_kexpand %f28, 1, %f30
-+ std %f28, [$out + 0x70]
-+ std %f26, [$out + 0x68]
-+ retl
-+ std %f30, [$out + 0x78]
-+.size des_t4_key_expand,.-des_t4_key_expand
-+___
-+}
-+{ my ($inp,$out,$len,$key,$ivec) = map("%o$_",(0..4));
-+ my ($ileft,$iright,$omask) = map("%g$_",(1..3));
-+
-+$code.=<<___;
-+.globl des_t4_cbc_encrypt
-+.align 32
-+des_t4_cbc_encrypt:
-+ ld [$ivec + 0], %f0 ! load ivec
-+ ld [$ivec + 4], %f1
-+
-+ and $inp, 7, $ileft
-+ andn $inp, 7, $inp
-+ sll $ileft, 3, $ileft
-+ mov 0xff, $omask
-+ prefetch [$inp], 20
-+ prefetch [$inp + 63], 20
-+ sub %g0, $ileft, $iright
-+ and $out, 7, %g4
-+ alignaddrl $out, %g0, $out
-+ srl $omask, %g4, $omask
-+ srlx $len, 3, $len
-+ movrz %g4, 0, $omask
-+ prefetch [$out], 22
-+
-+ ldd [$key + 0x00], %f4 ! load key schedule
-+ ldd [$key + 0x08], %f6
-+ ldd [$key + 0x10], %f8
-+ ldd [$key + 0x18], %f10
-+ ldd [$key + 0x20], %f12
-+ ldd [$key + 0x28], %f14
-+ ldd [$key + 0x30], %f16
-+ ldd [$key + 0x38], %f18
-+ ldd [$key + 0x40], %f20
-+ ldd [$key + 0x48], %f22
-+ ldd [$key + 0x50], %f24
-+ ldd [$key + 0x58], %f26
-+ ldd [$key + 0x60], %f28
-+ ldd [$key + 0x68], %f30
-+ ldd [$key + 0x70], %f32
-+ ldd [$key + 0x78], %f34
-+
-+.Ldes_cbc_enc_loop:
-+ ldx [$inp + 0], %g4
-+ brz,pt $ileft, 4f
-+ nop
-+
-+ ldx [$inp + 8], %g5
-+ sllx %g4, $ileft, %g4
-+ srlx %g5, $iright, %g5
-+ or %g5, %g4, %g4
-+4:
-+ movxtod %g4, %f2
-+ prefetch [$inp + 8+63], 20
-+ add $inp, 8, $inp
-+ fxor %f2, %f0, %f0 ! ^= ivec
-+ prefetch [$out + 63], 22
-+
-+ des_ip %f0, %f0
-+ des_round %f4, %f6, %f0, %f0
-+ des_round %f8, %f10, %f0, %f0
-+ des_round %f12, %f14, %f0, %f0
-+ des_round %f16, %f18, %f0, %f0
-+ des_round %f20, %f22, %f0, %f0
-+ des_round %f24, %f26, %f0, %f0
-+ des_round %f28, %f30, %f0, %f0
-+ des_round %f32, %f34, %f0, %f0
-+ des_iip %f0, %f0
-+
-+ brnz,pn $omask, 2f
-+ sub $len, 1, $len
-+
-+ std %f0, [$out + 0]
-+ brnz,pt $len, .Ldes_cbc_enc_loop
-+ add $out, 8, $out
-+
-+ st %f0, [$ivec + 0] ! write out ivec
-+ retl
-+ st %f1, [$ivec + 4]
-+
-+.align 16
-+2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
-+ ! and ~4x deterioration
-+ ! in inp==out case
-+ faligndata %f0, %f0, %f2 ! handle unaligned output
-+
-+ stda %f2, [$out + $omask]0xc0 ! partial store
-+ add $out, 8, $out
-+ orn %g0, $omask, $omask
-+ stda %f2, [$out + $omask]0xc0 ! partial store
-+
-+ brnz,pt $len, .Ldes_cbc_enc_loop+4
-+ orn %g0, $omask, $omask
-+
-+ st %f0, [$ivec + 0] ! write out ivec
-+ retl
-+ st %f1, [$ivec + 4]
-+.type des_t4_cbc_encrypt,#function
-+.size des_t4_cbc_encrypt,.-des_t4_cbc_encrypt
-+
-+.globl des_t4_cbc_decrypt
-+.align 32
-+des_t4_cbc_decrypt:
-+ ld [$ivec + 0], %f2 ! load ivec
-+ ld [$ivec + 4], %f3
-+
-+ and $inp, 7, $ileft
-+ andn $inp, 7, $inp
-+ sll $ileft, 3, $ileft
-+ mov 0xff, $omask
-+ prefetch [$inp], 20
-+ prefetch [$inp + 63], 20
-+ sub %g0, $ileft, $iright
-+ and $out, 7, %g4
-+ alignaddrl $out, %g0, $out
-+ srl $omask, %g4, $omask
-+ srlx $len, 3, $len
-+ movrz %g4, 0, $omask
-+ prefetch [$out], 22
-+
-+ ldd [$key + 0x78], %f4 ! load key schedule
-+ ldd [$key + 0x70], %f6
-+ ldd [$key + 0x68], %f8
-+ ldd [$key + 0x60], %f10
-+ ldd [$key + 0x58], %f12
-+ ldd [$key + 0x50], %f14
-+ ldd [$key + 0x48], %f16
-+ ldd [$key + 0x40], %f18
-+ ldd [$key + 0x38], %f20
-+ ldd [$key + 0x30], %f22
-+ ldd [$key + 0x28], %f24
-+ ldd [$key + 0x20], %f26
-+ ldd [$key + 0x18], %f28
-+ ldd [$key + 0x10], %f30
-+ ldd [$key + 0x08], %f32
-+ ldd [$key + 0x00], %f34
-+
-+.Ldes_cbc_dec_loop:
-+ ldx [$inp + 0], %g4
-+ brz,pt $ileft, 4f
-+ nop
-+
-+ ldx [$inp + 8], %g5
-+ sllx %g4, $ileft, %g4
-+ srlx %g5, $iright, %g5
-+ or %g5, %g4, %g4
-+4:
-+ movxtod %g4, %f0
-+ prefetch [$inp + 8+63], 20
-+ add $inp, 8, $inp
-+ prefetch [$out + 63], 22
-+
-+ des_ip %f0, %f0
-+ des_round %f4, %f6, %f0, %f0
-+ des_round %f8, %f10, %f0, %f0
-+ des_round %f12, %f14, %f0, %f0
-+ des_round %f16, %f18, %f0, %f0
-+ des_round %f20, %f22, %f0, %f0
-+ des_round %f24, %f26, %f0, %f0
-+ des_round %f28, %f30, %f0, %f0
-+ des_round %f32, %f34, %f0, %f0
-+ des_iip %f0, %f0
-+
-+ fxor %f2, %f0, %f0 ! ^= ivec
-+ movxtod %g4, %f2
-+
-+ brnz,pn $omask, 2f
-+ sub $len, 1, $len
-+
-+ std %f0, [$out + 0]
-+ brnz,pt $len, .Ldes_cbc_dec_loop
-+ add $out, 8, $out
-+
-+ st %f2, [$ivec + 0] ! write out ivec
-+ retl
-+ st %f3, [$ivec + 4]
-+
-+.align 16
-+2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
-+ ! and ~4x deterioration
-+ ! in inp==out case
-+ faligndata %f0, %f0, %f0 ! handle unaligned output
-+
-+ stda %f0, [$out + $omask]0xc0 ! partial store
-+ add $out, 8, $out
-+ orn %g0, $omask, $omask
-+ stda %f0, [$out + $omask]0xc0 ! partial store
-+
-+ brnz,pt $len, .Ldes_cbc_dec_loop+4
-+ orn %g0, $omask, $omask
-+
-+ st %f2, [$ivec + 0] ! write out ivec
-+ retl
-+ st %f3, [$ivec + 4]
-+.type des_t4_cbc_decrypt,#function
-+.size des_t4_cbc_decrypt,.-des_t4_cbc_decrypt
-+___
-+
-+# One might wonder why does one have back-to-back des_iip/des_ip
-+# pairs between EDE passes. Indeed, aren't they inverse of each other?
-+# They almost are. Outcome of the pair is 32-bit words being swapped
-+# in target register. Consider pair of des_iip/des_ip as a way to
-+# perform the due swap, it's actually fastest way in this case.
-+
-+$code.=<<___;
-+.globl des_t4_ede3_cbc_encrypt
-+.align 32
-+des_t4_ede3_cbc_encrypt:
-+ ld [$ivec + 0], %f0 ! load ivec
-+ ld [$ivec + 4], %f1
-+
-+ and $inp, 7, $ileft
-+ andn $inp, 7, $inp
-+ sll $ileft, 3, $ileft
-+ mov 0xff, $omask
-+ prefetch [$inp], 20
-+ prefetch [$inp + 63], 20
-+ sub %g0, $ileft, $iright
-+ and $out, 7, %g4
-+ alignaddrl $out, %g0, $out
-+ srl $omask, %g4, $omask
-+ srlx $len, 3, $len
-+ movrz %g4, 0, $omask
-+ prefetch [$out], 22
-+
-+ ldd [$key + 0x00], %f4 ! load key schedule
-+ ldd [$key + 0x08], %f6
-+ ldd [$key + 0x10], %f8
-+ ldd [$key + 0x18], %f10
-+ ldd [$key + 0x20], %f12
-+ ldd [$key + 0x28], %f14
-+ ldd [$key + 0x30], %f16
-+ ldd [$key + 0x38], %f18
-+ ldd [$key + 0x40], %f20
-+ ldd [$key + 0x48], %f22
-+ ldd [$key + 0x50], %f24
-+ ldd [$key + 0x58], %f26
-+ ldd [$key + 0x60], %f28
-+ ldd [$key + 0x68], %f30
-+ ldd [$key + 0x70], %f32
-+ ldd [$key + 0x78], %f34
-+
-+.Ldes_ede3_cbc_enc_loop:
-+ ldx [$inp + 0], %g4
-+ brz,pt $ileft, 4f
-+ nop
-+
-+ ldx [$inp + 8], %g5
-+ sllx %g4, $ileft, %g4
-+ srlx %g5, $iright, %g5
-+ or %g5, %g4, %g4
-+4:
-+ movxtod %g4, %f2
-+ prefetch [$inp + 8+63], 20
-+ add $inp, 8, $inp
-+ fxor %f2, %f0, %f0 ! ^= ivec
-+ prefetch [$out + 63], 22
-+
-+ des_ip %f0, %f0
-+ des_round %f4, %f6, %f0, %f0
-+ des_round %f8, %f10, %f0, %f0
-+ des_round %f12, %f14, %f0, %f0
-+ des_round %f16, %f18, %f0, %f0
-+ ldd [$key + 0x100-0x08], %f36
-+ ldd [$key + 0x100-0x10], %f38
-+ des_round %f20, %f22, %f0, %f0
-+ ldd [$key + 0x100-0x18], %f40
-+ ldd [$key + 0x100-0x20], %f42
-+ des_round %f24, %f26, %f0, %f0
-+ ldd [$key + 0x100-0x28], %f44
-+ ldd [$key + 0x100-0x30], %f46
-+ des_round %f28, %f30, %f0, %f0
-+ ldd [$key + 0x100-0x38], %f48
-+ ldd [$key + 0x100-0x40], %f50
-+ des_round %f32, %f34, %f0, %f0
-+ ldd [$key + 0x100-0x48], %f52
-+ ldd [$key + 0x100-0x50], %f54
-+ des_iip %f0, %f0
-+
-+ ldd [$key + 0x100-0x58], %f56
-+ ldd [$key + 0x100-0x60], %f58
-+ des_ip %f0, %f0
-+ ldd [$key + 0x100-0x68], %f60
-+ ldd [$key + 0x100-0x70], %f62
-+ des_round %f36, %f38, %f0, %f0
-+ ldd [$key + 0x100-0x78], %f36
-+ ldd [$key + 0x100-0x80], %f38
-+ des_round %f40, %f42, %f0, %f0
-+ des_round %f44, %f46, %f0, %f0
-+ des_round %f48, %f50, %f0, %f0
-+ ldd [$key + 0x100+0x00], %f40
-+ ldd [$key + 0x100+0x08], %f42
-+ des_round %f52, %f54, %f0, %f0
-+ ldd [$key + 0x100+0x10], %f44
-+ ldd [$key + 0x100+0x18], %f46
-+ des_round %f56, %f58, %f0, %f0
-+ ldd [$key + 0x100+0x20], %f48
-+ ldd [$key + 0x100+0x28], %f50
-+ des_round %f60, %f62, %f0, %f0
-+ ldd [$key + 0x100+0x30], %f52
-+ ldd [$key + 0x100+0x38], %f54
-+ des_round %f36, %f38, %f0, %f0
-+ ldd [$key + 0x100+0x40], %f56
-+ ldd [$key + 0x100+0x48], %f58
-+ des_iip %f0, %f0
-+
-+ ldd [$key + 0x100+0x50], %f60
-+ ldd [$key + 0x100+0x58], %f62
-+ des_ip %f0, %f0
-+ ldd [$key + 0x100+0x60], %f36
-+ ldd [$key + 0x100+0x68], %f38
-+ des_round %f40, %f42, %f0, %f0
-+ ldd [$key + 0x100+0x70], %f40
-+ ldd [$key + 0x100+0x78], %f42
-+ des_round %f44, %f46, %f0, %f0
-+ des_round %f48, %f50, %f0, %f0
-+ des_round %f52, %f54, %f0, %f0
-+ des_round %f56, %f58, %f0, %f0
-+ des_round %f60, %f62, %f0, %f0
-+ des_round %f36, %f38, %f0, %f0
-+ des_round %f40, %f42, %f0, %f0
-+ des_iip %f0, %f0
-+
-+ brnz,pn $omask, 2f
-+ sub $len, 1, $len
-+
-+ std %f0, [$out + 0]
-+ brnz,pt $len, .Ldes_ede3_cbc_enc_loop
-+ add $out, 8, $out
-+
-+ st %f0, [$ivec + 0] ! write out ivec
-+ retl
-+ st %f1, [$ivec + 4]
-+
-+.align 16
-+2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
-+ ! and ~2x deterioration
-+ ! in inp==out case
-+ faligndata %f0, %f0, %f2 ! handle unaligned output
-+
-+ stda %f2, [$out + $omask]0xc0 ! partial store
-+ add $out, 8, $out
-+ orn %g0, $omask, $omask
-+ stda %f2, [$out + $omask]0xc0 ! partial store
-+
-+ brnz,pt $len, .Ldes_ede3_cbc_enc_loop+4
-+ orn %g0, $omask, $omask
-+
-+ st %f0, [$ivec + 0] ! write out ivec
-+ retl
-+ st %f1, [$ivec + 4]
-+.type des_t4_ede3_cbc_encrypt,#function
-+.size des_t4_ede3_cbc_encrypt,.-des_t4_ede3_cbc_encrypt
-+
-+.globl des_t4_ede3_cbc_decrypt
-+.align 32
-+des_t4_ede3_cbc_decrypt:
-+ ld [$ivec + 0], %f2 ! load ivec
-+ ld [$ivec + 4], %f3
-+
-+ and $inp, 7, $ileft
-+ andn $inp, 7, $inp
-+ sll $ileft, 3, $ileft
-+ mov 0xff, $omask
-+ prefetch [$inp], 20
-+ prefetch [$inp + 63], 20
-+ sub %g0, $ileft, $iright
-+ and $out, 7, %g4
-+ alignaddrl $out, %g0, $out
-+ srl $omask, %g4, $omask
-+ srlx $len, 3, $len
-+ movrz %g4, 0, $omask
-+ prefetch [$out], 22
-+
-+ ldd [$key + 0x100+0x78], %f4 ! load key schedule
-+ ldd [$key + 0x100+0x70], %f6
-+ ldd [$key + 0x100+0x68], %f8
-+ ldd [$key + 0x100+0x60], %f10
-+ ldd [$key + 0x100+0x58], %f12
-+ ldd [$key + 0x100+0x50], %f14
-+ ldd [$key + 0x100+0x48], %f16
-+ ldd [$key + 0x100+0x40], %f18
-+ ldd [$key + 0x100+0x38], %f20
-+ ldd [$key + 0x100+0x30], %f22
-+ ldd [$key + 0x100+0x28], %f24
-+ ldd [$key + 0x100+0x20], %f26
-+ ldd [$key + 0x100+0x18], %f28
-+ ldd [$key + 0x100+0x10], %f30
-+ ldd [$key + 0x100+0x08], %f32
-+ ldd [$key + 0x100+0x00], %f34
-+
-+.Ldes_ede3_cbc_dec_loop:
-+ ldx [$inp + 0], %g4
-+ brz,pt $ileft, 4f
-+ nop
-+
-+ ldx [$inp + 8], %g5
-+ sllx %g4, $ileft, %g4
-+ srlx %g5, $iright, %g5
-+ or %g5, %g4, %g4
-+4:
-+ movxtod %g4, %f0
-+ prefetch [$inp + 8+63], 20
-+ add $inp, 8, $inp
-+ prefetch [$out + 63], 22
-+
-+ des_ip %f0, %f0
-+ des_round %f4, %f6, %f0, %f0
-+ des_round %f8, %f10, %f0, %f0
-+ des_round %f12, %f14, %f0, %f0
-+ des_round %f16, %f18, %f0, %f0
-+ ldd [$key + 0x80+0x00], %f36
-+ ldd [$key + 0x80+0x08], %f38
-+ des_round %f20, %f22, %f0, %f0
-+ ldd [$key + 0x80+0x10], %f40
-+ ldd [$key + 0x80+0x18], %f42
-+ des_round %f24, %f26, %f0, %f0
-+ ldd [$key + 0x80+0x20], %f44
-+ ldd [$key + 0x80+0x28], %f46
-+ des_round %f28, %f30, %f0, %f0
-+ ldd [$key + 0x80+0x30], %f48
-+ ldd [$key + 0x80+0x38], %f50
-+ des_round %f32, %f34, %f0, %f0
-+ ldd [$key + 0x80+0x40], %f52
-+ ldd [$key + 0x80+0x48], %f54
-+ des_iip %f0, %f0
-+
-+ ldd [$key + 0x80+0x50], %f56
-+ ldd [$key + 0x80+0x58], %f58
-+ des_ip %f0, %f0
-+ ldd [$key + 0x80+0x60], %f60
-+ ldd [$key + 0x80+0x68], %f62
-+ des_round %f36, %f38, %f0, %f0
-+ ldd [$key + 0x80+0x70], %f36
-+ ldd [$key + 0x80+0x78], %f38
-+ des_round %f40, %f42, %f0, %f0
-+ des_round %f44, %f46, %f0, %f0
-+ des_round %f48, %f50, %f0, %f0
-+ ldd [$key + 0x80-0x08], %f40
-+ ldd [$key + 0x80-0x10], %f42
-+ des_round %f52, %f54, %f0, %f0
-+ ldd [$key + 0x80-0x18], %f44
-+ ldd [$key + 0x80-0x20], %f46
-+ des_round %f56, %f58, %f0, %f0
-+ ldd [$key + 0x80-0x28], %f48
-+ ldd [$key + 0x80-0x30], %f50
-+ des_round %f60, %f62, %f0, %f0
-+ ldd [$key + 0x80-0x38], %f52
-+ ldd [$key + 0x80-0x40], %f54
-+ des_round %f36, %f38, %f0, %f0
-+ ldd [$key + 0x80-0x48], %f56
-+ ldd [$key + 0x80-0x50], %f58
-+ des_iip %f0, %f0
-+
-+ ldd [$key + 0x80-0x58], %f60
-+ ldd [$key + 0x80-0x60], %f62
-+ des_ip %f0, %f0
-+ ldd [$key + 0x80-0x68], %f36
-+ ldd [$key + 0x80-0x70], %f38
-+ des_round %f40, %f42, %f0, %f0
-+ ldd [$key + 0x80-0x78], %f40
-+ ldd [$key + 0x80-0x80], %f42
-+ des_round %f44, %f46, %f0, %f0
-+ des_round %f48, %f50, %f0, %f0
-+ des_round %f52, %f54, %f0, %f0
-+ des_round %f56, %f58, %f0, %f0
-+ des_round %f60, %f62, %f0, %f0
-+ des_round %f36, %f38, %f0, %f0
-+ des_round %f40, %f42, %f0, %f0
-+ des_iip %f0, %f0
-+
-+ fxor %f2, %f0, %f0 ! ^= ivec
-+ movxtod %g4, %f2
-+
-+ brnz,pn $omask, 2f
-+ sub $len, 1, $len
-+
-+ std %f0, [$out + 0]
-+ brnz,pt $len, .Ldes_ede3_cbc_dec_loop
-+ add $out, 8, $out
-+
-+ st %f2, [$ivec + 0] ! write out ivec
-+ retl
-+ st %f3, [$ivec + 4]
-+
-+.align 16
-+2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
-+ ! and ~3x deterioration
-+ ! in inp==out case
-+ faligndata %f0, %f0, %f0 ! handle unaligned output
-+
-+ stda %f0, [$out + $omask]0xc0 ! partial store
-+ add $out, 8, $out
-+ orn %g0, $omask, $omask
-+ stda %f0, [$out + $omask]0xc0 ! partial store
-+
-+ brnz,pt $len, .Ldes_ede3_cbc_dec_loop+4
-+ orn %g0, $omask, $omask
-+
-+ st %f2, [$ivec + 0] ! write out ivec
-+ retl
-+ st %f3, [$ivec + 4]
-+.type des_t4_ede3_cbc_decrypt,#function
-+.size des_t4_ede3_cbc_decrypt,.-des_t4_ede3_cbc_decrypt
-+___
-+}
-+$code.=<<___;
-+.asciz "DES for SPARC T4, David S. Miller, Andy Polyakov"
-+.align 4
-+___
-+
-+&emit_assembler();
-+
-+close STDOUT;
-Index: crypto/perlasm/sparcv9_modes.pl
-===================================================================
-diff -uNr openssl-1.0.1m/crypto/perlasm/sparcv9_modes.pl openssl-1.0.1m/crypto/perlasm/sparcv9_modes.pl
---- openssl-1.0.1m/crypto/perlasm/sparcv9_modes.pl 1970-01-01 01:00:00.000000000 +0100
-+++ openssl-1.0.1m/crypto/perlasm/sparcv9_modes.pl 2015-03-21 16:27:38.578043100 +0100
-@@ -0,0 +1,1680 @@
-+#!/usr/bin/env perl
-+
-+# Mode-specific implementations for SPARC Architecture 2011. There
-+# is a T4 dependency, though: an ASI value that is not specified in
-+# the Architecture Manual. But as the SPARC universe is rather
-+# monocultural, we assume that any processor capable of executing the
-+# crypto instructions can handle the ASI in question as well. This
-+# means that we ought to keep our eyes open when new processors
-+# emerge...
-+#
-+# As for the above-mentioned ASI: it's the so-called "block
-+# initializing store", which cancels the "read" in "read-update-write"
-+# on cache lines. This is a "cooperative" optimization, as it reduces
-+# overall pressure on the memory interface. The benefits can't be
-+# observed or quantified with the usual benchmarks; on the contrary,
-+# you may notice that single-thread performance for parallelizable
-+# modes is ~1.5% worse for the largest block sizes [though a few
-+# percent better for shorter ones]. All this is based on suggestions
-+# from David Miller.
-+
-+sub asm_init { # to be called with @ARGV as argument
-+ for (@_) { $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
-+ if ($::abibits==64) { $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
-+ else { $::bias=0; $::frame=112; $::size_t_cc="%icc"; }
-+}
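A usage sketch, per the sub's own comment that it is to be called with @ARGV:

    asm_init(@ARGV);   # e.g. "-m64" or "-xarch=v9" selects bias 2047, frame 192, %xcc
    # 32-bit defaults otherwise: bias 0, frame 112, %icc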
-+
-+# unified interface
-+my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
-+# local variables
-+my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));
-+
-+sub alg_cbc_encrypt_implement {
-+my ($alg,$bits) = @_;
-+
-+$::code.=<<___;
-+.globl ${alg}${bits}_t4_cbc_encrypt
-+.align 32
-+${alg}${bits}_t4_cbc_encrypt:
-+ save %sp, -$::frame, %sp
-+ sub $inp, $out, $blk_init ! $inp!=$out
-+___
-+$::code.=<<___ if (!$::evp);
-+ andcc $ivec, 7, $ivoff
-+ alignaddr $ivec, %g0, $ivec
-+
-+ ldd [$ivec + 0], %f0 ! load ivec
-+ bz,pt %icc, 1f
-+ ldd [$ivec + 8], %f2
-+ ldd [$ivec + 16], %f4
-+ faligndata %f0, %f2, %f0
-+ faligndata %f2, %f4, %f2
-+1:
-+___
-+$::code.=<<___ if ($::evp);
-+ ld [$ivec + 0], %f0
-+ ld [$ivec + 4], %f1
-+ ld [$ivec + 8], %f2
-+ ld [$ivec + 12], %f3
-+___
-+$::code.=<<___;
-+ prefetch [$inp], 20
-+ prefetch [$inp + 63], 20
-+ call _${alg}${bits}_load_enckey
-+ and $inp, 7, $ileft
-+ andn $inp, 7, $inp
-+ sll $ileft, 3, $ileft
-+ mov 64, $iright
-+ mov 0xff, $omask
-+ sub $iright, $ileft, $iright
-+ and $out, 7, $ooff
-+ cmp $len, 127
-+ movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
-+ movleu $::size_t_cc, 0, $blk_init ! $len<128 ||
-+ brnz,pn $blk_init, .L${bits}cbc_enc_blk ! $inp==$out)
-+ srl $omask, $ooff, $omask
-+
-+ alignaddrl $out, %g0, $out
-+ srlx $len, 4, $len
-+ prefetch [$out], 22
-+
-+.L${bits}_cbc_enc_loop:
-+ ldx [$inp + 0], %o0
-+ brz,pt $ileft, 4f
-+ ldx [$inp + 8], %o1
-+
-+ ldx [$inp + 16], %o2
-+ sllx %o0, $ileft, %o0
-+ srlx %o1, $iright, %g1
-+ sllx %o1, $ileft, %o1
-+ or %g1, %o0, %o0
-+ srlx %o2, $iright, %o2
-+ or %o2, %o1, %o1
-+4:
-+ xor %g4, %o0, %o0 ! ^= rk[0]
-+ xor %g5, %o1, %o1
-+ movxtod %o0, %f12
-+ movxtod %o1, %f14
-+
-+ fxor %f12, %f0, %f0 ! ^= ivec
-+ fxor %f14, %f2, %f2
-+ prefetch [$out + 63], 22
-+ prefetch [$inp + 16+63], 20
-+ call _${alg}${bits}_encrypt_1x
-+ add $inp, 16, $inp
-+
-+ brnz,pn $ooff, 2f
-+ sub $len, 1, $len
-+
-+ std %f0, [$out + 0]
-+ std %f2, [$out + 8]
-+ brnz,pt $len, .L${bits}_cbc_enc_loop
-+ add $out, 16, $out
-+___
-+$::code.=<<___ if ($::evp);
-+ st %f0, [$ivec + 0]
-+ st %f1, [$ivec + 4]
-+ st %f2, [$ivec + 8]
-+ st %f3, [$ivec + 12]
-+___
-+$::code.=<<___ if (!$::evp);
-+ brnz,pn $ivoff, 3f
-+ nop
-+
-+ std %f0, [$ivec + 0] ! write out ivec
-+ std %f2, [$ivec + 8]
-+___
-+$::code.=<<___;
-+ ret
-+ restore
-+
-+.align 16
-+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
-+ ! and ~3x deterioration
-+ ! in inp==out case
-+ faligndata %f0, %f0, %f4 ! handle unaligned output
-+ faligndata %f0, %f2, %f6
-+ faligndata %f2, %f2, %f8
-+
-+ stda %f4, [$out + $omask]0xc0 ! partial store
-+ std %f6, [$out + 8]
-+ add $out, 16, $out
-+ orn %g0, $omask, $omask
-+ stda %f8, [$out + $omask]0xc0 ! partial store
-+
-+ brnz,pt $len, .L${bits}_cbc_enc_loop+4
-+ orn %g0, $omask, $omask
-+___
-+$::code.=<<___ if ($::evp);
-+ st %f0, [$ivec + 0]
-+ st %f1, [$ivec + 4]
-+ st %f2, [$ivec + 8]
-+ st %f3, [$ivec + 12]
-+___
-+$::code.=<<___ if (!$::evp);
-+ brnz,pn $ivoff, 3f
-+ nop
-+
-+ std %f0, [$ivec + 0] ! write out ivec
-+ std %f2, [$ivec + 8]
-+ ret
-+ restore
-+
-+.align 16
-+3: alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
-+ mov 0xff, $omask
-+ srl $omask, $ivoff, $omask
-+ faligndata %f0, %f0, %f4
-+ faligndata %f0, %f2, %f6
-+ faligndata %f2, %f2, %f8
-+ stda %f4, [$ivec + $omask]0xc0
-+ std %f6, [$ivec + 8]
-+ add $ivec, 16, $ivec
-+ orn %g0, $omask, $omask
-+ stda %f8, [$ivec + $omask]0xc0
-+___
-+$::code.=<<___;
-+ ret
-+ restore
-+
-+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+.align 32
-+.L${bits}cbc_enc_blk:
-+ add $out, $len, $blk_init
-+ and $blk_init, 63, $blk_init ! tail
-+ sub $len, $blk_init, $len
-+ add $blk_init, 15, $blk_init ! round up to 16n
-+ srlx $len, 4, $len
-+ srl $blk_init, 4, $blk_init
-+
-+.L${bits}_cbc_enc_blk_loop:
-+ ldx [$inp + 0], %o0
-+ brz,pt $ileft, 5f
-+ ldx [$inp + 8], %o1
-+
-+ ldx [$inp + 16], %o2
-+ sllx %o0, $ileft, %o0
-+ srlx %o1, $iright, %g1
-+ sllx %o1, $ileft, %o1
-+ or %g1, %o0, %o0
-+ srlx %o2, $iright, %o2
-+ or %o2, %o1, %o1
-+5:
-+ xor %g4, %o0, %o0 ! ^= rk[0]
-+ xor %g5, %o1, %o1
-+ movxtod %o0, %f12
-+ movxtod %o1, %f14
-+
-+ fxor %f12, %f0, %f0 ! ^= ivec
-+ fxor %f14, %f2, %f2
-+ prefetch [$inp + 16+63], 20
-+ call _${alg}${bits}_encrypt_1x
-+ add $inp, 16, $inp
-+ sub $len, 1, $len
-+
-+ stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ add $out, 8, $out
-+ stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ brnz,pt $len, .L${bits}_cbc_enc_blk_loop
-+ add $out, 8, $out
-+
-+ membar #StoreLoad|#StoreStore
-+ brnz,pt $blk_init, .L${bits}_cbc_enc_loop
-+ mov $blk_init, $len
-+___
-+$::code.=<<___ if ($::evp);
-+ st %f0, [$ivec + 0]
-+ st %f1, [$ivec + 4]
-+ st %f2, [$ivec + 8]
-+ st %f3, [$ivec + 12]
-+___
-+$::code.=<<___ if (!$::evp);
-+ brnz,pn $ivoff, 3b
-+ nop
-+
-+ std %f0, [$ivec + 0] ! write out ivec
-+ std %f2, [$ivec + 8]
-+___
-+$::code.=<<___;
-+ ret
-+ restore
-+.type ${alg}${bits}_t4_cbc_encrypt,#function
-+.size ${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
-+___
-+}
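The call sites live in the cipher-specific generator scripts, which this diff does not show; a hypothetical invocation producing the AES CBC encrypt entry points would look like:

    alg_cbc_encrypt_implement("aes", 128);   # would emit aes128_t4_cbc_encrypt
    alg_cbc_encrypt_implement("aes", 256);   # would emit aes256_t4_cbc_encrypt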
-+
-+sub alg_cbc_decrypt_implement {
-+my ($alg,$bits) = @_;
-+
-+$::code.=<<___;
-+.globl ${alg}${bits}_t4_cbc_decrypt
-+.align 32
-+${alg}${bits}_t4_cbc_decrypt:
-+ save %sp, -$::frame, %sp
-+ sub $inp, $out, $blk_init ! $inp!=$out
-+___
-+$::code.=<<___ if (!$::evp);
-+ andcc $ivec, 7, $ivoff
-+ alignaddr $ivec, %g0, $ivec
-+
-+ ldd [$ivec + 0], %f12 ! load ivec
-+ bz,pt %icc, 1f
-+ ldd [$ivec + 8], %f14
-+ ldd [$ivec + 16], %f0
-+ faligndata %f12, %f14, %f12
-+ faligndata %f14, %f0, %f14
-+1:
-+___
-+$::code.=<<___ if ($::evp);
-+ ld [$ivec + 0], %f12 ! load ivec
-+ ld [$ivec + 4], %f13
-+ ld [$ivec + 8], %f14
-+ ld [$ivec + 12], %f15
-+___
-+$::code.=<<___;
-+ prefetch [$inp], 20
-+ prefetch [$inp + 63], 20
-+ call _${alg}${bits}_load_deckey
-+ and $inp, 7, $ileft
-+ andn $inp, 7, $inp
-+ sll $ileft, 3, $ileft
-+ mov 64, $iright
-+ mov 0xff, $omask
-+ sub $iright, $ileft, $iright
-+ and $out, 7, $ooff
-+ cmp $len, 255
-+ movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
-+ movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
-+ brnz,pn $blk_init, .L${bits}cbc_dec_blk ! $inp==$out)
-+ srl $omask, $ooff, $omask
-+
-+ andcc $len, 16, %g0 ! is number of blocks even?
-+ srlx $len, 4, $len
-+ alignaddrl $out, %g0, $out
-+ bz %icc, .L${bits}_cbc_dec_loop2x
-+ prefetch [$out], 22
-+.L${bits}_cbc_dec_loop:
-+ ldx [$inp + 0], %o0
-+ brz,pt $ileft, 4f
-+ ldx [$inp + 8], %o1
-+
-+ ldx [$inp + 16], %o2
-+ sllx %o0, $ileft, %o0
-+ srlx %o1, $iright, %g1
-+ sllx %o1, $ileft, %o1
-+ or %g1, %o0, %o0
-+ srlx %o2, $iright, %o2
-+ or %o2, %o1, %o1
-+4:
-+ xor %g4, %o0, %o2 ! ^= rk[0]
-+ xor %g5, %o1, %o3
-+ movxtod %o2, %f0
-+ movxtod %o3, %f2
-+
-+ prefetch [$out + 63], 22
-+ prefetch [$inp + 16+63], 20
-+ call _${alg}${bits}_decrypt_1x
-+ add $inp, 16, $inp
-+
-+ fxor %f12, %f0, %f0 ! ^= ivec
-+ fxor %f14, %f2, %f2
-+ movxtod %o0, %f12
-+ movxtod %o1, %f14
-+
-+ brnz,pn $ooff, 2f
-+ sub $len, 1, $len
-+
-+ std %f0, [$out + 0]
-+ std %f2, [$out + 8]
-+ brnz,pt $len, .L${bits}_cbc_dec_loop2x
-+ add $out, 16, $out
-+___
-+$::code.=<<___ if ($::evp);
-+ st %f12, [$ivec + 0]
-+ st %f13, [$ivec + 4]
-+ st %f14, [$ivec + 8]
-+ st %f15, [$ivec + 12]
-+___
-+$::code.=<<___ if (!$::evp);
-+ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
-+ nop
-+
-+ std %f12, [$ivec + 0] ! write out ivec
-+ std %f14, [$ivec + 8]
-+___
-+$::code.=<<___;
-+ ret
-+ restore
-+
-+.align 16
-+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
-+ ! and ~3x deterioration
-+ ! in inp==out case
-+ faligndata %f0, %f0, %f4 ! handle unaligned output
-+ faligndata %f0, %f2, %f6
-+ faligndata %f2, %f2, %f8
-+
-+ stda %f4, [$out + $omask]0xc0 ! partial store
-+ std %f6, [$out + 8]
-+ add $out, 16, $out
-+ orn %g0, $omask, $omask
-+ stda %f8, [$out + $omask]0xc0 ! partial store
-+
-+ brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
-+ orn %g0, $omask, $omask
-+___
-+$::code.=<<___ if ($::evp);
-+ st %f12, [$ivec + 0]
-+ st %f13, [$ivec + 4]
-+ st %f14, [$ivec + 8]
-+ st %f15, [$ivec + 12]
-+___
-+$::code.=<<___ if (!$::evp);
-+ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
-+ nop
-+
-+ std %f12, [$ivec + 0] ! write out ivec
-+ std %f14, [$ivec + 8]
-+___
-+$::code.=<<___;
-+ ret
-+ restore
-+
-+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+.align 32
-+.L${bits}_cbc_dec_loop2x:
-+ ldx [$inp + 0], %o0
-+ ldx [$inp + 8], %o1
-+ ldx [$inp + 16], %o2
-+ brz,pt $ileft, 4f
-+ ldx [$inp + 24], %o3
-+
-+ ldx [$inp + 32], %o4
-+ sllx %o0, $ileft, %o0
-+ srlx %o1, $iright, %g1
-+ or %g1, %o0, %o0
-+ sllx %o1, $ileft, %o1
-+ srlx %o2, $iright, %g1
-+ or %g1, %o1, %o1
-+ sllx %o2, $ileft, %o2
-+ srlx %o3, $iright, %g1
-+ or %g1, %o2, %o2
-+ sllx %o3, $ileft, %o3
-+ srlx %o4, $iright, %o4
-+ or %o4, %o3, %o3
-+4:
-+ xor %g4, %o0, %o4 ! ^= rk[0]
-+ xor %g5, %o1, %o5
-+ movxtod %o4, %f0
-+ movxtod %o5, %f2
-+ xor %g4, %o2, %o4
-+ xor %g5, %o3, %o5
-+ movxtod %o4, %f4
-+ movxtod %o5, %f6
-+
-+ prefetch [$out + 63], 22
-+ prefetch [$inp + 32+63], 20
-+ call _${alg}${bits}_decrypt_2x
-+ add $inp, 32, $inp
-+
-+ movxtod %o0, %f8
-+ movxtod %o1, %f10
-+ fxor %f12, %f0, %f0 ! ^= ivec
-+ fxor %f14, %f2, %f2
-+ movxtod %o2, %f12
-+ movxtod %o3, %f14
-+ fxor %f8, %f4, %f4
-+ fxor %f10, %f6, %f6
-+
-+ brnz,pn $ooff, 2f
-+ sub $len, 2, $len
-+
-+ std %f0, [$out + 0]
-+ std %f2, [$out + 8]
-+ std %f4, [$out + 16]
-+ std %f6, [$out + 24]
-+ brnz,pt $len, .L${bits}_cbc_dec_loop2x
-+ add $out, 32, $out
-+___
-+$::code.=<<___ if ($::evp);
-+ st %f12, [$ivec + 0]
-+ st %f13, [$ivec + 4]
-+ st %f14, [$ivec + 8]
-+ st %f15, [$ivec + 12]
-+___
-+$::code.=<<___ if (!$::evp);
-+ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
-+ nop
-+
-+ std %f12, [$ivec + 0] ! write out ivec
-+ std %f14, [$ivec + 8]
-+___
-+$::code.=<<___;
-+ ret
-+ restore
-+
-+.align 16
-+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
-+ ! and ~3x deterioration
-+ ! in inp==out case
-+ faligndata %f0, %f0, %f8 ! handle unaligned output
-+ faligndata %f0, %f2, %f0
-+ faligndata %f2, %f4, %f2
-+ faligndata %f4, %f6, %f4
-+ faligndata %f6, %f6, %f6
-+ stda %f8, [$out + $omask]0xc0 ! partial store
-+ std %f0, [$out + 8]
-+ std %f2, [$out + 16]
-+ std %f4, [$out + 24]
-+ add $out, 32, $out
-+ orn %g0, $omask, $omask
-+ stda %f6, [$out + $omask]0xc0 ! partial store
-+
-+ brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
-+ orn %g0, $omask, $omask
-+___
-+$::code.=<<___ if ($::evp);
-+ st %f12, [$ivec + 0]
-+ st %f13, [$ivec + 4]
-+ st %f14, [$ivec + 8]
-+ st %f15, [$ivec + 12]
-+___
-+$::code.=<<___ if (!$::evp);
-+ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
-+ nop
-+
-+ std %f12, [$ivec + 0] ! write out ivec
-+ std %f14, [$ivec + 8]
-+ ret
-+ restore
-+
-+.align 16
-+.L${bits}_cbc_dec_unaligned_ivec:
-+ alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
-+ mov 0xff, $omask
-+ srl $omask, $ivoff, $omask
-+ faligndata %f12, %f12, %f0
-+ faligndata %f12, %f14, %f2
-+ faligndata %f14, %f14, %f4
-+ stda %f0, [$ivec + $omask]0xc0
-+ std %f2, [$ivec + 8]
-+ add $ivec, 16, $ivec
-+ orn %g0, $omask, $omask
-+ stda %f4, [$ivec + $omask]0xc0
-+___
-+$::code.=<<___;
-+ ret
-+ restore
-+
-+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+.align 32
-+.L${bits}cbc_dec_blk:
-+ add $out, $len, $blk_init
-+ and $blk_init, 63, $blk_init ! tail
-+ sub $len, $blk_init, $len
-+ add $blk_init, 15, $blk_init ! round up to 16n
-+ srlx $len, 4, $len
-+ srl $blk_init, 4, $blk_init
-+ sub $len, 1, $len
-+ add $blk_init, 1, $blk_init
-+
-+.L${bits}_cbc_dec_blk_loop2x:
-+ ldx [$inp + 0], %o0
-+ ldx [$inp + 8], %o1
-+ ldx [$inp + 16], %o2
-+ brz,pt $ileft, 5f
-+ ldx [$inp + 24], %o3
-+
-+ ldx [$inp + 32], %o4
-+ sllx %o0, $ileft, %o0
-+ srlx %o1, $iright, %g1
-+ or %g1, %o0, %o0
-+ sllx %o1, $ileft, %o1
-+ srlx %o2, $iright, %g1
-+ or %g1, %o1, %o1
-+ sllx %o2, $ileft, %o2
-+ srlx %o3, $iright, %g1
-+ or %g1, %o2, %o2
-+ sllx %o3, $ileft, %o3
-+ srlx %o4, $iright, %o4
-+ or %o4, %o3, %o3
-+5:
-+ xor %g4, %o0, %o4 ! ^= rk[0]
-+ xor %g5, %o1, %o5
-+ movxtod %o4, %f0
-+ movxtod %o5, %f2
-+ xor %g4, %o2, %o4
-+ xor %g5, %o3, %o5
-+ movxtod %o4, %f4
-+ movxtod %o5, %f6
-+
-+ prefetch [$inp + 32+63], 20
-+ call _${alg}${bits}_decrypt_2x
-+ add $inp, 32, $inp
-+ subcc $len, 2, $len
-+
-+ movxtod %o0, %f8
-+ movxtod %o1, %f10
-+ fxor %f12, %f0, %f0 ! ^= ivec
-+ fxor %f14, %f2, %f2
-+ movxtod %o2, %f12
-+ movxtod %o3, %f14
-+ fxor %f8, %f4, %f4
-+ fxor %f10, %f6, %f6
-+
-+ stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ add $out, 8, $out
-+ stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ add $out, 8, $out
-+ stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ add $out, 8, $out
-+ stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ bgu,pt $::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
-+ add $out, 8, $out
-+
-+ add $blk_init, $len, $len
-+ andcc $len, 1, %g0 ! is number of blocks even?
-+ membar #StoreLoad|#StoreStore
-+ bnz,pt %icc, .L${bits}_cbc_dec_loop
-+ srl $len, 0, $len
-+ brnz,pn $len, .L${bits}_cbc_dec_loop2x
-+ nop
-+___
-+$::code.=<<___ if ($::evp);
-+ st %f12, [$ivec + 0] ! write out ivec
-+ st %f13, [$ivec + 4]
-+ st %f14, [$ivec + 8]
-+ st %f15, [$ivec + 12]
-+___
-+$::code.=<<___ if (!$::evp);
-+ brnz,pn $ivoff, 3b
-+ nop
-+
-+ std %f12, [$ivec + 0] ! write out ivec
-+ std %f14, [$ivec + 8]
-+___
-+$::code.=<<___;
-+ ret
-+ restore
-+.type ${alg}${bits}_t4_cbc_decrypt,#function
-+.size ${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
-+___
-+}
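For orientation, a reference model of the decrypt dataflow above, operating on 16-byte strings with Perl's string xor; block_decrypt() is a hypothetical stand-in for the _${alg}${bits}_decrypt_1x core:

    sub block_decrypt { return $_[0] }          # stub, for illustration only
    sub cbc_decrypt_ref {
        my ($iv, @ct) = @_;                     # IV and ciphertext blocks
        my @pt;
        for my $c (@ct) {
            push @pt, block_decrypt($c) ^ $iv;  # fxor %f12/%f14 with the result
            $iv = $c;                           # ciphertext becomes the next IV
        }
        return @pt;
    }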
-+
-+sub alg_ctr32_implement {
-+my ($alg,$bits) = @_;
-+
-+$::code.=<<___;
-+.globl ${alg}${bits}_t4_ctr32_encrypt
-+.align 32
-+${alg}${bits}_t4_ctr32_encrypt:
-+ save %sp, -$::frame, %sp
-+
-+ prefetch [$inp], 20
-+ prefetch [$inp + 63], 20
-+ call _${alg}${bits}_load_enckey
-+ sllx $len, 4, $len
-+
-+ ld [$ivec + 0], %l4 ! counter
-+ ld [$ivec + 4], %l5
-+ ld [$ivec + 8], %l6
-+ ld [$ivec + 12], %l7
-+
-+ sllx %l4, 32, %o5
-+ or %l5, %o5, %o5
-+ sllx %l6, 32, %g1
-+ xor %o5, %g4, %g4 ! ^= rk[0]
-+ xor %g1, %g5, %g5
-+ movxtod %g4, %f14 ! most significant 64 bits
-+
-+ sub $inp, $out, $blk_init ! $inp!=$out
-+ and $inp, 7, $ileft
-+ andn $inp, 7, $inp
-+ sll $ileft, 3, $ileft
-+ mov 64, $iright
-+ mov 0xff, $omask
-+ sub $iright, $ileft, $iright
-+ and $out, 7, $ooff
-+ cmp $len, 255
-+ movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
-+ movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
-+ brnz,pn $blk_init, .L${bits}_ctr32_blk ! $inp==$out)
-+ srl $omask, $ooff, $omask
-+
-+ andcc $len, 16, %g0 ! is number of blocks even?
-+ alignaddrl $out, %g0, $out
-+ bz %icc, .L${bits}_ctr32_loop2x
-+ srlx $len, 4, $len
-+.L${bits}_ctr32_loop:
-+ ldx [$inp + 0], %o0
-+ brz,pt $ileft, 4f
-+ ldx [$inp + 8], %o1
-+
-+ ldx [$inp + 16], %o2
-+ sllx %o0, $ileft, %o0
-+ srlx %o1, $iright, %g1
-+ sllx %o1, $ileft, %o1
-+ or %g1, %o0, %o0
-+ srlx %o2, $iright, %o2
-+ or %o2, %o1, %o1
-+4:
-+ xor %g5, %l7, %g1 ! ^= rk[0]
-+ add %l7, 1, %l7
-+ movxtod %g1, %f2
-+ srl %l7, 0, %l7 ! clruw
-+ prefetch [$out + 63], 22
-+ prefetch [$inp + 16+63], 20
-+___
-+$::code.=<<___ if ($alg eq "aes");
-+ aes_eround01 %f16, %f14, %f2, %f4
-+ aes_eround23 %f18, %f14, %f2, %f2
-+___
-+$::code.=<<___ if ($alg eq "cmll");
-+ camellia_f %f16, %f2, %f14, %f2
-+ camellia_f %f18, %f14, %f2, %f0
-+___
-+$::code.=<<___;
-+ call _${alg}${bits}_encrypt_1x+8
-+ add $inp, 16, $inp
-+
-+ movxtod %o0, %f10
-+ movxtod %o1, %f12
-+ fxor %f10, %f0, %f0 ! ^= inp
-+ fxor %f12, %f2, %f2
-+
-+ brnz,pn $ooff, 2f
-+ sub $len, 1, $len
-+
-+ std %f0, [$out + 0]
-+ std %f2, [$out + 8]
-+ brnz,pt $len, .L${bits}_ctr32_loop2x
-+ add $out, 16, $out
-+
-+ ret
-+ restore
-+
-+.align 16
-+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
-+ ! and ~3x deterioration
-+ ! in inp==out case
-+ faligndata %f0, %f0, %f4 ! handle unaligned output
-+ faligndata %f0, %f2, %f6
-+ faligndata %f2, %f2, %f8
-+ stda %f4, [$out + $omask]0xc0 ! partial store
-+ std %f6, [$out + 8]
-+ add $out, 16, $out
-+ orn %g0, $omask, $omask
-+ stda %f8, [$out + $omask]0xc0 ! partial store
-+
-+ brnz,pt $len, .L${bits}_ctr32_loop2x+4
-+ orn %g0, $omask, $omask
-+
-+ ret
-+ restore
-+
-+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+.align 32
-+.L${bits}_ctr32_loop2x:
-+ ldx [$inp + 0], %o0
-+ ldx [$inp + 8], %o1
-+ ldx [$inp + 16], %o2
-+ brz,pt $ileft, 4f
-+ ldx [$inp + 24], %o3
-+
-+ ldx [$inp + 32], %o4
-+ sllx %o0, $ileft, %o0
-+ srlx %o1, $iright, %g1
-+ or %g1, %o0, %o0
-+ sllx %o1, $ileft, %o1
-+ srlx %o2, $iright, %g1
-+ or %g1, %o1, %o1
-+ sllx %o2, $ileft, %o2
-+ srlx %o3, $iright, %g1
-+ or %g1, %o2, %o2
-+ sllx %o3, $ileft, %o3
-+ srlx %o4, $iright, %o4
-+ or %o4, %o3, %o3
-+4:
-+ xor %g5, %l7, %g1 ! ^= rk[0]
-+ add %l7, 1, %l7
-+ movxtod %g1, %f2
-+ srl %l7, 0, %l7 ! clruw
-+ xor %g5, %l7, %g1
-+ add %l7, 1, %l7
-+ movxtod %g1, %f6
-+ srl %l7, 0, %l7 ! clruw
-+ prefetch [$out + 63], 22
-+ prefetch [$inp + 32+63], 20
-+___
-+$::code.=<<___ if ($alg eq "aes");
-+ aes_eround01 %f16, %f14, %f2, %f8
-+ aes_eround23 %f18, %f14, %f2, %f2
-+ aes_eround01 %f16, %f14, %f6, %f10
-+ aes_eround23 %f18, %f14, %f6, %f6
-+___
-+$::code.=<<___ if ($alg eq "cmll");
-+ camellia_f %f16, %f2, %f14, %f2
-+ camellia_f %f16, %f6, %f14, %f6
-+ camellia_f %f18, %f14, %f2, %f0
-+ camellia_f %f18, %f14, %f6, %f4
-+___
-+$::code.=<<___;
-+ call _${alg}${bits}_encrypt_2x+16
-+ add $inp, 32, $inp
-+
-+ movxtod %o0, %f8
-+ movxtod %o1, %f10
-+ movxtod %o2, %f12
-+ fxor %f8, %f0, %f0 ! ^= inp
-+ movxtod %o3, %f8
-+ fxor %f10, %f2, %f2
-+ fxor %f12, %f4, %f4
-+ fxor %f8, %f6, %f6
-+
-+ brnz,pn $ooff, 2f
-+ sub $len, 2, $len
-+
-+ std %f0, [$out + 0]
-+ std %f2, [$out + 8]
-+ std %f4, [$out + 16]
-+ std %f6, [$out + 24]
-+ brnz,pt $len, .L${bits}_ctr32_loop2x
-+ add $out, 32, $out
-+
-+ ret
-+ restore
-+
-+.align 16
-+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
-+ ! and ~3x deterioration
-+ ! in inp==out case
-+ faligndata %f0, %f0, %f8 ! handle unaligned output
-+ faligndata %f0, %f2, %f0
-+ faligndata %f2, %f4, %f2
-+ faligndata %f4, %f6, %f4
-+ faligndata %f6, %f6, %f6
-+
-+ stda %f8, [$out + $omask]0xc0 ! partial store
-+ std %f0, [$out + 8]
-+ std %f2, [$out + 16]
-+ std %f4, [$out + 24]
-+ add $out, 32, $out
-+ orn %g0, $omask, $omask
-+ stda %f6, [$out + $omask]0xc0 ! partial store
-+
-+ brnz,pt $len, .L${bits}_ctr32_loop2x+4
-+ orn %g0, $omask, $omask
-+
-+ ret
-+ restore
-+
-+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+.align 32
-+.L${bits}_ctr32_blk:
-+ add $out, $len, $blk_init
-+ and $blk_init, 63, $blk_init ! tail
-+ sub $len, $blk_init, $len
-+ add $blk_init, 15, $blk_init ! round up to 16n
-+ srlx $len, 4, $len
-+ srl $blk_init, 4, $blk_init
-+ sub $len, 1, $len
-+ add $blk_init, 1, $blk_init
-+
-+.L${bits}_ctr32_blk_loop2x:
-+ ldx [$inp + 0], %o0
-+ ldx [$inp + 8], %o1
-+ ldx [$inp + 16], %o2
-+ brz,pt $ileft, 5f
-+ ldx [$inp + 24], %o3
-+
-+ ldx [$inp + 32], %o4
-+ sllx %o0, $ileft, %o0
-+ srlx %o1, $iright, %g1
-+ or %g1, %o0, %o0
-+ sllx %o1, $ileft, %o1
-+ srlx %o2, $iright, %g1
-+ or %g1, %o1, %o1
-+ sllx %o2, $ileft, %o2
-+ srlx %o3, $iright, %g1
-+ or %g1, %o2, %o2
-+ sllx %o3, $ileft, %o3
-+ srlx %o4, $iright, %o4
-+ or %o4, %o3, %o3
-+5:
-+ xor %g5, %l7, %g1 ! ^= rk[0]
-+ add %l7, 1, %l7
-+ movxtod %g1, %f2
-+ srl %l7, 0, %l7 ! clruw
-+ xor %g5, %l7, %g1
-+ add %l7, 1, %l7
-+ movxtod %g1, %f6
-+ srl %l7, 0, %l7 ! clruw
-+ prefetch [$inp + 32+63], 20
-+___
-+$::code.=<<___ if ($alg eq "aes");
-+ aes_eround01 %f16, %f14, %f2, %f8
-+ aes_eround23 %f18, %f14, %f2, %f2
-+ aes_eround01 %f16, %f14, %f6, %f10
-+ aes_eround23 %f18, %f14, %f6, %f6
-+___
-+$::code.=<<___ if ($alg eq "cmll");
-+ camellia_f %f16, %f2, %f14, %f2
-+ camellia_f %f16, %f6, %f14, %f6
-+ camellia_f %f18, %f14, %f2, %f0
-+ camellia_f %f18, %f14, %f6, %f4
-+___
-+$::code.=<<___;
-+ call _${alg}${bits}_encrypt_2x+16
-+ add $inp, 32, $inp
-+ subcc $len, 2, $len
-+
-+ movxtod %o0, %f8
-+ movxtod %o1, %f10
-+ movxtod %o2, %f12
-+ fxor %f8, %f0, %f0 ! ^= inp
-+ movxtod %o3, %f8
-+ fxor %f10, %f2, %f2
-+ fxor %f12, %f4, %f4
-+ fxor %f8, %f6, %f6
-+
-+ stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ add $out, 8, $out
-+ stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ add $out, 8, $out
-+ stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ add $out, 8, $out
-+ stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ bgu,pt $::size_t_cc, .L${bits}_ctr32_blk_loop2x
-+ add $out, 8, $out
-+
-+ add $blk_init, $len, $len
-+ andcc $len, 1, %g0 ! is number of blocks even?
-+ membar #StoreLoad|#StoreStore
-+ bnz,pt %icc, .L${bits}_ctr32_loop
-+ srl $len, 0, $len
-+ brnz,pn $len, .L${bits}_ctr32_loop2x
-+ nop
-+
-+ ret
-+ restore
-+.type ${alg}${bits}_t4_ctr32_encrypt,#function
-+.size ${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
-+___
-+}
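The "srl %l7, 0, %l7 ! clruw" after each increment confines the counter to the low 32 bits of the IV; in scalar form:

    my $ctr = 0xffffffff;
    $ctr = ($ctr + 1) & 0xffffffff;   # add %l7, 1, %l7 ; srl %l7, 0, %l7
    # $ctr is now 0; the upper 96 bits of the IV are never carried into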
-+
-+sub alg_xts_implement {
-+my ($alg,$bits,$dir) = @_;
-+my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
-+my $rem=$ivec;
-+
-+$::code.=<<___;
-+.globl ${alg}${bits}_t4_xts_${dir}crypt
-+.align 32
-+${alg}${bits}_t4_xts_${dir}crypt:
-+ save %sp, -$::frame-16, %sp
-+
-+ mov $ivec, %o0
-+ add %fp, $::bias-16, %o1
-+ call ${alg}_t4_encrypt
-+ mov $key2, %o2
-+
-+ add %fp, $::bias-16, %l7
-+ ldxa [%l7]0x88, %g2
-+ add %fp, $::bias-8, %l7
-+ ldxa [%l7]0x88, %g3 ! %g3:%g2 is tweak
-+
-+ sethi %hi(0x76543210), %l7
-+ or %l7, %lo(0x76543210), %l7
-+ bmask %l7, %g0, %g0 ! byte swap mask
-+
-+ prefetch [$inp], 20
-+ prefetch [$inp + 63], 20
-+ call _${alg}${bits}_load_${dir}ckey
-+ and $len, 15, $rem
-+ and $len, -16, $len
-+___
-+$code.=<<___ if ($dir eq "de");
-+ mov 0, %l7
-+ movrnz $rem, 16, %l7
-+ sub $len, %l7, $len
-+___
-+$code.=<<___;
-+
-+ sub $inp, $out, $blk_init ! $inp!=$out
-+ and $inp, 7, $ileft
-+ andn $inp, 7, $inp
-+ sll $ileft, 3, $ileft
-+ mov 64, $iright
-+ mov 0xff, $omask
-+ sub $iright, $ileft, $iright
-+ and $out, 7, $ooff
-+ cmp $len, 255
-+ movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
-+ movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
-+ brnz,pn $blk_init, .L${bits}_xts_${dir}blk ! $inp==$out)
-+ srl $omask, $ooff, $omask
-+
-+ andcc $len, 16, %g0 ! is number of blocks even?
-+___
-+$code.=<<___ if ($dir eq "de");
-+ brz,pn $len, .L${bits}_xts_${dir}steal
-+___
-+$code.=<<___;
-+ alignaddrl $out, %g0, $out
-+ bz %icc, .L${bits}_xts_${dir}loop2x
-+ srlx $len, 4, $len
-+.L${bits}_xts_${dir}loop:
-+ ldx [$inp + 0], %o0
-+ brz,pt $ileft, 4f
-+ ldx [$inp + 8], %o1
-+
-+ ldx [$inp + 16], %o2
-+ sllx %o0, $ileft, %o0
-+ srlx %o1, $iright, %g1
-+ sllx %o1, $ileft, %o1
-+ or %g1, %o0, %o0
-+ srlx %o2, $iright, %o2
-+ or %o2, %o1, %o1
-+4:
-+ movxtod %g2, %f12
-+ movxtod %g3, %f14
-+ bshuffle %f12, %f12, %f12
-+ bshuffle %f14, %f14, %f14
-+
-+ xor %g4, %o0, %o0 ! ^= rk[0]
-+ xor %g5, %o1, %o1
-+ movxtod %o0, %f0
-+ movxtod %o1, %f2
-+
-+ fxor %f12, %f0, %f0 ! ^= tweak[0]
-+ fxor %f14, %f2, %f2
-+
-+ prefetch [$out + 63], 22
-+ prefetch [$inp + 16+63], 20
-+ call _${alg}${bits}_${dir}crypt_1x
-+ add $inp, 16, $inp
-+
-+ fxor %f12, %f0, %f0 ! ^= tweak[0]
-+ fxor %f14, %f2, %f2
-+
-+ srax %g3, 63, %l7 ! next tweak value
-+ addcc %g2, %g2, %g2
-+ and %l7, 0x87, %l7
-+ addxc %g3, %g3, %g3
-+ xor %l7, %g2, %g2
-+
-+ brnz,pn $ooff, 2f
-+ sub $len, 1, $len
-+
-+ std %f0, [$out + 0]
-+ std %f2, [$out + 8]
-+ brnz,pt $len, .L${bits}_xts_${dir}loop2x
-+ add $out, 16, $out
-+
-+ brnz,pn $rem, .L${bits}_xts_${dir}steal
-+ nop
-+
-+ ret
-+ restore
-+
-+.align 16
-+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
-+ ! and ~3x deterioration
-+ ! in inp==out case
-+ faligndata %f0, %f0, %f4 ! handle unaligned output
-+ faligndata %f0, %f2, %f6
-+ faligndata %f2, %f2, %f8
-+ stda %f4, [$out + $omask]0xc0 ! partial store
-+ std %f6, [$out + 8]
-+ add $out, 16, $out
-+ orn %g0, $omask, $omask
-+ stda %f8, [$out + $omask]0xc0 ! partial store
-+
-+ brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
-+ orn %g0, $omask, $omask
-+
-+ brnz,pn $rem, .L${bits}_xts_${dir}steal
-+ nop
-+
-+ ret
-+ restore
-+
-+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+.align 32
-+.L${bits}_xts_${dir}loop2x:
-+ ldx [$inp + 0], %o0
-+ ldx [$inp + 8], %o1
-+ ldx [$inp + 16], %o2
-+ brz,pt $ileft, 4f
-+ ldx [$inp + 24], %o3
-+
-+ ldx [$inp + 32], %o4
-+ sllx %o0, $ileft, %o0
-+ srlx %o1, $iright, %g1
-+ or %g1, %o0, %o0
-+ sllx %o1, $ileft, %o1
-+ srlx %o2, $iright, %g1
-+ or %g1, %o1, %o1
-+ sllx %o2, $ileft, %o2
-+ srlx %o3, $iright, %g1
-+ or %g1, %o2, %o2
-+ sllx %o3, $ileft, %o3
-+ srlx %o4, $iright, %o4
-+ or %o4, %o3, %o3
-+4:
-+ movxtod %g2, %f12
-+ movxtod %g3, %f14
-+ bshuffle %f12, %f12, %f12
-+ bshuffle %f14, %f14, %f14
-+
-+ srax %g3, 63, %l7 ! next tweak value
-+ addcc %g2, %g2, %g2
-+ and %l7, 0x87, %l7
-+ addxc %g3, %g3, %g3
-+ xor %l7, %g2, %g2
-+
-+ movxtod %g2, %f8
-+ movxtod %g3, %f10
-+ bshuffle %f8, %f8, %f8
-+ bshuffle %f10, %f10, %f10
-+
-+ xor %g4, %o0, %o0 ! ^= rk[0]
-+ xor %g5, %o1, %o1
-+ xor %g4, %o2, %o2 ! ^= rk[0]
-+ xor %g5, %o3, %o3
-+ movxtod %o0, %f0
-+ movxtod %o1, %f2
-+ movxtod %o2, %f4
-+ movxtod %o3, %f6
-+
-+ fxor %f12, %f0, %f0 ! ^= tweak[0]
-+ fxor %f14, %f2, %f2
-+ fxor %f8, %f4, %f4 ! ^= tweak[0]
-+ fxor %f10, %f6, %f6
-+
-+ prefetch [$out + 63], 22
-+ prefetch [$inp + 32+63], 20
-+ call _${alg}${bits}_${dir}crypt_2x
-+ add $inp, 32, $inp
-+
-+ movxtod %g2, %f8
-+ movxtod %g3, %f10
-+
-+ srax %g3, 63, %l7 ! next tweak value
-+ addcc %g2, %g2, %g2
-+ and %l7, 0x87, %l7
-+ addxc %g3, %g3, %g3
-+ xor %l7, %g2, %g2
-+
-+ bshuffle %f8, %f8, %f8
-+ bshuffle %f10, %f10, %f10
-+
-+ fxor %f12, %f0, %f0 ! ^= tweak[0]
-+ fxor %f14, %f2, %f2
-+ fxor %f8, %f4, %f4
-+ fxor %f10, %f6, %f6
-+
-+ brnz,pn $ooff, 2f
-+ sub $len, 2, $len
-+
-+ std %f0, [$out + 0]
-+ std %f2, [$out + 8]
-+ std %f4, [$out + 16]
-+ std %f6, [$out + 24]
-+ brnz,pt $len, .L${bits}_xts_${dir}loop2x
-+ add $out, 32, $out
-+
-+ fsrc2 %f4, %f0
-+ fsrc2 %f6, %f2
-+ brnz,pn $rem, .L${bits}_xts_${dir}steal
-+ nop
-+
-+ ret
-+ restore
-+
-+.align 16
-+2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
-+ ! and ~3x deterioration
-+ ! in inp==out case
-+ faligndata %f0, %f0, %f8 ! handle unaligned output
-+ faligndata %f0, %f2, %f10
-+ faligndata %f2, %f4, %f12
-+ faligndata %f4, %f6, %f14
-+ faligndata %f6, %f6, %f0
-+
-+ stda %f8, [$out + $omask]0xc0 ! partial store
-+ std %f10, [$out + 8]
-+ std %f12, [$out + 16]
-+ std %f14, [$out + 24]
-+ add $out, 32, $out
-+ orn %g0, $omask, $omask
-+ stda %f0, [$out + $omask]0xc0 ! partial store
-+
-+ brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
-+ orn %g0, $omask, $omask
-+
-+ fsrc2 %f4, %f0
-+ fsrc2 %f6, %f2
-+ brnz,pn $rem, .L${bits}_xts_${dir}steal
-+ nop
-+
-+ ret
-+ restore
-+
-+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+.align 32
-+.L${bits}_xts_${dir}blk:
-+ add $out, $len, $blk_init
-+ and $blk_init, 63, $blk_init ! tail
-+ sub $len, $blk_init, $len
-+ add $blk_init, 15, $blk_init ! round up to 16n
-+ srlx $len, 4, $len
-+ srl $blk_init, 4, $blk_init
-+ sub $len, 1, $len
-+ add $blk_init, 1, $blk_init
-+
-+.L${bits}_xts_${dir}blk2x:
-+ ldx [$inp + 0], %o0
-+ ldx [$inp + 8], %o1
-+ ldx [$inp + 16], %o2
-+ brz,pt $ileft, 5f
-+ ldx [$inp + 24], %o3
-+
-+ ldx [$inp + 32], %o4
-+ sllx %o0, $ileft, %o0
-+ srlx %o1, $iright, %g1
-+ or %g1, %o0, %o0
-+ sllx %o1, $ileft, %o1
-+ srlx %o2, $iright, %g1
-+ or %g1, %o1, %o1
-+ sllx %o2, $ileft, %o2
-+ srlx %o3, $iright, %g1
-+ or %g1, %o2, %o2
-+ sllx %o3, $ileft, %o3
-+ srlx %o4, $iright, %o4
-+ or %o4, %o3, %o3
-+5:
-+ movxtod %g2, %f12
-+ movxtod %g3, %f14
-+ bshuffle %f12, %f12, %f12
-+ bshuffle %f14, %f14, %f14
-+
-+ srax %g3, 63, %l7 ! next tweak value
-+ addcc %g2, %g2, %g2
-+ and %l7, 0x87, %l7
-+ addxc %g3, %g3, %g3
-+ xor %l7, %g2, %g2
-+
-+ movxtod %g2, %f8
-+ movxtod %g3, %f10
-+ bshuffle %f8, %f8, %f8
-+ bshuffle %f10, %f10, %f10
-+
-+ xor %g4, %o0, %o0 ! ^= rk[0]
-+ xor %g5, %o1, %o1
-+ xor %g4, %o2, %o2 ! ^= rk[0]
-+ xor %g5, %o3, %o3
-+ movxtod %o0, %f0
-+ movxtod %o1, %f2
-+ movxtod %o2, %f4
-+ movxtod %o3, %f6
-+
-+ fxor %f12, %f0, %f0 ! ^= tweak[0]
-+ fxor %f14, %f2, %f2
-+ fxor %f8, %f4, %f4 ! ^= tweak[0]
-+ fxor %f10, %f6, %f6
-+
-+ prefetch [$inp + 32+63], 20
-+ call _${alg}${bits}_${dir}crypt_2x
-+ add $inp, 32, $inp
-+
-+ movxtod %g2, %f8
-+ movxtod %g3, %f10
-+
-+ srax %g3, 63, %l7 ! next tweak value
-+ addcc %g2, %g2, %g2
-+ and %l7, 0x87, %l7
-+ addxc %g3, %g3, %g3
-+ xor %l7, %g2, %g2
-+
-+ bshuffle %f8, %f8, %f8
-+ bshuffle %f10, %f10, %f10
-+
-+ fxor %f12, %f0, %f0 ! ^= tweak[0]
-+ fxor %f14, %f2, %f2
-+ fxor %f8, %f4, %f4
-+ fxor %f10, %f6, %f6
-+
-+ stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ add $out, 8, $out
-+ stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ add $out, 8, $out
-+ stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ add $out, 8, $out
-+ stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
-+ bgu,pt $::size_t_cc, .L${bits}_xts_${dir}blk2x
-+ add $out, 8, $out
-+
-+ add $blk_init, $len, $len
-+ andcc $len, 1, %g0 ! is number of blocks even?
-+ membar #StoreLoad|#StoreStore
-+ bnz,pt %icc, .L${bits}_xts_${dir}loop
-+ srl $len, 0, $len
-+ brnz,pn $len, .L${bits}_xts_${dir}loop2x
-+ nop
-+
-+ fsrc2 %f4, %f0
-+ fsrc2 %f6, %f2
-+ brnz,pn $rem, .L${bits}_xts_${dir}steal
-+ nop
-+
-+ ret
-+ restore
-+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-+___
-+$code.=<<___ if ($dir eq "en");
-+.align 32
-+.L${bits}_xts_${dir}steal:
-+ std %f0, [%fp + $::bias-16] ! copy of output
-+ std %f2, [%fp + $::bias-8]
-+
-+ srl $ileft, 3, $ileft
-+ add %fp, $::bias-16, %l7
-+ add $inp, $ileft, $inp ! original $inp+$len&-15
-+ add $out, $ooff, $out ! original $out+$len&-15
-+ mov 0, $ileft
-+ nop ! align
-+
-+.L${bits}_xts_${dir}stealing:
-+ ldub [$inp + $ileft], %o0
-+ ldub [%l7 + $ileft], %o1
-+ dec $rem
-+ stb %o0, [%l7 + $ileft]
-+ stb %o1, [$out + $ileft]
-+ brnz $rem, .L${bits}_xts_${dir}stealing
-+ inc $ileft
-+
-+ mov %l7, $inp
-+ sub $out, 16, $out
-+ mov 0, $ileft
-+ sub $out, $ooff, $out
-+ ba .L${bits}_xts_${dir}loop ! one more time
-+ mov 1, $len ! $rem is 0
-+___
-+$code.=<<___ if ($dir eq "de");
-+.align 32
-+.L${bits}_xts_${dir}steal:
-+ ldx [$inp + 0], %o0
-+ brz,pt $ileft, 8f
-+ ldx [$inp + 8], %o1
-+
-+ ldx [$inp + 16], %o2
-+ sllx %o0, $ileft, %o0
-+ srlx %o1, $iright, %g1
-+ sllx %o1, $ileft, %o1
-+ or %g1, %o0, %o0
-+ srlx %o2, $iright, %o2
-+ or %o2, %o1, %o1
-+8:
-+ srax %g3, 63, %l7 ! next tweak value
-+ addcc %g2, %g2, %o2
-+ and %l7, 0x87, %l7
-+ addxc %g3, %g3, %o3
-+ xor %l7, %o2, %o2
-+
-+ movxtod %o2, %f12
-+ movxtod %o3, %f14
-+ bshuffle %f12, %f12, %f12
-+ bshuffle %f14, %f14, %f14
-+
-+ xor %g4, %o0, %o0 ! ^= rk[0]
-+ xor %g5, %o1, %o1
-+ movxtod %o0, %f0
-+ movxtod %o1, %f2
-+
-+ fxor %f12, %f0, %f0 ! ^= tweak[0]
-+ fxor %f14, %f2, %f2
-+
-+ call _${alg}${bits}_${dir}crypt_1x
-+ add $inp, 16, $inp
-+
-+ fxor %f12, %f0, %f0 ! ^= tweak[0]
-+ fxor %f14, %f2, %f2
-+
-+ std %f0, [%fp + $::bias-16]
-+ std %f2, [%fp + $::bias-8]
-+
-+ srl $ileft, 3, $ileft
-+ add %fp, $::bias-16, %l7
-+ add $inp, $ileft, $inp ! original $inp+$len&-15
-+ add $out, $ooff, $out ! original $out+$len&-15
-+ mov 0, $ileft
-+ add $out, 16, $out
-+ nop ! align
-+
-+.L${bits}_xts_${dir}stealing:
-+ ldub [$inp + $ileft], %o0
-+ ldub [%l7 + $ileft], %o1
-+ dec $rem
-+ stb %o0, [%l7 + $ileft]
-+ stb %o1, [$out + $ileft]
-+ brnz $rem, .L${bits}_xts_${dir}stealing
-+ inc $ileft
-+
-+ mov %l7, $inp
-+ sub $out, 16, $out
-+ mov 0, $ileft
-+ sub $out, $ooff, $out
-+ ba .L${bits}_xts_${dir}loop ! one more time
-+ mov 1, $len ! $rem is 0
-+___
-+$code.=<<___;
-+ ret
-+ restore
-+.type ${alg}${bits}_t4_xts_${dir}crypt,#function
-+.size ${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
-+___
-+}
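The srax/addcc/addxc/xor sequence that computes the next tweak value is multiplication by x in GF(2^128) with the XTS feedback constant 0x87. A scalar sketch over two 64-bit halves, %g2 low and %g3 high (needs a 64-bit perl):

    sub xts_next_tweak {
        my ($lo, $hi) = @_;                               # %g2, %g3
        my $fb    = (($hi >> 63) & 1) * 0x87;             # srax %g3, 63 ; and ..., 0x87
        my $carry = ($lo >> 63) & 1;                      # addcc %g2, %g2, %g2 (carry out)
        $lo = (($lo << 1) & 0xffffffffffffffff) ^ $fb;    # xor %l7, %g2, %g2
        $hi = (($hi << 1) | $carry) & 0xffffffffffffffff; # addxc %g3, %g3, %g3
        return ($lo, $hi);
    }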
-+
-+# The purpose of these subroutines is to explicitly encode VIS
-+# instructions, so that the module can be compiled without having to
-+# specify VIS extensions on the compiler command line, e.g. -xarch=v9
-+# vs. -xarch=v9a. The idea is to keep open the option of producing a
-+# "universal" binary and let the programmer detect at run-time whether
-+# the current CPU is VIS-capable.
-+sub unvis {
-+my ($mnemonic,$rs1,$rs2,$rd)=@_;
-+my ($ref,$opf);
-+my %visopf = ( "faligndata" => 0x048,
-+ "bshuffle" => 0x04c,
-+ "fnot2" => 0x066,
-+ "fxor" => 0x06c,
-+ "fsrc2" => 0x078 );
-+
-+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
-+
-+ if ($opf=$visopf{$mnemonic}) {
-+ foreach ($rs1,$rs2,$rd) {
-+ return $ref if (!/%f([0-9]{1,2})/);
-+ $_=$1;
-+ if ($1>=32) {
-+ return $ref if ($1&1);
-+ # re-encode for upper double register addressing
-+ $_=($1|$1>>5)&31;
-+ }
-+ }
-+
-+ return sprintf ".word\t0x%08x !%s",
-+ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
-+ $ref;
-+ } else {
-+ return $ref;
-+ }
-+}
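A worked example, computed from the encoding above (fxor has opf 0x06c; rs1=0, rs2=2, rd=4):

    print unvis("fxor", "%f0", "%f2", "%f4"), "\n";
    # .word 0x89b00d82 !fxor %f0,%f2,%f4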
-+
-+sub unvis3 {
-+my ($mnemonic,$rs1,$rs2,$rd)=@_;
-+my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
-+my ($ref,$opf);
-+my %visopf = ( "addxc" => 0x011,
-+ "addxccc" => 0x013,
-+ "umulxhi" => 0x016,
-+ "alignaddr" => 0x018,
-+ "bmask" => 0x019,
-+ "alignaddrl" => 0x01a );
-+
-+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
-+
-+ if ($opf=$visopf{$mnemonic}) {
-+ foreach ($rs1,$rs2,$rd) {
-+ return $ref if (!/%([goli])([0-9])/);
-+ $_=$bias{$1}+$2;
-+ }
-+
-+ return sprintf ".word\t0x%08x !%s",
-+ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
-+ $ref;
-+ } else {
-+ return $ref;
-+ }
-+}
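And for the VIS3 helper, with integer registers mapped through %bias (e.g. %g2 -> 2; addxc has opf 0x011):

    print unvis3("addxc", "%g2", "%g2", "%g2"), "\n";
    # .word 0x85b08222 !addxc %g2,%g2,%g2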
-+
-+sub unaes_round { # 4-argument instructions
-+my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
-+my ($ref,$opf);
-+my %aesopf = ( "aes_eround01" => 0,
-+ "aes_eround23" => 1,
-+ "aes_dround01" => 2,
-+ "aes_dround23" => 3,
-+ "aes_eround01_l"=> 4,
-+ "aes_eround23_l"=> 5,
-+ "aes_dround01_l"=> 6,
-+ "aes_dround23_l"=> 7,
-+ "aes_kexpand1" => 8 );
-+
-+ $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
-+
-+ if (defined($opf=$aesopf{$mnemonic})) {
-+ $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
-+ foreach ($rs1,$rs2,$rd) {
-+ return $ref if (!/%f([0-9]{1,2})/);
-+ $_=$1;
-+ if ($1>=32) {
-+ return $ref if ($1&1);
-+ # re-encode for upper double register addressing
-+ $_=($1|$1>>5)&31;
-+ }
-+ }
-+
-+ return sprintf ".word\t0x%08x !%s",
-+ 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
-+ $ref;
-+ } else {
-+ return $ref;
-+ }
-+}
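Likewise for the 4-argument AES rounds, again computed from the encoding above (aes_eround01 has opf 0; rs3 must be an even register):

    print unaes_round("aes_eround01", "%f16", "%f14", "%f2", "%f4"), "\n";
    # .word 0x88cc040e !aes_eround01 %f16,%f14,%f2,%f4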
-+
-+sub unaes_kexpand { # 3-argument instructions
-+my ($mnemonic,$rs1,$rs2,$rd)=@_;
-+my ($ref,$opf);
-+my %aesopf = ( "aes_kexpand0" => 0x130,
-+ "aes_kexpand2" => 0x131 );
-+
-+ $ref = "$mnemonic\t$rs1,$rs2,$rd";
-+
-+ if (defined($opf=$aesopf{$mnemonic})) {
-+ foreach ($rs1,$rs2,$rd) {
-+ return $ref if (!/%f([0-9]{1,2})/);
-+ $_=$1;
-+ if ($1>=32) {
-+ return $ref if ($1&1);
-+ # re-encode for upper double register addressing
-+ $_=($1|$1>>5)&31;
-+ }
-+ }
-+
-+ return sprintf ".word\t0x%08x !%s",
-+ 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
-+ $ref;
-+ } else {
-+ return $ref;
-+ }
-+}
-+
-+sub uncamellia_f { # 4-argument instructions
-+my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
-+my ($ref,$opf);
-+
-+ $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
-+
-+ if (1) {
-+ $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
-+ foreach ($rs1,$rs2,$rd) {
-+ return $ref if (!/%f([0-9]{1,2})/);
-+ $_=$1;
@@ Diff output truncated at 100000 characters. @@