SF.net SVN: gar:[25051] csw/mgar/pkg/openssl1/trunk/files

janholzh at users.sourceforge.net
Tue Jun 2 09:53:16 CEST 2015


Revision: 25051
          http://sourceforge.net/p/gar/code/25051
Author:   janholzh
Date:     2015-06-02 07:53:16 +0000 (Tue, 02 Jun 2015)
Log Message:
-----------
openssl1/trunk: Oracle moved patches around

Modified Paths:
--------------
    csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-t4-engine.sparc.5.11.patch
    csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-wanboot.patch
    csw/mgar/pkg/openssl1/trunk/files/update-t4-patch.sh
    csw/mgar/pkg/openssl1/trunk/files/update-wanboot-patch.sh
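
The refreshed t4-engine patch below adds crypto/sparc_arch.h, which defines the Compatibility Feature Register (CFR) bit masks that the T4 assembler modules test at run time (the MD5 module, for instance, checks CFR_MD5 in OPENSSL_sparcv9cap_P[1] before taking the hardware path). Purely as an illustrative sketch, and not part of this commit, a C-level check against those masks could look like the following; the array contents are a stand-in here, since in OpenSSL they are filled in by the sparcv9 capability probe:

    /* Illustrative sketch only, not part of this commit.  The CFR_* masks
     * mirror the values from the crypto/sparc_arch.h added by this patch,
     * and the check mirrors what the T4 modules do with
     * OPENSSL_sparcv9cap_P[1]. */
    #include <stdio.h>

    #define CFR_AES 0x00000001      /* Supports AES opcodes */
    #define CFR_DES 0x00000002      /* Supports DES opcodes */
    #define CFR_MD5 0x00000010      /* Supports MD5 opcodes */

    /* Stand-in: in OpenSSL this array is populated by the sparcv9
     * capability probe; here it is zeroed so the example is self-contained. */
    static unsigned int OPENSSL_sparcv9cap_P[2] = { 0, 0 };

    int main(void)
    {
        unsigned int cfr = OPENSSL_sparcv9cap_P[1];

        printf("T4 AES opcodes: %s\n", (cfr & CFR_AES) ? "yes" : "no");
        printf("T4 DES opcodes: %s\n", (cfr & CFR_DES) ? "yes" : "no");
        printf("T4 MD5 opcodes: %s\n", (cfr & CFR_MD5) ? "yes" : "no");
        return 0;
    }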

Modified: csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-t4-engine.sparc.5.11.patch
===================================================================
--- csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-t4-engine.sparc.5.11.patch	2015-06-02 07:41:26 UTC (rev 25050)
+++ csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-t4-engine.sparc.5.11.patch	2015-06-02 07:53:16 UTC (rev 25051)
@@ -2227,3 +2227,5563 @@
      {ERR_FUNC(EVP_F_AES_XTS), "AES_XTS"},
      {ERR_FUNC(EVP_F_AES_XTS_CIPHER), "AES_XTS_CIPHER"},
      {ERR_FUNC(EVP_F_ALG_MODULE_INIT), "ALG_MODULE_INIT"},
+Index: crypto/sparc_arch.h
+===================================================================
+diff -uNr openssl-1.0.1m/crypto/sparc_arch.h openssl-1.0.1m/crypto/sparc_arch.h
+--- openssl-1.0.1m/crypto/sparc_arch.h 1970-01-01 01:00:00.000000000 +0100
++++ openssl-1.0.1m/crypto/sparc_arch.h 2015-06-02 09:52:11.809148396 +0200
+@@ -0,0 +1,101 @@
++#ifndef __SPARC_ARCH_H__
++#define	__SPARC_ARCH_H__
++
++#define	SPARCV9_TICK_PRIVILEGED	(1<<0)
++#define	SPARCV9_PREFER_FPU	(1<<1)
++#define	SPARCV9_VIS1		(1<<2)
++#define	SPARCV9_VIS2		(1<<3)	/* reserved */
++#define	SPARCV9_FMADD		(1<<4)	/* reserved for SPARC64 V */
++#define	SPARCV9_BLK		(1<<5)	/* VIS1 block copy */
++#define	SPARCV9_VIS3		(1<<6)
++#define	SPARCV9_RANDOM		(1<<7)
++#define	SPARCV9_64BIT_STACK	(1<<8)
++
++/*
++ * OPENSSL_sparcv9cap_P[1] is copy of Compatibility Feature Register,
++ * %asr26, SPARC-T4 and later. There is no SPARCV9_CFR bit in
++ * OPENSSL_sparcv9cap_P[0], as %cfr copy is sufficient...
++ */
++#define	CFR_AES		0x00000001 /* Supports AES opcodes	*/
++#define	CFR_DES		0x00000002 /* Supports DES opcodes	*/
++#define	CFR_KASUMI	0x00000004 /* Supports KASUMI opcodes	*/
++#define	CFR_CAMELLIA	0x00000008 /* Supports CAMELLIA opcodes	*/
++#define	CFR_MD5		0x00000010 /* Supports MD5 opcodes	*/
++#define	CFR_SHA1	0x00000020 /* Supports SHA1 opcodes	*/
++#define	CFR_SHA256	0x00000040 /* Supports SHA256 opcodes	*/
++#define	CFR_SHA512	0x00000080 /* Supports SHA512 opcodes	*/
++#define	CFR_MPMUL	0x00000100 /* Supports MPMUL opcodes	*/
++#define	CFR_MONTMUL	0x00000200 /* Supports MONTMUL opcodes	*/
++#define	CFR_MONTSQR	0x00000400 /* Supports MONTSQR opcodes	*/
++#define	CFR_CRC32C	0x00000800 /* Supports CRC32C opcodes	*/
++
++#if defined(OPENSSL_PIC) && !defined(__PIC__)
++#define	__PIC__
++#endif
++
++#if defined(__SUNPRO_C) && defined(__sparcv9) && !defined(__arch64__)
++#define	__arch64__
++#endif
++
++#define	SPARC_PIC_THUNK(reg)	\
++	.align	32;		\
++.Lpic_thunk:			\
++	jmp	%o7 + 8;	\
++	add	%o7, reg, reg;
++
++#define	SPARC_PIC_THUNK_CALL(reg)			\
++	sethi	%hi(_GLOBAL_OFFSET_TABLE_-4), reg;	\
++	call	.Lpic_thunk;				\
++	or	reg, %lo(_GLOBAL_OFFSET_TABLE_+4), reg;
++
++#if 1
++#define	SPARC_SETUP_GOT_REG(reg)	SPARC_PIC_THUNK_CALL(reg)
++#else
++#define	SPARC_SETUP_GOT_REG(reg)	\
++	sethi	%hi(_GLOBAL_OFFSET_TABLE_-4), reg;	\
++	call	.+8;					\
++	or	reg, %lo(_GLOBAL_OFFSET_TABLE_+4), reg;	\
++	add	%o7, reg, reg
++#endif
++
++#if defined(__arch64__)
++
++#define	SPARC_LOAD_ADDRESS(SYM, reg)	\
++	setx	SYM, %o7, reg;
++#define	LDPTR		ldx
++#define	SIZE_T_CC	%xcc
++#define	STACK_FRAME	192
++#define	STACK_BIAS	2047
++#define	STACK_7thARG	(STACK_BIAS+176)
++
++#else
++
++#define	SPARC_LOAD_ADDRESS(SYM, reg)	\
++	set	SYM, reg;
++#define	LDPTR		ld
++#define	SIZE_T_CC	%icc
++#define	STACK_FRAME	112
++#define	STACK_BIAS	0
++#define	STACK_7thARG	92
++#define	SPARC_LOAD_ADDRESS_LEAF(SYM, reg, tmp) SPARC_LOAD_ADDRESS(SYM, reg)
++
++#endif
++
++#ifdef __PIC__
++#undef	SPARC_LOAD_ADDRESS
++#undef SPARC_LOAD_ADDRESS_LEAF
++#define	SPARC_LOAD_ADDRESS(SYM, reg)	\
++	SPARC_SETUP_GOT_REG(reg);	\
++	sethi	%hi(SYM), %o7;		\
++	or	%o7, %lo(SYM), %o7;	\
++	LDPTR	[reg + %o7], reg;
++#endif
++
++#ifndef SPARC_LOAD_ADDRESS_LEAF
++#define	SPARC_LOAD_ADDRESS_LEAF(SYM, reg, tmp)	\
++	mov	%o7, tmp;			\
++	SPARC_LOAD_ADDRESS(SYM, reg)		\
++	mov	tmp, %o7;
++#endif
++
++#endif	/* __SPARC_ARCH_H__ */
+Index: crypto/md5/asm/md5-sparcv9.pl
+===================================================================
+diff -uNr openssl-1.0.1m/crypto/md5/asm/md5-sparcv9.pl openssl-1.0.1m/crypto/md5/asm/md5-sparcv9.pl
+--- openssl-1.0.1m/crypto/md5/asm/md5-sparcv9.pl 1970-01-01 01:00:00.000000000 +0100
++++ openssl-1.0.1m/crypto/md5/asm/md5-sparcv9.pl 2015-06-02 09:52:11.809148396 +0200
+@@ -0,0 +1,434 @@
++#!/usr/bin/env perl
++
++# ====================================================================
++# Written by Andy Polyakov <appro at openssl.org> for the OpenSSL
++# project. The module is, however, dual licensed under OpenSSL and
++# CRYPTOGAMS licenses depending on where you obtain it. For further
++# details see http://www.openssl.org/~appro/cryptogams/.
++#
++# Hardware SPARC T4 support by David S. Miller <davem at davemloft.net>.
++# ====================================================================
++
++# MD5 for SPARCv9, 6.9 cycles per byte on UltraSPARC, >40% faster than
++# code generated by Sun C 5.2.
++
++# SPARC T4 MD5 hardware achieves 3.20 cycles per byte, which is 2.1x
++# faster than software. Multi-process benchmark saturates at 12x
++# single-process result on 8-core processor, or ~11GBps per 2.85GHz
++# socket.
++
++$bits=32;
++for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
++if ($bits==64)	{ $bias=2047; $frame=192; }
++else		{ $bias=0;    $frame=112; }
++
++$output=shift;
++open STDOUT,">$output";
++
++use integer;
++
++($ctx,$inp,$len)=("%i0","%i1","%i2");	# input arguments
++
++# 64-bit values
++ at X=("%o0","%o1","%o2","%o3","%o4","%o5","%o7","%g1","%g2");
++$tx="%g3";
++($AB,$CD)=("%g4","%g5");
++
++# 32-bit values
++ at V=($A,$B,$C,$D)=map("%l$_",(0..3));
++($t1,$t2,$t3,$saved_asi)=map("%l$_",(4..7));
++($shr,$shl1,$shl2)=("%i3","%i4","%i5");
++
++my @K=(	0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
++	0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
++	0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
++	0x6b901122,0xfd987193,0xa679438e,0x49b40821,
++
++	0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
++	0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
++	0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
++	0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,
++
++	0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
++	0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
++	0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
++	0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,
++
++	0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
++	0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
++	0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
++	0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391, 0	);
++
++sub R0 {
++  my ($i,$a,$b,$c,$d) = @_;
++  my $rot = (7,12,17,22)[$i%4];
++  my $j   = ($i+1)/2;
++
++  if ($i&1) {
++    $code.=<<___;
++	 srlx	@X[$j],$shr, at X[$j]	! align X[`$i+1`]
++	and	$b,$t1,$t1		! round $i
++	 sllx	@X[$j+1],$shl1,$tx
++	add	$t2,$a,$a
++	 sllx	$tx,$shl2,$tx
++	xor	$d,$t1,$t1
++	 or	$tx, at X[$j], at X[$j]
++	 sethi	%hi(@K[$i+1]),$t2
++	add	$t1,$a,$a
++	 or	$t2,%lo(@K[$i+1]),$t2
++	sll	$a,$rot,$t3
++	 add	@X[$j],$t2,$t2		! X[`$i+1`]+K[`$i+1`]
++	srl	$a,32-$rot,$a
++	add	$b,$t3,$t3
++	 xor	 $b,$c,$t1
++	add	$t3,$a,$a
++___
++  } else {
++    $code.=<<___;
++	 srlx	@X[$j],32,$tx		! extract X[`2*$j+1`]
++	and	$b,$t1,$t1		! round $i
++	add	$t2,$a,$a
++	xor	$d,$t1,$t1
++	 sethi	%hi(@K[$i+1]),$t2
++	add	$t1,$a,$a
++	 or	$t2,%lo(@K[$i+1]),$t2
++	sll	$a,$rot,$t3
++	 add	$tx,$t2,$t2		! X[`2*$j+1`]+K[`$i+1`]
++	srl	$a,32-$rot,$a
++	add	$b,$t3,$t3
++	 xor	 $b,$c,$t1
++	add	$t3,$a,$a
++___
++  }
++}
++
++sub R0_1 {
++  my ($i,$a,$b,$c,$d) = @_;
++  my $rot = (7,12,17,22)[$i%4];
++
++$code.=<<___;
++	 srlx	@X[0],32,$tx		! extract X[1]
++	and	$b,$t1,$t1		! round $i
++	add	$t2,$a,$a
++	xor	$d,$t1,$t1
++	 sethi	%hi(@K[$i+1]),$t2
++	add	$t1,$a,$a
++	 or	$t2,%lo(@K[$i+1]),$t2
++	sll	$a,$rot,$t3
++	 add	$tx,$t2,$t2		! X[1]+K[`$i+1`]
++	srl	$a,32-$rot,$a
++	add	$b,$t3,$t3
++	 andn	 $b,$c,$t1
++	add	$t3,$a,$a
++___
++}
++
++sub R1 {
++  my ($i,$a,$b,$c,$d) = @_;
++  my $rot = (5,9,14,20)[$i%4];
++  my $j   = $i<31 ? (1+5*($i+1))%16 : (5+3*($i+1))%16;
++  my $xi  = @X[$j/2];
++
++$code.=<<___ if ($j&1 && ($xi=$tx));
++	 srlx	@X[$j/2],32,$xi		! extract X[$j]
++___
++$code.=<<___;
++	and	$b,$d,$t3		! round $i
++	add	$t2,$a,$a
++	or	$t3,$t1,$t1
++	 sethi	%hi(@K[$i+1]),$t2
++	add	$t1,$a,$a
++	 or	$t2,%lo(@K[$i+1]),$t2
++	sll	$a,$rot,$t3
++	 add	$xi,$t2,$t2		! X[$j]+K[`$i+1`]
++	srl	$a,32-$rot,$a
++	add	$b,$t3,$t3
++	 `$i<31?"andn":"xor"`	 $b,$c,$t1
++	add	$t3,$a,$a
++___
++}
++
++sub R2 {
++  my ($i,$a,$b,$c,$d) = @_;
++  my $rot = (4,11,16,23)[$i%4];
++  my $j   = $i<47 ? (5+3*($i+1))%16 : (0+7*($i+1))%16;
++  my $xi  = @X[$j/2];
++
++$code.=<<___ if ($j&1 && ($xi=$tx));
++	 srlx	@X[$j/2],32,$xi		! extract X[$j]
++___
++$code.=<<___;
++	add	$t2,$a,$a		! round $i
++	xor	$b,$t1,$t1
++	 sethi	%hi(@K[$i+1]),$t2
++	add	$t1,$a,$a
++	 or	$t2,%lo(@K[$i+1]),$t2
++	sll	$a,$rot,$t3
++	 add	$xi,$t2,$t2		! X[$j]+K[`$i+1`]
++	srl	$a,32-$rot,$a
++	add	$b,$t3,$t3
++	 xor	 $b,$c,$t1
++	add	$t3,$a,$a
++___
++}
++
++sub R3 {
++  my ($i,$a,$b,$c,$d) = @_;
++  my $rot = (6,10,15,21)[$i%4];
++  my $j   = (0+7*($i+1))%16;
++  my $xi  = @X[$j/2];
++
++$code.=<<___;
++	add	$t2,$a,$a		! round $i
++___
++$code.=<<___ if ($j&1 && ($xi=$tx));
++	 srlx	@X[$j/2],32,$xi		! extract X[$j]
++___
++$code.=<<___;
++	orn	$b,$d,$t1
++	 sethi	%hi(@K[$i+1]),$t2
++	xor	$c,$t1,$t1
++	 or	$t2,%lo(@K[$i+1]),$t2
++	add	$t1,$a,$a
++	sll	$a,$rot,$t3
++	 add	$xi,$t2,$t2		! X[$j]+K[`$i+1`]
++	srl	$a,32-$rot,$a
++	add	$b,$t3,$t3
++	add	$t3,$a,$a
++___
++}
++
++$code.=<<___ if ($bits==64);
++.register	%g2,#scratch
++.register	%g3,#scratch
++___
++$code.=<<___;
++#include "sparc_arch.h"
++
++.section	".text",#alloc,#execinstr
++
++#ifdef __PIC__
++SPARC_PIC_THUNK(%g1)
++#endif
++
++.globl	md5_block_asm_data_order
++.align	32
++md5_block_asm_data_order:
++	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
++	ld	[%g1+4],%g1		! OPENSSL_sparcv9cap_P[1]
++
++	andcc	%g1, CFR_MD5, %g0
++	be	.Lsoftware
++	nop
++
++	mov	4, %g1
++	andcc	%o1, 0x7, %g0
++	lda	[%o0 + %g0]0x88, %f0		! load context
++	lda	[%o0 + %g1]0x88, %f1
++	add	%o0, 8, %o0
++	lda	[%o0 + %g0]0x88, %f2
++	lda	[%o0 + %g1]0x88, %f3
++	bne,pn	%icc, .Lhwunaligned
++	sub	%o0, 8, %o0
++
++.Lhw_loop:
++	ldd	[%o1 + 0x00], %f8
++	ldd	[%o1 + 0x08], %f10
++	ldd	[%o1 + 0x10], %f12
++	ldd	[%o1 + 0x18], %f14
++	ldd	[%o1 + 0x20], %f16
++	ldd	[%o1 + 0x28], %f18
++	ldd	[%o1 + 0x30], %f20
++	subcc	%o2, 1, %o2		! done yet? 
++	ldd	[%o1 + 0x38], %f22
++	add	%o1, 0x40, %o1
++	prefetch [%o1 + 63], 20
++
++	.word	0x81b02800		! MD5
++
++	bne,pt	`$bits==64?"%xcc":"%icc"`, .Lhw_loop
++	nop
++
++.Lhwfinish:
++	sta	%f0, [%o0 + %g0]0x88	! store context
++	sta	%f1, [%o0 + %g1]0x88
++	add	%o0, 8, %o0
++	sta	%f2, [%o0 + %g0]0x88
++	sta	%f3, [%o0 + %g1]0x88
++	retl
++	nop
++
++.align	8
++.Lhwunaligned:
++	alignaddr %o1, %g0, %o1
++
++	ldd	[%o1 + 0x00], %f10
++.Lhwunaligned_loop:
++	ldd	[%o1 + 0x08], %f12
++	ldd	[%o1 + 0x10], %f14
++	ldd	[%o1 + 0x18], %f16
++	ldd	[%o1 + 0x20], %f18
++	ldd	[%o1 + 0x28], %f20
++	ldd	[%o1 + 0x30], %f22
++	ldd	[%o1 + 0x38], %f24
++	subcc	%o2, 1, %o2		! done yet?
++	ldd	[%o1 + 0x40], %f26
++	add	%o1, 0x40, %o1
++	prefetch [%o1 + 63], 20
++
++	faligndata %f10, %f12, %f8
++	faligndata %f12, %f14, %f10
++	faligndata %f14, %f16, %f12
++	faligndata %f16, %f18, %f14
++	faligndata %f18, %f20, %f16
++	faligndata %f20, %f22, %f18
++	faligndata %f22, %f24, %f20
++	faligndata %f24, %f26, %f22
++
++	.word	0x81b02800		! MD5
++
++	bne,pt	`$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
++	for	%f26, %f26, %f10	! %f10=%f26
++
++	ba	.Lhwfinish
++	nop
++
++.align	16
++.Lsoftware:
++	save	%sp,-$frame,%sp
++
++	rd	%asi,$saved_asi
++	wr	%g0,0x88,%asi		! ASI_PRIMARY_LITTLE
++	and	$inp,7,$shr
++	andn	$inp,7,$inp
++
++	sll	$shr,3,$shr		! *=8
++	mov	56,$shl2
++	ld	[$ctx+0],$A
++	sub	$shl2,$shr,$shl2
++	ld	[$ctx+4],$B
++	and	$shl2,32,$shl1
++	add	$shl2,8,$shl2
++	ld	[$ctx+8],$C
++	sub	$shl2,$shl1,$shl2	! shr+shl1+shl2==64
++	ld	[$ctx+12],$D
++	nop
++
++.Loop:
++	 cmp	$shr,0			! was inp aligned?
++	ldxa	[$inp+0]%asi, at X[0]	! load little-endian input
++	ldxa	[$inp+8]%asi, at X[1]
++	ldxa	[$inp+16]%asi, at X[2]
++	ldxa	[$inp+24]%asi, at X[3]
++	ldxa	[$inp+32]%asi, at X[4]
++	 sllx	$A,32,$AB		! pack A,B
++	ldxa	[$inp+40]%asi, at X[5]
++	 sllx	$C,32,$CD		! pack C,D
++	ldxa	[$inp+48]%asi, at X[6]
++	 or	$B,$AB,$AB
++	ldxa	[$inp+56]%asi, at X[7]
++	 or	$D,$CD,$CD
++	bnz,a,pn	%icc,.+8
++	ldxa	[$inp+64]%asi, at X[8]
++
++	srlx	@X[0],$shr, at X[0]	! align X[0]
++	sllx	@X[1],$shl1,$tx
++	 sethi	%hi(@K[0]),$t2
++	sllx	$tx,$shl2,$tx
++	 or	$t2,%lo(@K[0]),$t2
++	or	$tx, at X[0], at X[0]
++	 xor	$C,$D,$t1
++	 add	@X[0],$t2,$t2		! X[0]+K[0]
++___
++	for ($i=0;$i<15;$i++)	{ &R0($i, at V);	unshift(@V,pop(@V)); }
++	for (;$i<16;$i++)	{ &R0_1($i, at V);	unshift(@V,pop(@V)); }
++	for (;$i<32;$i++)	{ &R1($i, at V);	unshift(@V,pop(@V)); }
++	for (;$i<48;$i++)	{ &R2($i, at V);	unshift(@V,pop(@V)); }
++	for (;$i<64;$i++)	{ &R3($i, at V);	unshift(@V,pop(@V)); }
++$code.=<<___;
++	srlx	$AB,32,$t1		! unpack A,B,C,D and accumulate
++	add	$inp,64,$inp		! advance inp
++	srlx	$CD,32,$t2
++	add	$t1,$A,$A
++	subcc	$len,1,$len		! done yet?
++	add	$AB,$B,$B
++	add	$t2,$C,$C
++	add	$CD,$D,$D
++	srl	$B,0,$B			! clruw	$B
++	bne	`$bits==64?"%xcc":"%icc"`,.Loop
++	srl	$D,0,$D			! clruw	$D
++
++	st	$A,[$ctx+0]		! write out ctx
++	st	$B,[$ctx+4]
++	st	$C,[$ctx+8]
++	st	$D,[$ctx+12]
++
++	wr	%g0,$saved_asi,%asi
++	ret
++	restore
++.type	md5_block_asm_data_order,#function
++.size	md5_block_asm_data_order,(.-md5_block_asm_data_order)
++
++.asciz	"MD5 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
++.align	4
++___
++
++# Purpose of these subroutines is to explicitly encode VIS instructions,
++# so that one can compile the module without having to specify VIS
++# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
++# Idea is to reserve for option to produce "universal" binary and let
++# programmer detect if current CPU is VIS capable at run-time.
++sub unvis {
++my ($mnemonic,$rs1,$rs2,$rd)=@_;
++my $ref,$opf;
++my %visopf = (	"faligndata"	=> 0x048,
++		"for"		=> 0x07c	);
++
++    $ref = "$mnemonic\t$rs1,$rs2,$rd";
++
++    if ($opf=$visopf{$mnemonic}) {
++	foreach ($rs1,$rs2,$rd) {
++	    return $ref if (!/%f([0-9]{1,2})/);
++	    $_=$1;
++	    if ($1>=32) {
++		return $ref if ($1&1);
++		# re-encode for upper double register addressing
++		$_=($1|$1>>5)&31;
++	    }
++	}
++
++	return	sprintf ".word\t0x%08x !%s",
++			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
++			$ref;
++    } else {
++	return $ref;
++    }
++}
++sub unalignaddr {
++my ($mnemonic,$rs1,$rs2,$rd)=@_;
++my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
++my $ref="$mnemonic\t$rs1,$rs2,$rd";
++
++    foreach ($rs1,$rs2,$rd) {
++	if (/%([goli])([0-7])/)	{ $_=$bias{$1}+$2; }
++	else			{ return $ref; }
++    }
++    return  sprintf ".word\t0x%08x !%s",
++		    0x81b00300|$rd<<25|$rs1<<14|$rs2,
++		    $ref;
++}
++
++foreach (split("\n",$code)) {
++	s/\`([^\`]*)\`/eval $1/ge;
++
++	s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
++		&unvis($1,$2,$3,$4)
++	 /ge;
++	s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
++		&unalignaddr($1,$2,$3,$4)
++	 /ge;
++
++	print $_,"\n";
++}
++
++close STDOUT;
+Index: crypto/aes/asm/aest4-sparcv9.pl
+===================================================================
+diff -uNr openssl-1.0.1m/crypto/aes/asm/aest4-sparcv9.pl openssl-1.0.1m/crypto/aes/asm/aest4-sparcv9.pl
+--- openssl-1.0.1m/crypto/aes/asm/aest4-sparcv9.pl 1970-01-01 01:00:00.000000000 +0100
++++ openssl-1.0.1m/crypto/aes/asm/aest4-sparcv9.pl 2015-06-02 09:52:11.809148396 +0200
+@@ -0,0 +1,902 @@
++#!/usr/bin/env perl
++
++# ====================================================================
++# Written by David S. Miller <davem at devemloft.net> and Andy Polyakov
++# <appro at openssl.org>. The module is licensed under 2-clause BSD
++# license. October 2012. All rights reserved.
++# ====================================================================
++
++######################################################################
++# AES for SPARC T4.
++#
++# AES round instructions complete in 3 cycles and can be issued every
++# cycle. It means that round calculations should take 4*rounds cycles,
++# because any given round instruction depends on result of *both*
++# previous instructions:
++#
++#	|0 |1 |2 |3 |4
++#	|01|01|01|
++#	   |23|23|23|
++#	            |01|01|...
++#	               |23|...
++#
++# Provided that fxor [with IV] takes 3 cycles to complete, critical
++# path length for CBC encrypt would be 3+4*rounds, or in other words
++# it should process one byte in at least (3+4*rounds)/16 cycles. This
++# estimate doesn't account for "collateral" instructions, such as
++# fetching input from memory, xor-ing it with zero-round key and
++# storing the result. Yet, *measured* performance [for data aligned
++# at 64-bit boundary!] deviates from this equation by less than 0.5%:
++#
++#		128-bit key	192-		256-
++# CBC encrypt	2.70/2.90(*)	3.20/3.40	3.70/3.90
++#			 (*) numbers after slash are for
++#			     misaligned data.
++#
++# Out-of-order execution logic managed to fully overlap "collateral"
++# instructions with those on critical path. Amazing!
++#
++# As with Intel AES-NI, question is if it's possible to improve
++# performance of parallelizeable modes by interleaving round
++# instructions. Provided round instruction latency and throughput
++# optimal interleave factor is 2. But can we expect 2x performance
++# improvement? Well, as round instructions can be issued one per
++# cycle, they don't saturate the 2-way issue pipeline and therefore
++# there is room for "collateral" calculations... Yet, 2x speed-up
++# over CBC encrypt remains unattaintable:
++#
++#		128-bit key	192-		256-
++# CBC decrypt	1.64/2.11	1.89/2.37	2.23/2.61
++# CTR		1.64/2.08(*)	1.89/2.33	2.23/2.61
++#			 (*) numbers after slash are for
++#			     misaligned data.
++#
++# Estimates based on amount of instructions under assumption that
++# round instructions are not pairable with any other instruction
++# suggest that latter is the actual case and pipeline runs
++# underutilized. It should be noted that T4 out-of-order execution
++# logic is so capable that performance gain from 2x interleave is
++# not even impressive, ~7-13% over non-interleaved code, largest
++# for 256-bit keys.
++
++# To anchor to something else, software implementation processes
++# one byte in 29 cycles with 128-bit key on same processor. Intel
++# Sandy Bridge encrypts byte in 5.07 cycles in CBC mode and decrypts
++# in 0.93, naturally with AES-NI.
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++push(@INC,"${dir}","${dir}../../perlasm");
++require "sparcv9_modes.pl";
++
++&asm_init(@ARGV);
++
++$::evp=1;	# if $evp is set to 0, script generates module with
++# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
++# points. These however are not fully compatible with openssl/aes.h,
++# because they expect AES_KEY to be aligned at 64-bit boundary. When
++# used through EVP, alignment is arranged at EVP layer. Second thing
++# that is arranged by EVP is at least 32-bit alignment of IV.
++
++######################################################################
++# single-round subroutines
++#
++{
++my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
++
++$code=<<___;
++.text
++
++.globl	aes_t4_encrypt
++.align	32
++aes_t4_encrypt:
++	andcc		$inp, 7, %g1		! is input aligned?
++	andn		$inp, 7, $inp
++
++	ldx		[$key + 0], %g4
++	ldx		[$key + 8], %g5
++
++	ldx		[$inp + 0], %o4
++	bz,pt		%icc, 1f
++	ldx		[$inp + 8], %o5
++	ldx		[$inp + 16], $inp
++	sll		%g1, 3, %g1
++	sub		%g0, %g1, %o3
++	sllx		%o4, %g1, %o4
++	sllx		%o5, %g1, %g1
++	srlx		%o5, %o3, %o5
++	srlx		$inp, %o3, %o3
++	or		%o5, %o4, %o4
++	or		%o3, %g1, %o5
++1:
++	ld		[$key + 240], $rounds
++	ldd		[$key + 16], %f12
++	ldd		[$key + 24], %f14
++	xor		%g4, %o4, %o4
++	xor		%g5, %o5, %o5
++	movxtod		%o4, %f0
++	movxtod		%o5, %f2
++	srl		$rounds, 1, $rounds
++	ldd		[$key + 32], %f16
++	sub		$rounds, 1, $rounds
++	ldd		[$key + 40], %f18
++	add		$key, 48, $key
++
++.Lenc:
++	aes_eround01	%f12, %f0, %f2, %f4
++	aes_eround23	%f14, %f0, %f2, %f2
++	ldd		[$key + 0], %f12
++	ldd		[$key + 8], %f14
++	sub		$rounds,1,$rounds
++	aes_eround01	%f16, %f4, %f2, %f0
++	aes_eround23	%f18, %f4, %f2, %f2
++	ldd		[$key + 16], %f16
++	ldd		[$key + 24], %f18
++	brnz,pt		$rounds, .Lenc
++	add		$key, 32, $key
++
++	andcc		$out, 7, $tmp		! is output aligned?
++	aes_eround01	%f12, %f0, %f2, %f4
++	aes_eround23	%f14, %f0, %f2, %f2
++	aes_eround01_l	%f16, %f4, %f2, %f0
++	aes_eround23_l	%f18, %f4, %f2, %f2
++
++	bnz,pn		%icc, 2f
++	nop
++
++	std		%f0, [$out + 0]
++	retl
++	std		%f2, [$out + 8]
++
++2:	alignaddrl	$out, %g0, $out
++	mov		0xff, $mask
++	srl		$mask, $tmp, $mask
++
++	faligndata	%f0, %f0, %f4
++	faligndata	%f0, %f2, %f6
++	faligndata	%f2, %f2, %f8
++
++	stda		%f4, [$out + $mask]0xc0	! partial store
++	std		%f6, [$out + 8]
++	add		$out, 16, $out
++	orn		%g0, $mask, $mask
++	retl
++	stda		%f8, [$out + $mask]0xc0	! partial store
++.type	aes_t4_encrypt,#function
++.size	aes_t4_encrypt,.-aes_t4_encrypt
++
++.globl	aes_t4_decrypt
++.align	32
++aes_t4_decrypt:
++	andcc		$inp, 7, %g1		! is input aligned?
++	andn		$inp, 7, $inp
++
++	ldx		[$key + 0], %g4
++	ldx		[$key + 8], %g5
++
++	ldx		[$inp + 0], %o4
++	bz,pt		%icc, 1f
++	ldx		[$inp + 8], %o5
++	ldx		[$inp + 16], $inp
++	sll		%g1, 3, %g1
++	sub		%g0, %g1, %o3
++	sllx		%o4, %g1, %o4
++	sllx		%o5, %g1, %g1
++	srlx		%o5, %o3, %o5
++	srlx		$inp, %o3, %o3
++	or		%o5, %o4, %o4
++	or		%o3, %g1, %o5
++1:
++	ld		[$key + 240], $rounds
++	ldd		[$key + 16], %f12
++	ldd		[$key + 24], %f14
++	xor		%g4, %o4, %o4
++	xor		%g5, %o5, %o5
++	movxtod		%o4, %f0
++	movxtod		%o5, %f2
++	srl		$rounds, 1, $rounds
++	ldd		[$key + 32], %f16
++	sub		$rounds, 1, $rounds
++	ldd		[$key + 40], %f18
++	add		$key, 48, $key
++
++.Ldec:
++	aes_dround01	%f12, %f0, %f2, %f4
++	aes_dround23	%f14, %f0, %f2, %f2
++	ldd		[$key + 0], %f12
++	ldd		[$key + 8], %f14
++	sub		$rounds,1,$rounds
++	aes_dround01	%f16, %f4, %f2, %f0
++	aes_dround23	%f18, %f4, %f2, %f2
++	ldd		[$key + 16], %f16
++	ldd		[$key + 24], %f18
++	brnz,pt		$rounds, .Ldec
++	add		$key, 32, $key
++
++	andcc		$out, 7, $tmp		! is output aligned?
++	aes_dround01	%f12, %f0, %f2, %f4
++	aes_dround23	%f14, %f0, %f2, %f2
++	aes_dround01_l	%f16, %f4, %f2, %f0
++	aes_dround23_l	%f18, %f4, %f2, %f2
++
++	bnz,pn		%icc, 2f
++	nop
++
++	std		%f0, [$out + 0]
++	retl
++	std		%f2, [$out + 8]
++
++2:	alignaddrl	$out, %g0, $out
++	mov		0xff, $mask
++	srl		$mask, $tmp, $mask
++
++	faligndata	%f0, %f0, %f4
++	faligndata	%f0, %f2, %f6
++	faligndata	%f2, %f2, %f8
++
++	stda		%f4, [$out + $mask]0xc0	! partial store
++	std		%f6, [$out + 8]
++	add		$out, 16, $out
++	orn		%g0, $mask, $mask
++	retl
++	stda		%f8, [$out + $mask]0xc0	! partial store
++.type	aes_t4_decrypt,#function
++.size	aes_t4_decrypt,.-aes_t4_decrypt
++___
++}
++
++######################################################################
++# key setup subroutines
++#
++{
++my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
++$code.=<<___;
++.globl	aes_t4_set_encrypt_key
++.align	32
++aes_t4_set_encrypt_key:
++.Lset_encrypt_key:
++	and		$inp, 7, $tmp
++	alignaddr	$inp, %g0, $inp
++	cmp		$bits, 192
++	ldd		[$inp + 0], %f0
++	bl,pt		%icc,.L128
++	ldd		[$inp + 8], %f2
++
++	be,pt		%icc,.L192
++	ldd		[$inp + 16], %f4
++	brz,pt		$tmp, .L256aligned
++	ldd		[$inp + 24], %f6
++
++	ldd		[$inp + 32], %f8
++	faligndata	%f0, %f2, %f0
++	faligndata	%f2, %f4, %f2
++	faligndata	%f4, %f6, %f4
++	faligndata	%f6, %f8, %f6
++.L256aligned:
++___
++for ($i=0; $i<6; $i++) {
++    $code.=<<___;
++	std		%f0, [$out + `32*$i+0`]
++	aes_kexpand1	%f0, %f6, $i, %f0
++	std		%f2, [$out + `32*$i+8`]
++	aes_kexpand2	%f2, %f0, %f2
++	std		%f4, [$out + `32*$i+16`]
++	aes_kexpand0	%f4, %f2, %f4
++	std		%f6, [$out + `32*$i+24`]
++	aes_kexpand2	%f6, %f4, %f6
++___
++}
++$code.=<<___;
++	std		%f0, [$out + `32*$i+0`]
++	aes_kexpand1	%f0, %f6, $i, %f0
++	std		%f2, [$out + `32*$i+8`]
++	aes_kexpand2	%f2, %f0, %f2
++	std		%f4, [$out + `32*$i+16`]
++	std		%f6, [$out + `32*$i+24`]
++	std		%f0, [$out + `32*$i+32`]
++	std		%f2, [$out + `32*$i+40`]
++
++	mov		14, $tmp
++	st		$tmp, [$out + 240]
++	retl
++	xor		%o0, %o0, %o0
++
++.align	16
++.L192:
++	brz,pt		$tmp, .L192aligned
++	nop
++
++	ldd		[$inp + 24], %f6
++	faligndata	%f0, %f2, %f0
++	faligndata	%f2, %f4, %f2
++	faligndata	%f4, %f6, %f4
++.L192aligned:
++___
++for ($i=0; $i<7; $i++) {
++    $code.=<<___;
++	std		%f0, [$out + `24*$i+0`]
++	aes_kexpand1	%f0, %f4, $i, %f0
++	std		%f2, [$out + `24*$i+8`]
++	aes_kexpand2	%f2, %f0, %f2
++	std		%f4, [$out + `24*$i+16`]
++	aes_kexpand2	%f4, %f2, %f4
++___
++}
++$code.=<<___;
++	std		%f0, [$out + `24*$i+0`]
++	aes_kexpand1	%f0, %f4, $i, %f0
++	std		%f2, [$out + `24*$i+8`]
++	aes_kexpand2	%f2, %f0, %f2
++	std		%f4, [$out + `24*$i+16`]
++	std		%f0, [$out + `24*$i+24`]
++	std		%f2, [$out + `24*$i+32`]
++
++	mov		12, $tmp
++	st		$tmp, [$out + 240]
++	retl
++	xor		%o0, %o0, %o0
++
++.align	16
++.L128:
++	brz,pt		$tmp, .L128aligned
++	nop
++
++	ldd		[$inp + 16], %f4
++	faligndata	%f0, %f2, %f0
++	faligndata	%f2, %f4, %f2
++.L128aligned:
++___
++for ($i=0; $i<10; $i++) {
++    $code.=<<___;
++	std		%f0, [$out + `16*$i+0`]
++	aes_kexpand1	%f0, %f2, $i, %f0
++	std		%f2, [$out + `16*$i+8`]
++	aes_kexpand2	%f2, %f0, %f2
++___
++}
++$code.=<<___;
++	std		%f0, [$out + `16*$i+0`]
++	std		%f2, [$out + `16*$i+8`]
++
++	mov		10, $tmp
++	st		$tmp, [$out + 240]
++	retl
++	xor		%o0, %o0, %o0
++.type	aes_t4_set_encrypt_key,#function
++.size	aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key
++
++.globl	aes_t4_set_decrypt_key
++.align	32
++aes_t4_set_decrypt_key:
++	mov		%o7, %o5
++	call		.Lset_encrypt_key
++	nop
++
++	mov		%o5, %o7
++	sll		$tmp, 4, $inp		! $tmp is number of rounds
++	add		$tmp, 2, $tmp
++	add		$out, $inp, $inp	! $inp=$out+16*rounds
++	srl		$tmp, 2, $tmp		! $tmp=(rounds+2)/4
++
++.Lkey_flip:
++	ldd		[$out + 0],  %f0
++	ldd		[$out + 8],  %f2
++	ldd		[$out + 16], %f4
++	ldd		[$out + 24], %f6
++	ldd		[$inp + 0],  %f8
++	ldd		[$inp + 8],  %f10
++	ldd		[$inp - 16], %f12
++	ldd		[$inp - 8],  %f14
++	sub		$tmp, 1, $tmp
++	std		%f0, [$inp + 0]
++	std		%f2, [$inp + 8]
++	std		%f4, [$inp - 16]
++	std		%f6, [$inp - 8]
++	std		%f8, [$out + 0]
++	std		%f10, [$out + 8]
++	std		%f12, [$out + 16]
++	std		%f14, [$out + 24]
++	add		$out, 32, $out
++	brnz		$tmp, .Lkey_flip
++	sub		$inp, 32, $inp
++
++	retl
++	xor		%o0, %o0, %o0
++.type	aes_t4_set_decrypt_key,#function
++.size	aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
++___
++}
++
++{{{
++my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
++my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
++
++$code.=<<___;
++.align	32
++_aes128_loadkey:
++	ldx		[$key + 0], %g4
++	ldx		[$key + 8], %g5
++___
++for ($i=2; $i<22;$i++) {			# load key schedule
++    $code.=<<___;
++	ldd		[$key + `8*$i`], %f`12+2*$i`
++___
++}
++$code.=<<___;
++	retl
++	nop
++.type	_aes128_loadkey,#function
++.size	_aes128_loadkey,.-_aes128_loadkey
++_aes128_load_enckey=_aes128_loadkey
++_aes128_load_deckey=_aes128_loadkey
++
++.align	32
++_aes128_encrypt_1x:
++___
++for ($i=0; $i<4; $i++) {
++    $code.=<<___;
++	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
++	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
++	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
++	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
++___
++}
++$code.=<<___;
++	aes_eround01	%f48, %f0, %f2, %f4
++	aes_eround23	%f50, %f0, %f2, %f2
++	aes_eround01_l	%f52, %f4, %f2, %f0
++	retl
++	aes_eround23_l	%f54, %f4, %f2, %f2
++.type	_aes128_encrypt_1x,#function
++.size	_aes128_encrypt_1x,.-_aes128_encrypt_1x
++
++.align	32
++_aes128_encrypt_2x:
++___
++for ($i=0; $i<4; $i++) {
++    $code.=<<___;
++	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
++	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
++	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
++	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
++	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
++	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
++	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
++	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
++___
++}
++$code.=<<___;
++	aes_eround01	%f48, %f0, %f2, %f8
++	aes_eround23	%f50, %f0, %f2, %f2
++	aes_eround01	%f48, %f4, %f6, %f10
++	aes_eround23	%f50, %f4, %f6, %f6
++	aes_eround01_l	%f52, %f8, %f2, %f0
++	aes_eround23_l	%f54, %f8, %f2, %f2
++	aes_eround01_l	%f52, %f10, %f6, %f4
++	retl
++	aes_eround23_l	%f54, %f10, %f6, %f6
++.type	_aes128_encrypt_2x,#function
++.size	_aes128_encrypt_2x,.-_aes128_encrypt_2x
++
++.align	32
++_aes128_decrypt_1x:
++___
++for ($i=0; $i<4; $i++) {
++    $code.=<<___;
++	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
++	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
++	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
++	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
++___
++}
++$code.=<<___;
++	aes_dround01	%f48, %f0, %f2, %f4
++	aes_dround23	%f50, %f0, %f2, %f2
++	aes_dround01_l	%f52, %f4, %f2, %f0
++	retl
++	aes_dround23_l	%f54, %f4, %f2, %f2
++.type	_aes128_decrypt_1x,#function
++.size	_aes128_decrypt_1x,.-_aes128_decrypt_1x
++
++.align	32
++_aes128_decrypt_2x:
++___
++for ($i=0; $i<4; $i++) {
++    $code.=<<___;
++	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
++	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
++	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
++	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
++	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
++	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
++	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
++	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
++___
++}
++$code.=<<___;
++	aes_dround01	%f48, %f0, %f2, %f8
++	aes_dround23	%f50, %f0, %f2, %f2
++	aes_dround01	%f48, %f4, %f6, %f10
++	aes_dround23	%f50, %f4, %f6, %f6
++	aes_dround01_l	%f52, %f8, %f2, %f0
++	aes_dround23_l	%f54, %f8, %f2, %f2
++	aes_dround01_l	%f52, %f10, %f6, %f4
++	retl
++	aes_dround23_l	%f54, %f10, %f6, %f6
++.type	_aes128_decrypt_2x,#function
++.size	_aes128_decrypt_2x,.-_aes128_decrypt_2x
++
++.align	32
++_aes192_loadkey:
++_aes256_loadkey:
++	ldx		[$key + 0], %g4
++	ldx		[$key + 8], %g5
++___
++for ($i=2; $i<26;$i++) {			# load key schedule
++    $code.=<<___;
++	ldd		[$key + `8*$i`], %f`12+2*$i`
++___
++}
++$code.=<<___;
++	retl
++	nop
++.type	_aes192_loadkey,#function
++.size	_aes192_loadkey,.-_aes192_loadkey
++_aes192_load_enckey=_aes192_loadkey
++_aes192_load_deckey=_aes192_loadkey
++_aes256_load_enckey=_aes192_loadkey
++_aes256_load_deckey=_aes192_loadkey
++
++.align	32
++_aes192_encrypt_1x:
++___
++for ($i=0; $i<5; $i++) {
++    $code.=<<___;
++	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
++	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
++	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
++	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
++___
++}
++$code.=<<___;
++	aes_eround01	%f56, %f0, %f2, %f4
++	aes_eround23	%f58, %f0, %f2, %f2
++	aes_eround01_l	%f60, %f4, %f2, %f0
++	retl
++	aes_eround23_l	%f62, %f4, %f2, %f2
++.type	_aes192_encrypt_1x,#function
++.size	_aes192_encrypt_1x,.-_aes192_encrypt_1x
++
++.align	32
++_aes192_encrypt_2x:
++___
++for ($i=0; $i<5; $i++) {
++    $code.=<<___;
++	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
++	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
++	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
++	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
++	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
++	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
++	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
++	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
++___
++}
++$code.=<<___;
++	aes_eround01	%f56, %f0, %f2, %f8
++	aes_eround23	%f58, %f0, %f2, %f2
++	aes_eround01	%f56, %f4, %f6, %f10
++	aes_eround23	%f58, %f4, %f6, %f6
++	aes_eround01_l	%f60, %f8, %f2, %f0
++	aes_eround23_l	%f62, %f8, %f2, %f2
++	aes_eround01_l	%f60, %f10, %f6, %f4
++	retl
++	aes_eround23_l	%f62, %f10, %f6, %f6
++.type	_aes192_encrypt_2x,#function
++.size	_aes192_encrypt_2x,.-_aes192_encrypt_2x
++
++.align	32
++_aes192_decrypt_1x:
++___
++for ($i=0; $i<5; $i++) {
++    $code.=<<___;
++	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
++	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
++	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
++	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
++___
++}
++$code.=<<___;
++	aes_dround01	%f56, %f0, %f2, %f4
++	aes_dround23	%f58, %f0, %f2, %f2
++	aes_dround01_l	%f60, %f4, %f2, %f0
++	retl
++	aes_dround23_l	%f62, %f4, %f2, %f2
++.type	_aes192_decrypt_1x,#function
++.size	_aes192_decrypt_1x,.-_aes192_decrypt_1x
++
++.align	32
++_aes192_decrypt_2x:
++___
++for ($i=0; $i<5; $i++) {
++    $code.=<<___;
++	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
++	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
++	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
++	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
++	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
++	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
++	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
++	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
++___
++}
++$code.=<<___;
++	aes_dround01	%f56, %f0, %f2, %f8
++	aes_dround23	%f58, %f0, %f2, %f2
++	aes_dround01	%f56, %f4, %f6, %f10
++	aes_dround23	%f58, %f4, %f6, %f6
++	aes_dround01_l	%f60, %f8, %f2, %f0
++	aes_dround23_l	%f62, %f8, %f2, %f2
++	aes_dround01_l	%f60, %f10, %f6, %f4
++	retl
++	aes_dround23_l	%f62, %f10, %f6, %f6
++.type	_aes192_decrypt_2x,#function
++.size	_aes192_decrypt_2x,.-_aes192_decrypt_2x
++
++.align	32
++_aes256_encrypt_1x:
++	aes_eround01	%f16, %f0, %f2, %f4
++	aes_eround23	%f18, %f0, %f2, %f2
++	ldd		[$key + 208], %f16
++	ldd		[$key + 216], %f18
++	aes_eround01	%f20, %f4, %f2, %f0
++	aes_eround23	%f22, %f4, %f2, %f2
++	ldd		[$key + 224], %f20
++	ldd		[$key + 232], %f22
++___
++for ($i=1; $i<6; $i++) {
++    $code.=<<___;
++	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
++	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
++	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
++	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
++___
++}
++$code.=<<___;
++	aes_eround01	%f16, %f0, %f2, %f4
++	aes_eround23	%f18, %f0, %f2, %f2
++	ldd		[$key + 16], %f16
++	ldd		[$key + 24], %f18
++	aes_eround01_l	%f20, %f4, %f2, %f0
++	aes_eround23_l	%f22, %f4, %f2, %f2
++	ldd		[$key + 32], %f20
++	retl
++	ldd		[$key + 40], %f22
++.type	_aes256_encrypt_1x,#function
++.size	_aes256_encrypt_1x,.-_aes256_encrypt_1x
++
++.align	32
++_aes256_encrypt_2x:
++	aes_eround01	%f16, %f0, %f2, %f8
++	aes_eround23	%f18, %f0, %f2, %f2
++	aes_eround01	%f16, %f4, %f6, %f10
++	aes_eround23	%f18, %f4, %f6, %f6
++	ldd		[$key + 208], %f16
++	ldd		[$key + 216], %f18
++	aes_eround01	%f20, %f8, %f2, %f0
++	aes_eround23	%f22, %f8, %f2, %f2
++	aes_eround01	%f20, %f10, %f6, %f4
++	aes_eround23	%f22, %f10, %f6, %f6
++	ldd		[$key + 224], %f20
++	ldd		[$key + 232], %f22
++___
++for ($i=1; $i<6; $i++) {
++    $code.=<<___;
++	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
++	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
++	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
++	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
++	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
++	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
++	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
++	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
++___
++}
++$code.=<<___;
++	aes_eround01	%f16, %f0, %f2, %f8
++	aes_eround23	%f18, %f0, %f2, %f2
++	aes_eround01	%f16, %f4, %f6, %f10
++	aes_eround23	%f18, %f4, %f6, %f6
++	ldd		[$key + 16], %f16
++	ldd		[$key + 24], %f18
++	aes_eround01_l	%f20, %f8, %f2, %f0
++	aes_eround23_l	%f22, %f8, %f2, %f2
++	aes_eround01_l	%f20, %f10, %f6, %f4
++	aes_eround23_l	%f22, %f10, %f6, %f6
++	ldd		[$key + 32], %f20
++	retl
++	ldd		[$key + 40], %f22
++.type	_aes256_encrypt_2x,#function
++.size	_aes256_encrypt_2x,.-_aes256_encrypt_2x
++
++.align	32
++_aes256_decrypt_1x:
++	aes_dround01	%f16, %f0, %f2, %f4
++	aes_dround23	%f18, %f0, %f2, %f2
++	ldd		[$key + 208], %f16
++	ldd		[$key + 216], %f18
++	aes_dround01	%f20, %f4, %f2, %f0
++	aes_dround23	%f22, %f4, %f2, %f2
++	ldd		[$key + 224], %f20
++	ldd		[$key + 232], %f22
++___
++for ($i=1; $i<6; $i++) {
++    $code.=<<___;
++	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
++	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
++	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
++	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
++___
++}
++$code.=<<___;
++	aes_dround01	%f16, %f0, %f2, %f4
++	aes_dround23	%f18, %f0, %f2, %f2
++	ldd		[$key + 16], %f16
++	ldd		[$key + 24], %f18
++	aes_dround01_l	%f20, %f4, %f2, %f0
++	aes_dround23_l	%f22, %f4, %f2, %f2
++	ldd		[$key + 32], %f20
++	retl
++	ldd		[$key + 40], %f22
++.type	_aes256_decrypt_1x,#function
++.size	_aes256_decrypt_1x,.-_aes256_decrypt_1x
++
++.align	32
++_aes256_decrypt_2x:
++	aes_dround01	%f16, %f0, %f2, %f8
++	aes_dround23	%f18, %f0, %f2, %f2
++	aes_dround01	%f16, %f4, %f6, %f10
++	aes_dround23	%f18, %f4, %f6, %f6
++	ldd		[$key + 208], %f16
++	ldd		[$key + 216], %f18
++	aes_dround01	%f20, %f8, %f2, %f0
++	aes_dround23	%f22, %f8, %f2, %f2
++	aes_dround01	%f20, %f10, %f6, %f4
++	aes_dround23	%f22, %f10, %f6, %f6
++	ldd		[$key + 224], %f20
++	ldd		[$key + 232], %f22
++___
++for ($i=1; $i<6; $i++) {
++    $code.=<<___;
++	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
++	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
++	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
++	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
++	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
++	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
++	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
++	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
++___
++}
++$code.=<<___;
++	aes_dround01	%f16, %f0, %f2, %f8
++	aes_dround23	%f18, %f0, %f2, %f2
++	aes_dround01	%f16, %f4, %f6, %f10
++	aes_dround23	%f18, %f4, %f6, %f6
++	ldd		[$key + 16], %f16
++	ldd		[$key + 24], %f18
++	aes_dround01_l	%f20, %f8, %f2, %f0
++	aes_dround23_l	%f22, %f8, %f2, %f2
++	aes_dround01_l	%f20, %f10, %f6, %f4
++	aes_dround23_l	%f22, %f10, %f6, %f6
++	ldd		[$key + 32], %f20
++	retl
++	ldd		[$key + 40], %f22
++.type	_aes256_decrypt_2x,#function
++.size	_aes256_decrypt_2x,.-_aes256_decrypt_2x
++___
++
++&alg_cbc_encrypt_implement("aes",128);
++&alg_cbc_encrypt_implement("aes",192);
++&alg_cbc_encrypt_implement("aes",256);
++
++&alg_cbc_decrypt_implement("aes",128);
++&alg_cbc_decrypt_implement("aes",192);
++&alg_cbc_decrypt_implement("aes",256);
++
++if ($::evp) {
++    &alg_ctr32_implement("aes",128);
++    &alg_ctr32_implement("aes",192);
++    &alg_ctr32_implement("aes",256);
++}
++}}}
++
++if (!$::evp) {
++$code.=<<___;
++.global	AES_encrypt
++AES_encrypt=aes_t4_encrypt
++.global	AES_decrypt
++AES_decrypt=aes_t4_decrypt
++.global	AES_set_encrypt_key
++.align	32
++AES_set_encrypt_key:
++	andcc		%o2, 7, %g0		! check alignment
++	bnz,a,pn	%icc, 1f
++	mov		-1, %o0
++	brz,a,pn	%o0, 1f
++	mov		-1, %o0
++	brz,a,pn	%o2, 1f
++	mov		-1, %o0
++	andncc		%o1, 0x1c0, %g0
++	bnz,a,pn	%icc, 1f
++	mov		-2, %o0
++	cmp		%o1, 128
++	bl,a,pn		%icc, 1f
++	mov		-2, %o0
++	b		aes_t4_set_encrypt_key
++	nop
++1:	retl
++	nop
++.type	AES_set_encrypt_key,#function
++.size	AES_set_encrypt_key,.-AES_set_encrypt_key
++
++.global	AES_set_decrypt_key
++.align	32
++AES_set_decrypt_key:
++	andcc		%o2, 7, %g0		! check alignment
++	bnz,a,pn	%icc, 1f
++	mov		-1, %o0
++	brz,a,pn	%o0, 1f
++	mov		-1, %o0
++	brz,a,pn	%o2, 1f
++	mov		-1, %o0
++	andncc		%o1, 0x1c0, %g0
++	bnz,a,pn	%icc, 1f
++	mov		-2, %o0
++	cmp		%o1, 128
++	bl,a,pn		%icc, 1f
++	mov		-2, %o0
++	b		aes_t4_set_decrypt_key
++	nop
++1:	retl
++	nop
++.type	AES_set_decrypt_key,#function
++.size	AES_set_decrypt_key,.-AES_set_decrypt_key
++___
++
++my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
++
++$code.=<<___;
++.globl	AES_cbc_encrypt
++.align	32
++AES_cbc_encrypt:
++	ld		[$key + 240], %g1
++	nop
++	brz		$enc, .Lcbc_decrypt
++	cmp		%g1, 12
++
++	bl,pt		%icc, aes128_t4_cbc_encrypt
++	nop
++	be,pn		%icc, aes192_t4_cbc_encrypt
++	nop
++	ba		aes256_t4_cbc_encrypt
++	nop
++
++.Lcbc_decrypt:
++	bl,pt		%icc, aes128_t4_cbc_decrypt
++	nop
++	be,pn		%icc, aes192_t4_cbc_decrypt
++	nop
++	ba		aes256_t4_cbc_decrypt
++	nop
++.type	AES_cbc_encrypt,#function
++.size	AES_cbc_encrypt,.-AES_cbc_encrypt
++___
++}
++$code.=<<___;
++.asciz	"AES for SPARC T4, David S. Miller, Andy Polyakov"
++.align	4
++___
++
++&emit_assembler();
++
++close STDOUT;
+Index: crypto/des/asm/dest4-sparcv9.pl
+===================================================================
+diff -uNr openssl-1.0.1m/crypto/des/asm/dest4-sparcv9.pl openssl-1.0.1m/crypto/des/asm/dest4-sparcv9.pl
+--- openssl-1.0.1m/crypto/des/asm/dest4-sparcv9.pl 1970-01-01 01:00:00.000000000 +0100
++++ openssl-1.0.1m/crypto/des/asm/dest4-sparcv9.pl 2015-06-02 09:52:11.809148396 +0200
+@@ -0,0 +1,602 @@
++#!/usr/bin/env perl
++
++# ====================================================================
++# Written by David S. Miller <davem at devemloft.net> and Andy Polyakov
++# <appro at openssl.org>. The module is licensed under 2-clause BSD
++# license. March 2013. All rights reserved.
++# ====================================================================
++
++######################################################################
++# DES for SPARC T4.
++#
++# As with other hardware-assisted ciphers CBC encrypt results [for
++# aligned data] are virtually identical to critical path lengths:
++#
++#		DES		Triple-DES
++# CBC encrypt	4.14/4.15(*)	11.7/11.7
++# CBC decrypt	1.77/4.11(**)	6.42/7.47
++#
++#			 (*)	numbers after slash are for
++#				misaligned data;
++#			 (**)	this is result for largest
++#				block size, unlike all other
++#				cases smaller blocks results
++#				are better[?];
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++push(@INC,"${dir}","${dir}../../perlasm");
++require "sparcv9_modes.pl";
++
++&asm_init(@ARGV);
++
++$code.=<<___ if ($::abibits==64);
++.register       %g2,#scratch
++.register       %g3,#scratch
++___
++
++$code.=<<___;
++.text
++___
++
++{ my ($inp,$out)=("%o0","%o1");
++
++$code.=<<___;
++.align	32
++.globl	des_t4_key_expand
++.type	des_t4_key_expand,#function
++des_t4_key_expand:
++	andcc		$inp, 0x7, %g0
++	alignaddr	$inp, %g0, $inp
++	bz,pt		%icc, 1f
++	ldd		[$inp + 0x00], %f0
++	ldd		[$inp + 0x08], %f2
++	faligndata	%f0, %f2, %f0
++1:	des_kexpand	%f0, 0, %f0
++	des_kexpand	%f0, 1, %f2
++	std		%f0, [$out + 0x00]
++	des_kexpand	%f2, 3, %f6
++	std		%f2, [$out + 0x08]
++	des_kexpand	%f2, 2, %f4
++	des_kexpand	%f6, 3, %f10
++	std		%f6, [$out + 0x18]
++	des_kexpand	%f6, 2, %f8
++	std		%f4, [$out + 0x10]
++	des_kexpand	%f10, 3, %f14
++	std		%f10, [$out + 0x28]
++	des_kexpand	%f10, 2, %f12
++	std		%f8, [$out + 0x20]
++	des_kexpand	%f14, 1, %f16
++	std		%f14, [$out + 0x38]
++	des_kexpand	%f16, 3, %f20
++	std		%f12, [$out + 0x30]
++	des_kexpand	%f16, 2, %f18
++	std		%f16, [$out + 0x40]
++	des_kexpand	%f20, 3, %f24
++	std		%f20, [$out + 0x50]
++	des_kexpand	%f20, 2, %f22
++	std		%f18, [$out + 0x48]
++	des_kexpand	%f24, 3, %f28
++	std		%f24, [$out + 0x60]
++	des_kexpand	%f24, 2, %f26
++	std		%f22, [$out + 0x58]
++	des_kexpand	%f28, 1, %f30
++	std		%f28, [$out + 0x70]
++	std		%f26, [$out + 0x68]
++	retl
++	std		%f30, [$out + 0x78]
++.size	des_t4_key_expand,.-des_t4_key_expand
++___
++}
++{ my ($inp,$out,$len,$key,$ivec) = map("%o$_",(0..4));
++  my ($ileft,$iright,$omask) = map("%g$_",(1..3));
++
++$code.=<<___;
++.globl	des_t4_cbc_encrypt
++.align	32
++des_t4_cbc_encrypt:
++	ld		[$ivec + 0], %f0	! load ivec
++	ld		[$ivec + 4], %f1
++
++	and		$inp, 7, $ileft
++	andn		$inp, 7, $inp
++	sll		$ileft, 3, $ileft
++	mov		0xff, $omask
++	prefetch	[$inp], 20
++	prefetch	[$inp + 63], 20
++	sub		%g0, $ileft, $iright
++	and		$out, 7, %g4
++	alignaddrl	$out, %g0, $out
++	srl		$omask, %g4, $omask
++	srlx		$len, 3, $len
++	movrz		%g4, 0, $omask
++	prefetch	[$out], 22
++
++	ldd		[$key + 0x00], %f4	! load key schedule
++	ldd		[$key + 0x08], %f6
++	ldd		[$key + 0x10], %f8
++	ldd		[$key + 0x18], %f10
++	ldd		[$key + 0x20], %f12
++	ldd		[$key + 0x28], %f14
++	ldd		[$key + 0x30], %f16
++	ldd		[$key + 0x38], %f18
++	ldd		[$key + 0x40], %f20
++	ldd		[$key + 0x48], %f22
++	ldd		[$key + 0x50], %f24
++	ldd		[$key + 0x58], %f26
++	ldd		[$key + 0x60], %f28
++	ldd		[$key + 0x68], %f30
++	ldd		[$key + 0x70], %f32
++	ldd		[$key + 0x78], %f34
++
++.Ldes_cbc_enc_loop:
++	ldx		[$inp + 0], %g4
++	brz,pt		$ileft, 4f
++	nop
++
++	ldx		[$inp + 8], %g5
++	sllx		%g4, $ileft, %g4
++	srlx		%g5, $iright, %g5
++	or		%g5, %g4, %g4
++4:
++	movxtod		%g4, %f2
++	prefetch	[$inp + 8+63], 20
++	add		$inp, 8, $inp
++	fxor		%f2, %f0, %f0		! ^= ivec
++	prefetch	[$out + 63], 22
++
++	des_ip		%f0, %f0
++	des_round	%f4, %f6, %f0, %f0
++	des_round	%f8, %f10, %f0, %f0
++	des_round	%f12, %f14, %f0, %f0
++	des_round	%f16, %f18, %f0, %f0
++	des_round	%f20, %f22, %f0, %f0
++	des_round	%f24, %f26, %f0, %f0
++	des_round	%f28, %f30, %f0, %f0
++	des_round	%f32, %f34, %f0, %f0
++	des_iip		%f0, %f0
++
++	brnz,pn		$omask, 2f
++	sub		$len, 1, $len
++
++	std		%f0, [$out + 0]
++	brnz,pt		$len, .Ldes_cbc_enc_loop
++	add		$out, 8, $out
++
++	st		%f0, [$ivec + 0]	! write out ivec
++	retl
++	st		%f1, [$ivec + 4]
++
++.align	16
++2:	ldxa		[$inp]0x82, %g4		! avoid read-after-write hazard
++						! and ~4x deterioration
++						! in inp==out case
++	faligndata	%f0, %f0, %f2		! handle unaligned output
++
++	stda		%f2, [$out + $omask]0xc0	! partial store
++	add		$out, 8, $out
++	orn		%g0, $omask, $omask
++	stda		%f2, [$out + $omask]0xc0	! partial store
++
++	brnz,pt		$len, .Ldes_cbc_enc_loop+4
++	orn		%g0, $omask, $omask
++
++	st		%f0, [$ivec + 0]	! write out ivec
++	retl
++	st		%f1, [$ivec + 4]
++.type	des_t4_cbc_encrypt,#function
++.size	des_t4_cbc_encrypt,.-des_t4_cbc_encrypt
++
++.globl	des_t4_cbc_decrypt
++.align	32
++des_t4_cbc_decrypt:
++	ld		[$ivec + 0], %f2	! load ivec
++	ld		[$ivec + 4], %f3
++
++	and		$inp, 7, $ileft
++	andn		$inp, 7, $inp
++	sll		$ileft, 3, $ileft
++	mov		0xff, $omask
++	prefetch	[$inp], 20
++	prefetch	[$inp + 63], 20
++	sub		%g0, $ileft, $iright
++	and		$out, 7, %g4
++	alignaddrl	$out, %g0, $out
++	srl		$omask, %g4, $omask
++	srlx		$len, 3, $len
++	movrz		%g4, 0, $omask
++	prefetch	[$out], 22
++
++	ldd		[$key + 0x78], %f4	! load key schedule
++	ldd		[$key + 0x70], %f6
++	ldd		[$key + 0x68], %f8
++	ldd		[$key + 0x60], %f10
++	ldd		[$key + 0x58], %f12
++	ldd		[$key + 0x50], %f14
++	ldd		[$key + 0x48], %f16
++	ldd		[$key + 0x40], %f18
++	ldd		[$key + 0x38], %f20
++	ldd		[$key + 0x30], %f22
++	ldd		[$key + 0x28], %f24
++	ldd		[$key + 0x20], %f26
++	ldd		[$key + 0x18], %f28
++	ldd		[$key + 0x10], %f30
++	ldd		[$key + 0x08], %f32
++	ldd		[$key + 0x00], %f34
++
++.Ldes_cbc_dec_loop:
++	ldx		[$inp + 0], %g4
++	brz,pt		$ileft, 4f
++	nop
++
++	ldx		[$inp + 8], %g5
++	sllx		%g4, $ileft, %g4
++	srlx		%g5, $iright, %g5
++	or		%g5, %g4, %g4
++4:
++	movxtod		%g4, %f0
++	prefetch	[$inp + 8+63], 20
++	add		$inp, 8, $inp
++	prefetch	[$out + 63], 22
++
++	des_ip		%f0, %f0
++	des_round	%f4, %f6, %f0, %f0
++	des_round	%f8, %f10, %f0, %f0
++	des_round	%f12, %f14, %f0, %f0
++	des_round	%f16, %f18, %f0, %f0
++	des_round	%f20, %f22, %f0, %f0
++	des_round	%f24, %f26, %f0, %f0
++	des_round	%f28, %f30, %f0, %f0
++	des_round	%f32, %f34, %f0, %f0
++	des_iip		%f0, %f0
++
++	fxor		%f2, %f0, %f0		! ^= ivec
++	movxtod		%g4, %f2
++
++	brnz,pn		$omask, 2f
++	sub		$len, 1, $len
++
++	std		%f0, [$out + 0]
++	brnz,pt		$len, .Ldes_cbc_dec_loop
++	add		$out, 8, $out
++
++	st		%f2, [$ivec + 0]	! write out ivec
++	retl
++	st		%f3, [$ivec + 4]
++
++.align	16
++2:	ldxa		[$inp]0x82, %g4		! avoid read-after-write hazard
++						! and ~4x deterioration
++						! in inp==out case
++	faligndata	%f0, %f0, %f0		! handle unaligned output
++
++	stda		%f0, [$out + $omask]0xc0	! partial store
++	add		$out, 8, $out
++	orn		%g0, $omask, $omask
++	stda		%f0, [$out + $omask]0xc0	! partial store
++
++	brnz,pt		$len, .Ldes_cbc_dec_loop+4
++	orn		%g0, $omask, $omask
++
++	st		%f2, [$ivec + 0]	! write out ivec
++	retl
++	st		%f3, [$ivec + 4]
++.type	des_t4_cbc_decrypt,#function
++.size	des_t4_cbc_decrypt,.-des_t4_cbc_decrypt
++___
++
++# One might wonder why does one have back-to-back des_iip/des_ip
++# pairs between EDE passes. Indeed, aren't they inverse of each other?
++# They almost are. Outcome of the pair is 32-bit words being swapped
++# in target register. Consider pair of des_iip/des_ip as a way to
++# perform the due swap, it's actually fastest way in this case.
++
++$code.=<<___;
++.globl	des_t4_ede3_cbc_encrypt
++.align	32
++des_t4_ede3_cbc_encrypt:
++	ld		[$ivec + 0], %f0	! load ivec
++	ld		[$ivec + 4], %f1
++
++	and		$inp, 7, $ileft
++	andn		$inp, 7, $inp
++	sll		$ileft, 3, $ileft
++	mov		0xff, $omask
++	prefetch	[$inp], 20
++	prefetch	[$inp + 63], 20
++	sub		%g0, $ileft, $iright
++	and		$out, 7, %g4
++	alignaddrl	$out, %g0, $out
++	srl		$omask, %g4, $omask
++	srlx		$len, 3, $len
++	movrz		%g4, 0, $omask
++	prefetch	[$out], 22
++
++	ldd		[$key + 0x00], %f4	! load key schedule
++	ldd		[$key + 0x08], %f6
++	ldd		[$key + 0x10], %f8
++	ldd		[$key + 0x18], %f10
++	ldd		[$key + 0x20], %f12
++	ldd		[$key + 0x28], %f14
++	ldd		[$key + 0x30], %f16
++	ldd		[$key + 0x38], %f18
++	ldd		[$key + 0x40], %f20
++	ldd		[$key + 0x48], %f22
++	ldd		[$key + 0x50], %f24
++	ldd		[$key + 0x58], %f26
++	ldd		[$key + 0x60], %f28
++	ldd		[$key + 0x68], %f30
++	ldd		[$key + 0x70], %f32
++	ldd		[$key + 0x78], %f34
++
++.Ldes_ede3_cbc_enc_loop:
++	ldx		[$inp + 0], %g4
++	brz,pt		$ileft, 4f
++	nop
++
++	ldx		[$inp + 8], %g5
++	sllx		%g4, $ileft, %g4
++	srlx		%g5, $iright, %g5
++	or		%g5, %g4, %g4
++4:
++	movxtod		%g4, %f2
++	prefetch	[$inp + 8+63], 20
++	add		$inp, 8, $inp
++	fxor		%f2, %f0, %f0		! ^= ivec
++	prefetch	[$out + 63], 22
++
++	des_ip		%f0, %f0
++	des_round	%f4, %f6, %f0, %f0
++	des_round	%f8, %f10, %f0, %f0
++	des_round	%f12, %f14, %f0, %f0
++	des_round	%f16, %f18, %f0, %f0
++	ldd		[$key + 0x100-0x08], %f36
++	ldd		[$key + 0x100-0x10], %f38
++	des_round	%f20, %f22, %f0, %f0
++	ldd		[$key + 0x100-0x18], %f40
++	ldd		[$key + 0x100-0x20], %f42
++	des_round	%f24, %f26, %f0, %f0
++	ldd		[$key + 0x100-0x28], %f44
++	ldd		[$key + 0x100-0x30], %f46
++	des_round	%f28, %f30, %f0, %f0
++	ldd		[$key + 0x100-0x38], %f48
++	ldd		[$key + 0x100-0x40], %f50
++	des_round	%f32, %f34, %f0, %f0
++	ldd		[$key + 0x100-0x48], %f52
++	ldd		[$key + 0x100-0x50], %f54
++	des_iip		%f0, %f0
++
++	ldd		[$key + 0x100-0x58], %f56
++	ldd		[$key + 0x100-0x60], %f58
++	des_ip		%f0, %f0
++	ldd		[$key + 0x100-0x68], %f60
++	ldd		[$key + 0x100-0x70], %f62
++	des_round	%f36, %f38, %f0, %f0
++	ldd		[$key + 0x100-0x78], %f36
++	ldd		[$key + 0x100-0x80], %f38
++	des_round	%f40, %f42, %f0, %f0
++	des_round	%f44, %f46, %f0, %f0
++	des_round	%f48, %f50, %f0, %f0
++	ldd		[$key + 0x100+0x00], %f40
++	ldd		[$key + 0x100+0x08], %f42
++	des_round	%f52, %f54, %f0, %f0
++	ldd		[$key + 0x100+0x10], %f44
++	ldd		[$key + 0x100+0x18], %f46
++	des_round	%f56, %f58, %f0, %f0
++	ldd		[$key + 0x100+0x20], %f48
++	ldd		[$key + 0x100+0x28], %f50
++	des_round	%f60, %f62, %f0, %f0
++	ldd		[$key + 0x100+0x30], %f52
++	ldd		[$key + 0x100+0x38], %f54
++	des_round	%f36, %f38, %f0, %f0
++	ldd		[$key + 0x100+0x40], %f56
++	ldd		[$key + 0x100+0x48], %f58
++	des_iip		%f0, %f0
++
++	ldd		[$key + 0x100+0x50], %f60
++	ldd		[$key + 0x100+0x58], %f62
++	des_ip		%f0, %f0
++	ldd		[$key + 0x100+0x60], %f36
++	ldd		[$key + 0x100+0x68], %f38
++	des_round	%f40, %f42, %f0, %f0
++	ldd		[$key + 0x100+0x70], %f40
++	ldd		[$key + 0x100+0x78], %f42
++	des_round	%f44, %f46, %f0, %f0
++	des_round	%f48, %f50, %f0, %f0
++	des_round	%f52, %f54, %f0, %f0
++	des_round	%f56, %f58, %f0, %f0
++	des_round	%f60, %f62, %f0, %f0
++	des_round	%f36, %f38, %f0, %f0
++	des_round	%f40, %f42, %f0, %f0
++	des_iip		%f0, %f0
++
++	brnz,pn		$omask, 2f
++	sub		$len, 1, $len
++
++	std		%f0, [$out + 0]
++	brnz,pt		$len, .Ldes_ede3_cbc_enc_loop
++	add		$out, 8, $out
++
++	st		%f0, [$ivec + 0]	! write out ivec
++	retl
++	st		%f1, [$ivec + 4]
++
++.align	16
++2:	ldxa		[$inp]0x82, %g4		! avoid read-after-write hazard
++						! and ~2x deterioration
++						! in inp==out case
++	faligndata	%f0, %f0, %f2		! handle unaligned output
++
++	stda		%f2, [$out + $omask]0xc0	! partial store
++	add		$out, 8, $out
++	orn		%g0, $omask, $omask
++	stda		%f2, [$out + $omask]0xc0	! partial store
++
++	brnz,pt		$len, .Ldes_ede3_cbc_enc_loop+4
++	orn		%g0, $omask, $omask
++
++	st		%f0, [$ivec + 0]	! write out ivec
++	retl
++	st		%f1, [$ivec + 4]
++.type	des_t4_ede3_cbc_encrypt,#function
++.size	des_t4_ede3_cbc_encrypt,.-des_t4_ede3_cbc_encrypt
++
++.globl	des_t4_ede3_cbc_decrypt
++.align	32
++des_t4_ede3_cbc_decrypt:
++	ld		[$ivec + 0], %f2	! load ivec
++	ld		[$ivec + 4], %f3
++
++	and		$inp, 7, $ileft
++	andn		$inp, 7, $inp
++	sll		$ileft, 3, $ileft
++	mov		0xff, $omask
++	prefetch	[$inp], 20
++	prefetch	[$inp + 63], 20
++	sub		%g0, $ileft, $iright
++	and		$out, 7, %g4
++	alignaddrl	$out, %g0, $out
++	srl		$omask, %g4, $omask
++	srlx		$len, 3, $len
++	movrz		%g4, 0, $omask
++	prefetch	[$out], 22
++
++	ldd		[$key + 0x100+0x78], %f4	! load key schedule
++	ldd		[$key + 0x100+0x70], %f6
++	ldd		[$key + 0x100+0x68], %f8
++	ldd		[$key + 0x100+0x60], %f10
++	ldd		[$key + 0x100+0x58], %f12
++	ldd		[$key + 0x100+0x50], %f14
++	ldd		[$key + 0x100+0x48], %f16
++	ldd		[$key + 0x100+0x40], %f18
++	ldd		[$key + 0x100+0x38], %f20
++	ldd		[$key + 0x100+0x30], %f22
++	ldd		[$key + 0x100+0x28], %f24
++	ldd		[$key + 0x100+0x20], %f26
++	ldd		[$key + 0x100+0x18], %f28
++	ldd		[$key + 0x100+0x10], %f30
++	ldd		[$key + 0x100+0x08], %f32
++	ldd		[$key + 0x100+0x00], %f34
++
++.Ldes_ede3_cbc_dec_loop:
++	ldx		[$inp + 0], %g4
++	brz,pt		$ileft, 4f
++	nop
++
++	ldx		[$inp + 8], %g5
++	sllx		%g4, $ileft, %g4
++	srlx		%g5, $iright, %g5
++	or		%g5, %g4, %g4
++4:
++	movxtod		%g4, %f0
++	prefetch	[$inp + 8+63], 20
++	add		$inp, 8, $inp
++	prefetch	[$out + 63], 22
++
++	des_ip		%f0, %f0
++	des_round	%f4, %f6, %f0, %f0
++	des_round	%f8, %f10, %f0, %f0
++	des_round	%f12, %f14, %f0, %f0
++	des_round	%f16, %f18, %f0, %f0
++	ldd		[$key + 0x80+0x00], %f36
++	ldd		[$key + 0x80+0x08], %f38
++	des_round	%f20, %f22, %f0, %f0
++	ldd		[$key + 0x80+0x10], %f40
++	ldd		[$key + 0x80+0x18], %f42
++	des_round	%f24, %f26, %f0, %f0
++	ldd		[$key + 0x80+0x20], %f44
++	ldd		[$key + 0x80+0x28], %f46
++	des_round	%f28, %f30, %f0, %f0
++	ldd		[$key + 0x80+0x30], %f48
++	ldd		[$key + 0x80+0x38], %f50
++	des_round	%f32, %f34, %f0, %f0
++	ldd		[$key + 0x80+0x40], %f52
++	ldd		[$key + 0x80+0x48], %f54
++	des_iip		%f0, %f0
++
++	ldd		[$key + 0x80+0x50], %f56
++	ldd		[$key + 0x80+0x58], %f58
++	des_ip		%f0, %f0
++	ldd		[$key + 0x80+0x60], %f60
++	ldd		[$key + 0x80+0x68], %f62
++	des_round	%f36, %f38, %f0, %f0
++	ldd		[$key + 0x80+0x70], %f36
++	ldd		[$key + 0x80+0x78], %f38
++	des_round	%f40, %f42, %f0, %f0
++	des_round	%f44, %f46, %f0, %f0
++	des_round	%f48, %f50, %f0, %f0
++	ldd		[$key + 0x80-0x08], %f40
++	ldd		[$key + 0x80-0x10], %f42
++	des_round	%f52, %f54, %f0, %f0
++	ldd		[$key + 0x80-0x18], %f44
++	ldd		[$key + 0x80-0x20], %f46
++	des_round	%f56, %f58, %f0, %f0
++	ldd		[$key + 0x80-0x28], %f48
++	ldd		[$key + 0x80-0x30], %f50
++	des_round	%f60, %f62, %f0, %f0
++	ldd		[$key + 0x80-0x38], %f52
++	ldd		[$key + 0x80-0x40], %f54
++	des_round	%f36, %f38, %f0, %f0
++	ldd		[$key + 0x80-0x48], %f56
++	ldd		[$key + 0x80-0x50], %f58
++	des_iip		%f0, %f0
++
++	ldd		[$key + 0x80-0x58], %f60
++	ldd		[$key + 0x80-0x60], %f62
++	des_ip		%f0, %f0
++	ldd		[$key + 0x80-0x68], %f36
++	ldd		[$key + 0x80-0x70], %f38
++	des_round	%f40, %f42, %f0, %f0
++	ldd		[$key + 0x80-0x78], %f40
++	ldd		[$key + 0x80-0x80], %f42
++	des_round	%f44, %f46, %f0, %f0
++	des_round	%f48, %f50, %f0, %f0
++	des_round	%f52, %f54, %f0, %f0
++	des_round	%f56, %f58, %f0, %f0
++	des_round	%f60, %f62, %f0, %f0
++	des_round	%f36, %f38, %f0, %f0
++	des_round	%f40, %f42, %f0, %f0
++	des_iip		%f0, %f0
++
++	fxor		%f2, %f0, %f0		! ^= ivec
++	movxtod		%g4, %f2
++
++	brnz,pn		$omask, 2f
++	sub		$len, 1, $len
++
++	std		%f0, [$out + 0]
++	brnz,pt		$len, .Ldes_ede3_cbc_dec_loop
++	add		$out, 8, $out
++
++	st		%f2, [$ivec + 0]	! write out ivec
++	retl
++	st		%f3, [$ivec + 4]
++
++.align	16
++2:	ldxa		[$inp]0x82, %g4		! avoid read-after-write hazard
++						! and ~3x deterioration
++						! in inp==out case
++	faligndata	%f0, %f0, %f0		! handle unaligned output
++
++	stda		%f0, [$out + $omask]0xc0	! partial store
++	add		$out, 8, $out
++	orn		%g0, $omask, $omask
++	stda		%f0, [$out + $omask]0xc0	! partial store
++
++	brnz,pt		$len, .Ldes_ede3_cbc_dec_loop+4
++	orn		%g0, $omask, $omask
++
++	st		%f2, [$ivec + 0]	! write out ivec
++	retl
++	st		%f3, [$ivec + 4]
++.type	des_t4_ede3_cbc_decrypt,#function
++.size	des_t4_ede3_cbc_decrypt,.-des_t4_ede3_cbc_decrypt
++___
++}
++$code.=<<___;
++.asciz  "DES for SPARC T4, David S. Miller, Andy Polyakov"
++.align  4
++___
++
++&emit_assembler();
++
++close STDOUT;
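
The misaligned-input handling in the routines above (the $ileft/$iright
shifting that follows each "brz,pt $ileft" check) rebuilds an unaligned
64-bit word from two aligned loads, since a misaligned ldx would trap on
SPARC. A minimal standalone Perl sketch of that arithmetic, illustrative
only and not part of the patch (made-up data, 64-bit perl assumed):

    #!/usr/bin/env perl
    use strict;
    use warnings;

    # Two aligned doublewords as ldx would fetch them.
    my @mem      = (0x0001020304050607, 0x08090a0b0c0d0e0f);
    my $misalign = 3;                    # addr & 7
    my $ileft    = 8 * $misalign;        # sll  $ileft, 3, $ileft
    my $iright   = 64 - $ileft;          # sub  $iright, $ileft, $iright

    my $hi = ($mem[0] << $ileft) & 0xffffffffffffffff; # sllx %g4, $ileft, %g4
    my $lo =  $mem[1] >> $iright;                      # srlx %g5, $iright, %g5
    printf "doubleword at offset %d: 0x%016x\n", $misalign, $hi | $lo;
    # prints: doubleword at offset 3: 0x030405060708090a
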
+Index: crypto/perlasm/sparcv9_modes.pl
+===================================================================
+diff -uNr openssl-1.0.1m/crypto/perlasm/sparcv9_modes.pl openssl-1.0.1m/crypto/perlasm/sparcv9_modes.pl
+--- openssl-1.0.1m/crypto/perlasm/sparcv9_modes.pl 1970-01-01 01:00:00.000000000 +0100
++++ openssl-1.0.1m/crypto/perlasm/sparcv9_modes.pl 2015-06-02 09:52:11.809148396 +0200
+@@ -0,0 +1,1680 @@
++#!/usr/bin/env perl
++
++# Specific mode implementations for SPARC Architecture 2011. There is
++# a T4 dependency, though: an ASI value that is not specified in the
++# Architecture Manual. But as the SPARC universe is rather monocultural,
++# we assume that a processor capable of executing crypto instructions
++# can handle the ASI in question as well. This means that we ought to
++# keep our eyes open when new processors emerge...
++#
++# As for the above-mentioned ASI: it's the so-called "block initializing
++# store", which cancels the "read" in "read-update-write" on cache lines.
++# This is a "cooperative" optimization, as it reduces overall pressure
++# on the memory interface. The benefits can't be observed/quantified with
++# the usual benchmarks; on the contrary, you may notice that single-thread
++# performance for parallelizable modes is ~1.5% worse for the largest
++# block sizes [though a few percent better for shorter ones]. All of
++# this is based on suggestions from David Miller.
++
++sub asm_init {		# to be called with @ARGV as argument
++    for (@_)		{ $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
++    if ($::abibits==64)	{ $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
++    else		{ $::bias=0;    $::frame=112; $::size_t_cc="%icc"; }
++}
++
++# unified interface
++my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
++# local variables
++my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));
++
++sub alg_cbc_encrypt_implement {
++my ($alg,$bits) = @_;
++
++$::code.=<<___;
++.globl	${alg}${bits}_t4_cbc_encrypt
++.align	32
++${alg}${bits}_t4_cbc_encrypt:
++	save		%sp, -$::frame, %sp
++	sub		$inp, $out, $blk_init	! $inp!=$out
++___
++$::code.=<<___ if (!$::evp);
++	andcc		$ivec, 7, $ivoff
++	alignaddr	$ivec, %g0, $ivec
++
++	ldd		[$ivec + 0], %f0	! load ivec
++	bz,pt		%icc, 1f
++	ldd		[$ivec + 8], %f2
++	ldd		[$ivec + 16], %f4
++	faligndata	%f0, %f2, %f0
++	faligndata	%f2, %f4, %f2
++1:
++___
++$::code.=<<___ if ($::evp);
++	ld		[$ivec + 0], %f0
++	ld		[$ivec + 4], %f1
++	ld		[$ivec + 8], %f2
++	ld		[$ivec + 12], %f3
++___
++$::code.=<<___;
++	prefetch	[$inp], 20
++	prefetch	[$inp + 63], 20
++	call		_${alg}${bits}_load_enckey
++	and		$inp, 7, $ileft
++	andn		$inp, 7, $inp
++	sll		$ileft, 3, $ileft
++	mov		64, $iright
++	mov		0xff, $omask
++	sub		$iright, $ileft, $iright
++	and		$out, 7, $ooff
++	cmp		$len, 127
++	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
++	movleu		$::size_t_cc, 0, $blk_init	!	$len<128 ||
++	brnz,pn		$blk_init, .L${bits}cbc_enc_blk	!	$inp==$out)
++	srl		$omask, $ooff, $omask
++
++	alignaddrl	$out, %g0, $out
++	srlx		$len, 4, $len
++	prefetch	[$out], 22
++
++.L${bits}_cbc_enc_loop:
++	ldx		[$inp + 0], %o0
++	brz,pt		$ileft, 4f
++	ldx		[$inp + 8], %o1
++
++	ldx		[$inp + 16], %o2
++	sllx		%o0, $ileft, %o0
++	srlx		%o1, $iright, %g1
++	sllx		%o1, $ileft, %o1
++	or		%g1, %o0, %o0
++	srlx		%o2, $iright, %o2
++	or		%o2, %o1, %o1
++4:
++	xor		%g4, %o0, %o0		! ^= rk[0]
++	xor		%g5, %o1, %o1
++	movxtod		%o0, %f12
++	movxtod		%o1, %f14
++
++	fxor		%f12, %f0, %f0		! ^= ivec
++	fxor		%f14, %f2, %f2
++	prefetch	[$out + 63], 22
++	prefetch	[$inp + 16+63], 20
++	call		_${alg}${bits}_encrypt_1x
++	add		$inp, 16, $inp
++
++	brnz,pn		$ooff, 2f
++	sub		$len, 1, $len
++		
++	std		%f0, [$out + 0]
++	std		%f2, [$out + 8]
++	brnz,pt		$len, .L${bits}_cbc_enc_loop
++	add		$out, 16, $out
++___
++$::code.=<<___ if ($::evp);
++	st		%f0, [$ivec + 0]
++	st		%f1, [$ivec + 4]
++	st		%f2, [$ivec + 8]
++	st		%f3, [$ivec + 12]
++___
++$::code.=<<___ if (!$::evp);
++	brnz,pn		$ivoff, 3f
++	nop
++
++	std		%f0, [$ivec + 0]	! write out ivec
++	std		%f2, [$ivec + 8]
++___
++$::code.=<<___;
++	ret
++	restore
++
++.align	16
++2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
++						! and ~3x deterioration
++						! in inp==out case
++	faligndata	%f0, %f0, %f4		! handle unaligned output
++	faligndata	%f0, %f2, %f6
++	faligndata	%f2, %f2, %f8
++
++	stda		%f4, [$out + $omask]0xc0	! partial store
++	std		%f6, [$out + 8]
++	add		$out, 16, $out
++	orn		%g0, $omask, $omask
++	stda		%f8, [$out + $omask]0xc0	! partial store
++
++	brnz,pt		$len, .L${bits}_cbc_enc_loop+4
++	orn		%g0, $omask, $omask
++___
++$::code.=<<___ if ($::evp);
++	st		%f0, [$ivec + 0]
++	st		%f1, [$ivec + 4]
++	st		%f2, [$ivec + 8]
++	st		%f3, [$ivec + 12]
++___
++$::code.=<<___ if (!$::evp);
++	brnz,pn		$ivoff, 3f
++	nop
++
++	std		%f0, [$ivec + 0]	! write out ivec
++	std		%f2, [$ivec + 8]
++	ret
++	restore
++
++.align	16
++3:	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
++	mov		0xff, $omask
++	srl		$omask, $ivoff, $omask
++	faligndata	%f0, %f0, %f4
++	faligndata	%f0, %f2, %f6
++	faligndata	%f2, %f2, %f8
++	stda		%f4, [$ivec + $omask]0xc0
++	std		%f6, [$ivec + 8]
++	add		$ivec, 16, $ivec
++	orn		%g0, $omask, $omask
++	stda		%f8, [$ivec + $omask]0xc0
++___
++$::code.=<<___;
++	ret
++	restore
++
++!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
++.align	32
++.L${bits}cbc_enc_blk:
++	add	$out, $len, $blk_init
++	and	$blk_init, 63, $blk_init	! tail
++	sub	$len, $blk_init, $len
++	add	$blk_init, 15, $blk_init	! round up to 16n
++	srlx	$len, 4, $len
++	srl	$blk_init, 4, $blk_init
++
++.L${bits}_cbc_enc_blk_loop:
++	ldx		[$inp + 0], %o0
++	brz,pt		$ileft, 5f
++	ldx		[$inp + 8], %o1
++
++	ldx		[$inp + 16], %o2
++	sllx		%o0, $ileft, %o0
++	srlx		%o1, $iright, %g1
++	sllx		%o1, $ileft, %o1
++	or		%g1, %o0, %o0
++	srlx		%o2, $iright, %o2
++	or		%o2, %o1, %o1
++5:
++	xor		%g4, %o0, %o0		! ^= rk[0]
++	xor		%g5, %o1, %o1
++	movxtod		%o0, %f12
++	movxtod		%o1, %f14
++
++	fxor		%f12, %f0, %f0		! ^= ivec
++	fxor		%f14, %f2, %f2
++	prefetch	[$inp + 16+63], 20
++	call		_${alg}${bits}_encrypt_1x
++	add		$inp, 16, $inp
++	sub		$len, 1, $len
++		
++	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
++	add		$out, 8, $out
++	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
++	brnz,pt		$len, .L${bits}_cbc_enc_blk_loop
++	add		$out, 8, $out
++
++	membar		#StoreLoad|#StoreStore
++	brnz,pt		$blk_init, .L${bits}_cbc_enc_loop
++	mov		$blk_init, $len
++___
++$::code.=<<___ if ($::evp);
++	st		%f0, [$ivec + 0]
++	st		%f1, [$ivec + 4]
++	st		%f2, [$ivec + 8]
++	st		%f3, [$ivec + 12]
++___
++$::code.=<<___ if (!$::evp);
++	brnz,pn		$ivoff, 3b
++	nop
++
++	std		%f0, [$ivec + 0]	! write out ivec
++	std		%f2, [$ivec + 8]
++___
++$::code.=<<___;
++	ret
++	restore
++.type	${alg}${bits}_t4_cbc_encrypt,#function
++.size	${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
++___
++}
++
++sub alg_cbc_decrypt_implement {
++my ($alg,$bits) = @_;
++
++$::code.=<<___;
++.globl	${alg}${bits}_t4_cbc_decrypt
++.align	32
++${alg}${bits}_t4_cbc_decrypt:
++	save		%sp, -$::frame, %sp
++	sub		$inp, $out, $blk_init	! $inp!=$out
++___
++$::code.=<<___ if (!$::evp);
++	andcc		$ivec, 7, $ivoff
++	alignaddr	$ivec, %g0, $ivec
++
++	ldd		[$ivec + 0], %f12	! load ivec
++	bz,pt		%icc, 1f
++	ldd		[$ivec + 8], %f14
++	ldd		[$ivec + 16], %f0
++	faligndata	%f12, %f14, %f12
++	faligndata	%f14, %f0, %f14
++1:
++___
++$::code.=<<___ if ($::evp);
++	ld		[$ivec + 0], %f12	! load ivec
++	ld		[$ivec + 4], %f13
++	ld		[$ivec + 8], %f14
++	ld		[$ivec + 12], %f15
++___
++$::code.=<<___;
++	prefetch	[$inp], 20
++	prefetch	[$inp + 63], 20
++	call		_${alg}${bits}_load_deckey
++	and		$inp, 7, $ileft
++	andn		$inp, 7, $inp
++	sll		$ileft, 3, $ileft
++	mov		64, $iright
++	mov		0xff, $omask
++	sub		$iright, $ileft, $iright
++	and		$out, 7, $ooff
++	cmp		$len, 255
++	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
++	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
++	brnz,pn		$blk_init, .L${bits}cbc_dec_blk	!	$inp==$out)
++	srl		$omask, $ooff, $omask
++
++	andcc		$len, 16, %g0		! is number of blocks even?
++	srlx		$len, 4, $len
++	alignaddrl	$out, %g0, $out
++	bz		%icc, .L${bits}_cbc_dec_loop2x
++	prefetch	[$out], 22
++.L${bits}_cbc_dec_loop:
++	ldx		[$inp + 0], %o0
++	brz,pt		$ileft, 4f
++	ldx		[$inp + 8], %o1
++
++	ldx		[$inp + 16], %o2
++	sllx		%o0, $ileft, %o0
++	srlx		%o1, $iright, %g1
++	sllx		%o1, $ileft, %o1
++	or		%g1, %o0, %o0
++	srlx		%o2, $iright, %o2
++	or		%o2, %o1, %o1
++4:
++	xor		%g4, %o0, %o2		! ^= rk[0]
++	xor		%g5, %o1, %o3
++	movxtod		%o2, %f0
++	movxtod		%o3, %f2
++
++	prefetch	[$out + 63], 22
++	prefetch	[$inp + 16+63], 20
++	call		_${alg}${bits}_decrypt_1x
++	add		$inp, 16, $inp
++
++	fxor		%f12, %f0, %f0		! ^= ivec
++	fxor		%f14, %f2, %f2
++	movxtod		%o0, %f12
++	movxtod		%o1, %f14
++
++	brnz,pn		$ooff, 2f
++	sub		$len, 1, $len
++		
++	std		%f0, [$out + 0]
++	std		%f2, [$out + 8]
++	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
++	add		$out, 16, $out
++___
++$::code.=<<___ if ($::evp);
++	st		%f12, [$ivec + 0]
++	st		%f13, [$ivec + 4]
++	st		%f14, [$ivec + 8]
++	st		%f15, [$ivec + 12]
++___
++$::code.=<<___ if (!$::evp);
++	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
++	nop
++
++	std		%f12, [$ivec + 0]	! write out ivec
++	std		%f14, [$ivec + 8]
++___
++$::code.=<<___;
++	ret
++	restore
++
++.align	16
++2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
++						! and ~3x deterioration
++						! in inp==out case
++	faligndata	%f0, %f0, %f4		! handle unaligned output
++	faligndata	%f0, %f2, %f6
++	faligndata	%f2, %f2, %f8
++
++	stda		%f4, [$out + $omask]0xc0	! partial store
++	std		%f6, [$out + 8]
++	add		$out, 16, $out
++	orn		%g0, $omask, $omask
++	stda		%f8, [$out + $omask]0xc0	! partial store
++
++	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
++	orn		%g0, $omask, $omask
++___
++$::code.=<<___ if ($::evp);
++	st		%f12, [$ivec + 0]
++	st		%f13, [$ivec + 4]
++	st		%f14, [$ivec + 8]
++	st		%f15, [$ivec + 12]
++___
++$::code.=<<___ if (!$::evp);
++	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
++	nop
++
++	std		%f12, [$ivec + 0]	! write out ivec
++	std		%f14, [$ivec + 8]
++___
++$::code.=<<___;
++	ret
++	restore
++
++!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
++.align	32
++.L${bits}_cbc_dec_loop2x:
++	ldx		[$inp + 0], %o0
++	ldx		[$inp + 8], %o1
++	ldx		[$inp + 16], %o2
++	brz,pt		$ileft, 4f
++	ldx		[$inp + 24], %o3
++
++	ldx		[$inp + 32], %o4
++	sllx		%o0, $ileft, %o0
++	srlx		%o1, $iright, %g1
++	or		%g1, %o0, %o0
++	sllx		%o1, $ileft, %o1
++	srlx		%o2, $iright, %g1
++	or		%g1, %o1, %o1
++	sllx		%o2, $ileft, %o2
++	srlx		%o3, $iright, %g1
++	or		%g1, %o2, %o2
++	sllx		%o3, $ileft, %o3
++	srlx		%o4, $iright, %o4
++	or		%o4, %o3, %o3
++4:
++	xor		%g4, %o0, %o4		! ^= rk[0]
++	xor		%g5, %o1, %o5
++	movxtod		%o4, %f0
++	movxtod		%o5, %f2
++	xor		%g4, %o2, %o4
++	xor		%g5, %o3, %o5
++	movxtod		%o4, %f4
++	movxtod		%o5, %f6
++
++	prefetch	[$out + 63], 22
++	prefetch	[$inp + 32+63], 20
++	call		_${alg}${bits}_decrypt_2x
++	add		$inp, 32, $inp
++
++	movxtod		%o0, %f8
++	movxtod		%o1, %f10
++	fxor		%f12, %f0, %f0		! ^= ivec
++	fxor		%f14, %f2, %f2
++	movxtod		%o2, %f12
++	movxtod		%o3, %f14
++	fxor		%f8, %f4, %f4
++	fxor		%f10, %f6, %f6
++
++	brnz,pn		$ooff, 2f
++	sub		$len, 2, $len
++		
++	std		%f0, [$out + 0]
++	std		%f2, [$out + 8]
++	std		%f4, [$out + 16]
++	std		%f6, [$out + 24]
++	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
++	add		$out, 32, $out
++___
++$::code.=<<___ if ($::evp);
++	st		%f12, [$ivec + 0]
++	st		%f13, [$ivec + 4]
++	st		%f14, [$ivec + 8]
++	st		%f15, [$ivec + 12]
++___
++$::code.=<<___ if (!$::evp);
++	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
++	nop
++
++	std		%f12, [$ivec + 0]	! write out ivec
++	std		%f14, [$ivec + 8]
++___
++$::code.=<<___;
++	ret
++	restore
++
++.align	16
++2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
++						! and ~3x deterioration
++						! in inp==out case
++	faligndata	%f0, %f0, %f8		! handle unaligned output
++	faligndata	%f0, %f2, %f0
++	faligndata	%f2, %f4, %f2
++	faligndata	%f4, %f6, %f4
++	faligndata	%f6, %f6, %f6
++	stda		%f8, [$out + $omask]0xc0	! partial store
++	std		%f0, [$out + 8]
++	std		%f2, [$out + 16]
++	std		%f4, [$out + 24]
++	add		$out, 32, $out
++	orn		%g0, $omask, $omask
++	stda		%f6, [$out + $omask]0xc0	! partial store
++
++	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
++	orn		%g0, $omask, $omask
++___
++$::code.=<<___ if ($::evp);
++	st		%f12, [$ivec + 0]
++	st		%f13, [$ivec + 4]
++	st		%f14, [$ivec + 8]
++	st		%f15, [$ivec + 12]
++___
++$::code.=<<___ if (!$::evp);
++	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
++	nop
++
++	std		%f12, [$ivec + 0]	! write out ivec
++	std		%f14, [$ivec + 8]
++	ret
++	restore
++
++.align	16
++.L${bits}_cbc_dec_unaligned_ivec:
++	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
++	mov		0xff, $omask
++	srl		$omask, $ivoff, $omask
++	faligndata	%f12, %f12, %f0
++	faligndata	%f12, %f14, %f2
++	faligndata	%f14, %f14, %f4
++	stda		%f0, [$ivec + $omask]0xc0
++	std		%f2, [$ivec + 8]
++	add		$ivec, 16, $ivec
++	orn		%g0, $omask, $omask
++	stda		%f4, [$ivec + $omask]0xc0
++___
++$::code.=<<___;
++	ret
++	restore
++
++!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
++.align	32
++.L${bits}cbc_dec_blk:
++	add	$out, $len, $blk_init
++	and	$blk_init, 63, $blk_init	! tail
++	sub	$len, $blk_init, $len
++	add	$blk_init, 15, $blk_init	! round up to 16n
++	srlx	$len, 4, $len
++	srl	$blk_init, 4, $blk_init
++	sub	$len, 1, $len
++	add	$blk_init, 1, $blk_init
++
++.L${bits}_cbc_dec_blk_loop2x:
++	ldx		[$inp + 0], %o0
++	ldx		[$inp + 8], %o1
++	ldx		[$inp + 16], %o2
++	brz,pt		$ileft, 5f
++	ldx		[$inp + 24], %o3
++
++	ldx		[$inp + 32], %o4
++	sllx		%o0, $ileft, %o0
++	srlx		%o1, $iright, %g1
++	or		%g1, %o0, %o0
++	sllx		%o1, $ileft, %o1
++	srlx		%o2, $iright, %g1
++	or		%g1, %o1, %o1
++	sllx		%o2, $ileft, %o2
++	srlx		%o3, $iright, %g1
++	or		%g1, %o2, %o2
++	sllx		%o3, $ileft, %o3
++	srlx		%o4, $iright, %o4
++	or		%o4, %o3, %o3
++5:
++	xor		%g4, %o0, %o4		! ^= rk[0]
++	xor		%g5, %o1, %o5
++	movxtod		%o4, %f0
++	movxtod		%o5, %f2
++	xor		%g4, %o2, %o4
++	xor		%g5, %o3, %o5
++	movxtod		%o4, %f4
++	movxtod		%o5, %f6
++
++	prefetch	[$inp + 32+63], 20
++	call		_${alg}${bits}_decrypt_2x
++	add		$inp, 32, $inp
++	subcc		$len, 2, $len
++
++	movxtod		%o0, %f8
++	movxtod		%o1, %f10
++	fxor		%f12, %f0, %f0		! ^= ivec
++	fxor		%f14, %f2, %f2
++	movxtod		%o2, %f12
++	movxtod		%o3, %f14
++	fxor		%f8, %f4, %f4
++	fxor		%f10, %f6, %f6
++
++	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
++	add		$out, 8, $out
++	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
++	add		$out, 8, $out
++	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
++	add		$out, 8, $out
++	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
++	bgu,pt		$::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
++	add		$out, 8, $out
++
++	add		$blk_init, $len, $len
++	andcc		$len, 1, %g0		! is number of blocks even?
++	membar		#StoreLoad|#StoreStore
++	bnz,pt		%icc, .L${bits}_cbc_dec_loop
++	srl		$len, 0, $len
++	brnz,pn		$len, .L${bits}_cbc_dec_loop2x
++	nop
++___
++$::code.=<<___ if ($::evp);
++	st		%f12, [$ivec + 0]	! write out ivec
++	st		%f13, [$ivec + 4]
++	st		%f14, [$ivec + 8]
++	st		%f15, [$ivec + 12]
++___
++$::code.=<<___ if (!$::evp);
++	brnz,pn		$ivoff, 3b
++	nop
++
++	std		%f12, [$ivec + 0]	! write out ivec
++	std		%f14, [$ivec + 8]
++___
++$::code.=<<___;
++	ret
++	restore
++.type	${alg}${bits}_t4_cbc_decrypt,#function
++.size	${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
++___
++}
++
++sub alg_ctr32_implement {
++my ($alg,$bits) = @_;
++
++$::code.=<<___;
++.globl	${alg}${bits}_t4_ctr32_encrypt
++.align	32
++${alg}${bits}_t4_ctr32_encrypt:
++	save		%sp, -$::frame, %sp
++
++	prefetch	[$inp], 20
++	prefetch	[$inp + 63], 20
++	call		_${alg}${bits}_load_enckey
++	sllx		$len, 4, $len
++
++	ld		[$ivec + 0], %l4	! counter
++	ld		[$ivec + 4], %l5
++	ld		[$ivec + 8], %l6
++	ld		[$ivec + 12], %l7
++
++	sllx		%l4, 32, %o5
++	or		%l5, %o5, %o5
++	sllx		%l6, 32, %g1
++	xor		%o5, %g4, %g4		! ^= rk[0]
++	xor		%g1, %g5, %g5
++	movxtod		%g4, %f14		! most significant 64 bits
++
++	sub		$inp, $out, $blk_init	! $inp!=$out
++	and		$inp, 7, $ileft
++	andn		$inp, 7, $inp
++	sll		$ileft, 3, $ileft
++	mov		64, $iright
++	mov		0xff, $omask
++	sub		$iright, $ileft, $iright
++	and		$out, 7, $ooff
++	cmp		$len, 255
++	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
++	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
++	brnz,pn		$blk_init, .L${bits}_ctr32_blk	!	$inp==$out)
++	srl		$omask, $ooff, $omask
++
++	andcc		$len, 16, %g0		! is number of blocks even?
++	alignaddrl	$out, %g0, $out
++	bz		%icc, .L${bits}_ctr32_loop2x
++	srlx		$len, 4, $len
++.L${bits}_ctr32_loop:
++	ldx		[$inp + 0], %o0
++	brz,pt		$ileft, 4f
++	ldx		[$inp + 8], %o1
++
++	ldx		[$inp + 16], %o2
++	sllx		%o0, $ileft, %o0
++	srlx		%o1, $iright, %g1
++	sllx		%o1, $ileft, %o1
++	or		%g1, %o0, %o0
++	srlx		%o2, $iright, %o2
++	or		%o2, %o1, %o1
++4:
++	xor		%g5, %l7, %g1		! ^= rk[0]
++	add		%l7, 1, %l7
++	movxtod		%g1, %f2
++	srl		%l7, 0, %l7		! clruw
++	prefetch	[$out + 63], 22
++	prefetch	[$inp + 16+63], 20
++___
++$::code.=<<___ if ($alg eq "aes");
++	aes_eround01	%f16, %f14, %f2, %f4
++	aes_eround23	%f18, %f14, %f2, %f2
++___
++$::code.=<<___ if ($alg eq "cmll");
++	camellia_f	%f16, %f2, %f14, %f2
++	camellia_f	%f18, %f14, %f2, %f0
++___
++$::code.=<<___;
++	call		_${alg}${bits}_encrypt_1x+8
++	add		$inp, 16, $inp
++
++	movxtod		%o0, %f10
++	movxtod		%o1, %f12
++	fxor		%f10, %f0, %f0		! ^= inp
++	fxor		%f12, %f2, %f2
++
++	brnz,pn		$ooff, 2f
++	sub		$len, 1, $len
++		
++	std		%f0, [$out + 0]
++	std		%f2, [$out + 8]
++	brnz,pt		$len, .L${bits}_ctr32_loop2x
++	add		$out, 16, $out
++
++	ret
++	restore
++
++.align	16
++2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
++						! and ~3x deterioration
++						! in inp==out case
++	faligndata	%f0, %f0, %f4		! handle unaligned output
++	faligndata	%f0, %f2, %f6
++	faligndata	%f2, %f2, %f8
++	stda		%f4, [$out + $omask]0xc0	! partial store
++	std		%f6, [$out + 8]
++	add		$out, 16, $out
++	orn		%g0, $omask, $omask
++	stda		%f8, [$out + $omask]0xc0	! partial store
++
++	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
++	orn		%g0, $omask, $omask
++
++	ret
++	restore
++
++!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
++.align	32
++.L${bits}_ctr32_loop2x:
++	ldx		[$inp + 0], %o0
++	ldx		[$inp + 8], %o1
++	ldx		[$inp + 16], %o2
++	brz,pt		$ileft, 4f
++	ldx		[$inp + 24], %o3
++
++	ldx		[$inp + 32], %o4
++	sllx		%o0, $ileft, %o0
++	srlx		%o1, $iright, %g1
++	or		%g1, %o0, %o0
++	sllx		%o1, $ileft, %o1
++	srlx		%o2, $iright, %g1
++	or		%g1, %o1, %o1
++	sllx		%o2, $ileft, %o2
++	srlx		%o3, $iright, %g1
++	or		%g1, %o2, %o2
++	sllx		%o3, $ileft, %o3
++	srlx		%o4, $iright, %o4
++	or		%o4, %o3, %o3
++4:
++	xor		%g5, %l7, %g1		! ^= rk[0]
++	add		%l7, 1, %l7
++	movxtod		%g1, %f2
++	srl		%l7, 0, %l7		! clruw
++	xor		%g5, %l7, %g1
++	add		%l7, 1, %l7
++	movxtod		%g1, %f6
++	srl		%l7, 0, %l7		! clruw
++	prefetch	[$out + 63], 22
++	prefetch	[$inp + 32+63], 20
++___
++$::code.=<<___ if ($alg eq "aes");
++	aes_eround01	%f16, %f14, %f2, %f8
++	aes_eround23	%f18, %f14, %f2, %f2
++	aes_eround01	%f16, %f14, %f6, %f10
++	aes_eround23	%f18, %f14, %f6, %f6
++___
++$::code.=<<___ if ($alg eq "cmll");
++	camellia_f	%f16, %f2, %f14, %f2
++	camellia_f	%f16, %f6, %f14, %f6
++	camellia_f	%f18, %f14, %f2, %f0
++	camellia_f	%f18, %f14, %f6, %f4
++___
++$::code.=<<___;
++	call		_${alg}${bits}_encrypt_2x+16
++	add		$inp, 32, $inp
++
++	movxtod		%o0, %f8
++	movxtod		%o1, %f10
++	movxtod		%o2, %f12
++	fxor		%f8, %f0, %f0		! ^= inp
++	movxtod		%o3, %f8
++	fxor		%f10, %f2, %f2
++	fxor		%f12, %f4, %f4
++	fxor		%f8, %f6, %f6
++
++	brnz,pn		$ooff, 2f
++	sub		$len, 2, $len
++		
++	std		%f0, [$out + 0]
++	std		%f2, [$out + 8]
++	std		%f4, [$out + 16]
++	std		%f6, [$out + 24]
++	brnz,pt		$len, .L${bits}_ctr32_loop2x
++	add		$out, 32, $out
++
++	ret
++	restore
++
++.align	16
++2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
++						! and ~3x deterioration
++						! in inp==out case
++	faligndata	%f0, %f0, %f8		! handle unaligned output
++	faligndata	%f0, %f2, %f0
++	faligndata	%f2, %f4, %f2
++	faligndata	%f4, %f6, %f4
++	faligndata	%f6, %f6, %f6
++
++	stda		%f8, [$out + $omask]0xc0	! partial store
++	std		%f0, [$out + 8]
++	std		%f2, [$out + 16]
++	std		%f4, [$out + 24]
++	add		$out, 32, $out
++	orn		%g0, $omask, $omask
++	stda		%f6, [$out + $omask]0xc0	! partial store
++
++	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
++	orn		%g0, $omask, $omask
++
++	ret
++	restore
++
++!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
++.align	32
++.L${bits}_ctr32_blk:
++	add	$out, $len, $blk_init
++	and	$blk_init, 63, $blk_init	! tail
++	sub	$len, $blk_init, $len
++	add	$blk_init, 15, $blk_init	! round up to 16n
++	srlx	$len, 4, $len
++	srl	$blk_init, 4, $blk_init
++	sub	$len, 1, $len
++	add	$blk_init, 1, $blk_init
++
++.L${bits}_ctr32_blk_loop2x:
++	ldx		[$inp + 0], %o0
++	ldx		[$inp + 8], %o1
++	ldx		[$inp + 16], %o2
++	brz,pt		$ileft, 5f
++	ldx		[$inp + 24], %o3
++
++	ldx		[$inp + 32], %o4
++	sllx		%o0, $ileft, %o0
++	srlx		%o1, $iright, %g1
++	or		%g1, %o0, %o0
++	sllx		%o1, $ileft, %o1
++	srlx		%o2, $iright, %g1
++	or		%g1, %o1, %o1
++	sllx		%o2, $ileft, %o2
++	srlx		%o3, $iright, %g1
++	or		%g1, %o2, %o2
++	sllx		%o3, $ileft, %o3
++	srlx		%o4, $iright, %o4
++	or		%o4, %o3, %o3
++5:
++	xor		%g5, %l7, %g1		! ^= rk[0]
++	add		%l7, 1, %l7
++	movxtod		%g1, %f2
++	srl		%l7, 0, %l7		! clruw
++	xor		%g5, %l7, %g1
++	add		%l7, 1, %l7
++	movxtod		%g1, %f6
++	srl		%l7, 0, %l7		! clruw
++	prefetch	[$inp + 32+63], 20
++___
++$::code.=<<___ if ($alg eq "aes");
++	aes_eround01	%f16, %f14, %f2, %f8
++	aes_eround23	%f18, %f14, %f2, %f2
++	aes_eround01	%f16, %f14, %f6, %f10
++	aes_eround23	%f18, %f14, %f6, %f6
++___
++$::code.=<<___ if ($alg eq "cmll");
++	camellia_f	%f16, %f2, %f14, %f2
++	camellia_f	%f16, %f6, %f14, %f6
++	camellia_f	%f18, %f14, %f2, %f0
++	camellia_f	%f18, %f14, %f6, %f4
++___
++$::code.=<<___;
++	call		_${alg}${bits}_encrypt_2x+16
++	add		$inp, 32, $inp
++	subcc		$len, 2, $len
++
++	movxtod		%o0, %f8
++	movxtod		%o1, %f10
++	movxtod		%o2, %f12
++	fxor		%f8, %f0, %f0		! ^= inp
++	movxtod		%o3, %f8
++	fxor		%f10, %f2, %f2
++	fxor		%f12, %f4, %f4
++	fxor		%f8, %f6, %f6
++
++	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
++	add		$out, 8, $out
++	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
++	add		$out, 8, $out
++	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
++	add		$out, 8, $out
++	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
++	bgu,pt		$::size_t_cc, .L${bits}_ctr32_blk_loop2x
++	add		$out, 8, $out
++
++	add		$blk_init, $len, $len
++	andcc		$len, 1, %g0		! is number of blocks even?
++	membar		#StoreLoad|#StoreStore
++	bnz,pt		%icc, .L${bits}_ctr32_loop
++	srl		$len, 0, $len
++	brnz,pn		$len, .L${bits}_ctr32_loop2x
++	nop
++
++	ret
++	restore
++.type	${alg}${bits}_t4_ctr32_encrypt,#function
++.size	${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
++___
++}
++
++sub alg_xts_implement {
++my ($alg,$bits,$dir) = @_;
++my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
++my $rem=$ivec;
++
++$::code.=<<___;
++.globl	${alg}${bits}_t4_xts_${dir}crypt
++.align	32
++${alg}${bits}_t4_xts_${dir}crypt:
++	save		%sp, -$::frame-16, %sp
++
++	mov		$ivec, %o0
++	add		%fp, $::bias-16, %o1
++	call		${alg}_t4_encrypt
++	mov		$key2, %o2
++
++	add		%fp, $::bias-16, %l7
++	ldxa		[%l7]0x88, %g2
++	add		%fp, $::bias-8, %l7
++	ldxa		[%l7]0x88, %g3		! %g3:%g2 is tweak
++
++	sethi		%hi(0x76543210), %l7
++	or		%l7, %lo(0x76543210), %l7
++	bmask		%l7, %g0, %g0		! byte swap mask
++
++	prefetch	[$inp], 20
++	prefetch	[$inp + 63], 20
++	call		_${alg}${bits}_load_${dir}ckey
++	and		$len, 15,  $rem
++	and		$len, -16, $len
++___
++$code.=<<___ if ($dir eq "de");
++	mov		0, %l7
++	movrnz		$rem, 16,  %l7
++	sub		$len, %l7, $len
++___
++$code.=<<___;
++
++	sub		$inp, $out, $blk_init	! $inp!=$out
++	and		$inp, 7, $ileft
++	andn		$inp, 7, $inp
++	sll		$ileft, 3, $ileft
++	mov		64, $iright
++	mov		0xff, $omask
++	sub		$iright, $ileft, $iright
++	and		$out, 7, $ooff
++	cmp		$len, 255
++	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
++	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
++	brnz,pn		$blk_init, .L${bits}_xts_${dir}blk !	$inp==$out)
++	srl		$omask, $ooff, $omask
++
++	andcc		$len, 16, %g0		! is number of blocks even?
++___
++$code.=<<___ if ($dir eq "de");
++	brz,pn		$len, .L${bits}_xts_${dir}steal
++___
++$code.=<<___;
++	alignaddrl	$out, %g0, $out
++	bz		%icc, .L${bits}_xts_${dir}loop2x
++	srlx		$len, 4, $len
++.L${bits}_xts_${dir}loop:
++	ldx		[$inp + 0], %o0
++	brz,pt		$ileft, 4f
++	ldx		[$inp + 8], %o1
++
++	ldx		[$inp + 16], %o2
++	sllx		%o0, $ileft, %o0
++	srlx		%o1, $iright, %g1
++	sllx		%o1, $ileft, %o1
++	or		%g1, %o0, %o0
++	srlx		%o2, $iright, %o2
++	or		%o2, %o1, %o1
++4:
++	movxtod		%g2, %f12
++	movxtod		%g3, %f14
++	bshuffle	%f12, %f12, %f12
++	bshuffle	%f14, %f14, %f14
++
++	xor		%g4, %o0, %o0		! ^= rk[0]
++	xor		%g5, %o1, %o1
++	movxtod		%o0, %f0
++	movxtod		%o1, %f2
++
++	fxor		%f12, %f0, %f0		! ^= tweak[0]
++	fxor		%f14, %f2, %f2
++
++	prefetch	[$out + 63], 22
++	prefetch	[$inp + 16+63], 20
++	call		_${alg}${bits}_${dir}crypt_1x
++	add		$inp, 16, $inp
++
++	fxor		%f12, %f0, %f0		! ^= tweak[0]
++	fxor		%f14, %f2, %f2
++
++	srax		%g3, 63, %l7		! next tweak value
++	addcc		%g2, %g2, %g2
++	and		%l7, 0x87, %l7
++	addxc		%g3, %g3, %g3
++	xor		%l7, %g2, %g2
++
++	brnz,pn		$ooff, 2f
++	sub		$len, 1, $len
++		
++	std		%f0, [$out + 0]
++	std		%f2, [$out + 8]
++	brnz,pt		$len, .L${bits}_xts_${dir}loop2x
++	add		$out, 16, $out
++
++	brnz,pn		$rem, .L${bits}_xts_${dir}steal
++	nop
++
++	ret
++	restore
++
++.align	16
++2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
++						! and ~3x deterioration
++						! in inp==out case
++	faligndata	%f0, %f0, %f4		! handle unaligned output
++	faligndata	%f0, %f2, %f6
++	faligndata	%f2, %f2, %f8
++	stda		%f4, [$out + $omask]0xc0	! partial store
++	std		%f6, [$out + 8]
++	add		$out, 16, $out
++	orn		%g0, $omask, $omask
++	stda		%f8, [$out + $omask]0xc0	! partial store
++
++	brnz,pt		$len, .L${bits}_xts_${dir}loop2x+4
++	orn		%g0, $omask, $omask
++
++	brnz,pn		$rem, .L${bits}_xts_${dir}steal
++	nop
++
++	ret
++	restore
++
++!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
++.align	32
++.L${bits}_xts_${dir}loop2x:
++	ldx		[$inp + 0], %o0
++	ldx		[$inp + 8], %o1
++	ldx		[$inp + 16], %o2
++	brz,pt		$ileft, 4f
++	ldx		[$inp + 24], %o3
++
++	ldx		[$inp + 32], %o4
++	sllx		%o0, $ileft, %o0
++	srlx		%o1, $iright, %g1
++	or		%g1, %o0, %o0
++	sllx		%o1, $ileft, %o1
++	srlx		%o2, $iright, %g1
++	or		%g1, %o1, %o1
++	sllx		%o2, $ileft, %o2
++	srlx		%o3, $iright, %g1
++	or		%g1, %o2, %o2
++	sllx		%o3, $ileft, %o3
++	srlx		%o4, $iright, %o4
++	or		%o4, %o3, %o3
++4:
++	movxtod		%g2, %f12
++	movxtod		%g3, %f14
++	bshuffle	%f12, %f12, %f12
++	bshuffle	%f14, %f14, %f14
++
++	srax		%g3, 63, %l7		! next tweak value
++	addcc		%g2, %g2, %g2
++	and		%l7, 0x87, %l7
++	addxc		%g3, %g3, %g3
++	xor		%l7, %g2, %g2
++
++	movxtod		%g2, %f8
++	movxtod		%g3, %f10
++	bshuffle	%f8,  %f8,  %f8
++	bshuffle	%f10, %f10, %f10
++
++	xor		%g4, %o0, %o0		! ^= rk[0]
++	xor		%g5, %o1, %o1
++	xor		%g4, %o2, %o2		! ^= rk[0]
++	xor		%g5, %o3, %o3
++	movxtod		%o0, %f0
++	movxtod		%o1, %f2
++	movxtod		%o2, %f4
++	movxtod		%o3, %f6
++
++	fxor		%f12, %f0, %f0		! ^= tweak[0]
++	fxor		%f14, %f2, %f2
++	fxor		%f8,  %f4, %f4		! ^= tweak[0]
++	fxor		%f10, %f6, %f6
++
++	prefetch	[$out + 63], 22
++	prefetch	[$inp + 32+63], 20
++	call		_${alg}${bits}_${dir}crypt_2x
++	add		$inp, 32, $inp
++
++	movxtod		%g2, %f8
++	movxtod		%g3, %f10
++
++	srax		%g3, 63, %l7		! next tweak value
++	addcc		%g2, %g2, %g2
++	and		%l7, 0x87, %l7
++	addxc		%g3, %g3, %g3
++	xor		%l7, %g2, %g2
++
++	bshuffle	%f8,  %f8,  %f8
++	bshuffle	%f10, %f10, %f10
++
++	fxor		%f12, %f0, %f0		! ^= tweak[0]
++	fxor		%f14, %f2, %f2
++	fxor		%f8,  %f4, %f4
++	fxor		%f10, %f6, %f6
++
++	brnz,pn		$ooff, 2f
++	sub		$len, 2, $len
++		
++	std		%f0, [$out + 0]
++	std		%f2, [$out + 8]
++	std		%f4, [$out + 16]
++	std		%f6, [$out + 24]
++	brnz,pt		$len, .L${bits}_xts_${dir}loop2x
++	add		$out, 32, $out
++
++	fsrc2		%f4, %f0
++	fsrc2		%f6, %f2
++	brnz,pn		$rem, .L${bits}_xts_${dir}steal
++	nop
++
++	ret
++	restore
++
++.align	16
++2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
++						! and ~3x deterioration
++						! in inp==out case
++	faligndata	%f0, %f0, %f8		! handle unaligned output
++	faligndata	%f0, %f2, %f10
++	faligndata	%f2, %f4, %f12
++	faligndata	%f4, %f6, %f14
++	faligndata	%f6, %f6, %f0
++
++	stda		%f8, [$out + $omask]0xc0	! partial store
++	std		%f10, [$out + 8]
++	std		%f12, [$out + 16]
++	std		%f14, [$out + 24]
++	add		$out, 32, $out
++	orn		%g0, $omask, $omask
++	stda		%f0, [$out + $omask]0xc0	! partial store
++
++	brnz,pt		$len, .L${bits}_xts_${dir}loop2x+4
++	orn		%g0, $omask, $omask
++
++	fsrc2		%f4, %f0
++	fsrc2		%f6, %f2
++	brnz,pn		$rem, .L${bits}_xts_${dir}steal
++	nop
++
++	ret
++	restore
++
++!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
++.align	32
++.L${bits}_xts_${dir}blk:
++	add	$out, $len, $blk_init
++	and	$blk_init, 63, $blk_init	! tail
++	sub	$len, $blk_init, $len
++	add	$blk_init, 15, $blk_init	! round up to 16n
++	srlx	$len, 4, $len
++	srl	$blk_init, 4, $blk_init
++	sub	$len, 1, $len
++	add	$blk_init, 1, $blk_init
++
++.L${bits}_xts_${dir}blk2x:
++	ldx		[$inp + 0], %o0
++	ldx		[$inp + 8], %o1
++	ldx		[$inp + 16], %o2
++	brz,pt		$ileft, 5f
++	ldx		[$inp + 24], %o3
++
++	ldx		[$inp + 32], %o4
++	sllx		%o0, $ileft, %o0
++	srlx		%o1, $iright, %g1
++	or		%g1, %o0, %o0
++	sllx		%o1, $ileft, %o1
++	srlx		%o2, $iright, %g1
++	or		%g1, %o1, %o1
++	sllx		%o2, $ileft, %o2
++	srlx		%o3, $iright, %g1
++	or		%g1, %o2, %o2
++	sllx		%o3, $ileft, %o3
++	srlx		%o4, $iright, %o4
++	or		%o4, %o3, %o3
++5:
++	movxtod		%g2, %f12
++	movxtod		%g3, %f14
++	bshuffle	%f12, %f12, %f12
++	bshuffle	%f14, %f14, %f14
++
++	srax		%g3, 63, %l7		! next tweak value
++	addcc		%g2, %g2, %g2
++	and		%l7, 0x87, %l7
++	addxc		%g3, %g3, %g3
++	xor		%l7, %g2, %g2
++
++	movxtod		%g2, %f8
++	movxtod		%g3, %f10
++	bshuffle	%f8,  %f8,  %f8
++	bshuffle	%f10, %f10, %f10
++
++	xor		%g4, %o0, %o0		! ^= rk[0]
++	xor		%g5, %o1, %o1
++	xor		%g4, %o2, %o2		! ^= rk[0]
++	xor		%g5, %o3, %o3
++	movxtod		%o0, %f0
++	movxtod		%o1, %f2
++	movxtod		%o2, %f4
++	movxtod		%o3, %f6
++
++	fxor		%f12, %f0, %f0		! ^= tweak[0]
++	fxor		%f14, %f2, %f2
++	fxor		%f8,  %f4, %f4		! ^= tweak[0]
++	fxor		%f10, %f6, %f6
++
++	prefetch	[$inp + 32+63], 20
++	call		_${alg}${bits}_${dir}crypt_2x
++	add		$inp, 32, $inp
++
++	movxtod		%g2, %f8
++	movxtod		%g3, %f10
++
++	srax		%g3, 63, %l7		! next tweak value
++	addcc		%g2, %g2, %g2
++	and		%l7, 0x87, %l7
++	addxc		%g3, %g3, %g3
++	xor		%l7, %g2, %g2
++
++	bshuffle	%f8,  %f8,  %f8
++	bshuffle	%f10, %f10, %f10
++
++	fxor		%f12, %f0, %f0		! ^= tweak[0]
++	fxor		%f14, %f2, %f2
++	fxor		%f8,  %f4, %f4
++	fxor		%f10, %f6, %f6
++
++	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
++	add		$out, 8, $out
++	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
++	add		$out, 8, $out
++	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
++	add		$out, 8, $out
++	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
++	bgu,pt		$::size_t_cc, .L${bits}_xts_${dir}blk2x
++	add		$out, 8, $out
++
++	add		$blk_init, $len, $len
++	andcc		$len, 1, %g0		! is number of blocks even?
++	membar		#StoreLoad|#StoreStore
++	bnz,pt		%icc, .L${bits}_xts_${dir}loop
++	srl		$len, 0, $len
++	brnz,pn		$len, .L${bits}_xts_${dir}loop2x
++	nop
++
++	fsrc2		%f4, %f0
++	fsrc2		%f6, %f2
++	brnz,pn		$rem, .L${bits}_xts_${dir}steal
++	nop
++
++	ret
++	restore
++!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
++___
++$code.=<<___ if ($dir eq "en");
++.align	32
++.L${bits}_xts_${dir}steal:
++	std		%f0, [%fp + $::bias-16]	! copy of output
++	std		%f2, [%fp + $::bias-8]
++
++	srl		$ileft, 3, $ileft
++	add		%fp, $::bias-16, %l7
++	add		$inp, $ileft, $inp	! original $inp+$len&-15
++	add		$out, $ooff, $out	! original $out+$len&-15
++	mov		0, $ileft
++	nop					! align
++
++.L${bits}_xts_${dir}stealing:
++	ldub		[$inp + $ileft], %o0
++	ldub		[%l7  + $ileft], %o1
++	dec		$rem
++	stb		%o0, [%l7  + $ileft]
++	stb		%o1, [$out + $ileft]
++	brnz		$rem, .L${bits}_xts_${dir}stealing
++	inc		$ileft
++
++	mov		%l7, $inp
++	sub		$out, 16, $out
++	mov		0, $ileft
++	sub		$out, $ooff, $out
++	ba		.L${bits}_xts_${dir}loop	! one more time
++	mov		1, $len				! $rem is 0
++___
++$code.=<<___ if ($dir eq "de");
++.align	32
++.L${bits}_xts_${dir}steal:
++	ldx		[$inp + 0], %o0
++	brz,pt		$ileft, 8f
++	ldx		[$inp + 8], %o1
++
++	ldx		[$inp + 16], %o2
++	sllx		%o0, $ileft, %o0
++	srlx		%o1, $iright, %g1
++	sllx		%o1, $ileft, %o1
++	or		%g1, %o0, %o0
++	srlx		%o2, $iright, %o2
++	or		%o2, %o1, %o1
++8:
++	srax		%g3, 63, %l7		! next tweak value
++	addcc		%g2, %g2, %o2
++	and		%l7, 0x87, %l7
++	addxc		%g3, %g3, %o3
++	xor		%l7, %o2, %o2
++
++	movxtod		%o2, %f12
++	movxtod		%o3, %f14
++	bshuffle	%f12, %f12, %f12
++	bshuffle	%f14, %f14, %f14
++
++	xor		%g4, %o0, %o0		! ^= rk[0]
++	xor		%g5, %o1, %o1
++	movxtod		%o0, %f0
++	movxtod		%o1, %f2
++
++	fxor		%f12, %f0, %f0		! ^= tweak[0]
++	fxor		%f14, %f2, %f2
++
++	call		_${alg}${bits}_${dir}crypt_1x
++	add		$inp, 16, $inp
++
++	fxor		%f12, %f0, %f0		! ^= tweak[0]
++	fxor		%f14, %f2, %f2
++
++	std		%f0, [%fp + $::bias-16]
++	std		%f2, [%fp + $::bias-8]
++
++	srl		$ileft, 3, $ileft
++	add		%fp, $::bias-16, %l7
++	add		$inp, $ileft, $inp	! original $inp+$len&-15
++	add		$out, $ooff, $out	! original $out+$len&-15
++	mov		0, $ileft
++	add		$out, 16, $out
++	nop					! align
++
++.L${bits}_xts_${dir}stealing:
++	ldub		[$inp + $ileft], %o0
++	ldub		[%l7  + $ileft], %o1
++	dec		$rem
++	stb		%o0, [%l7  + $ileft]
++	stb		%o1, [$out + $ileft]
++	brnz		$rem, .L${bits}_xts_${dir}stealing
++	inc		$ileft
++
++	mov		%l7, $inp
++	sub		$out, 16, $out
++	mov		0, $ileft
++	sub		$out, $ooff, $out
++	ba		.L${bits}_xts_${dir}loop	! one more time
++	mov		1, $len				! $rem is 0
++___
++$code.=<<___;
++	ret
++	restore
++.type	${alg}${bits}_t4_xts_${dir}crypt,#function
++.size	${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
++___
++}
++
++# The purpose of these subroutines is to encode VIS instructions
++# explicitly, so that the module can be compiled without specifying VIS
++# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
++# The idea is to keep the option of producing a "universal" binary and
++# let the programmer detect at run-time whether the current CPU is VIS-capable.
++sub unvis {
++my ($mnemonic,$rs1,$rs2,$rd)=@_;
++my ($ref,$opf);
++my %visopf = (	"faligndata"	=> 0x048,
++		"bshuffle"	=> 0x04c,
++		"fnot2"		=> 0x066,
++		"fxor"		=> 0x06c,
++		"fsrc2"		=> 0x078	);
++
++    $ref = "$mnemonic\t$rs1,$rs2,$rd";
++
++    if ($opf=$visopf{$mnemonic}) {
++	foreach ($rs1,$rs2,$rd) {
++	    return $ref if (!/%f([0-9]{1,2})/);
++	    $_=$1;
++	    if ($1>=32) {
++		return $ref if ($1&1);
++		# re-encode for upper double register addressing
++		$_=($1|$1>>5)&31;
++	    }
++	}
++
++	return	sprintf ".word\t0x%08x !%s",
++			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
++			$ref;
++    } else {
++	return $ref;
++    }
++}
++
++sub unvis3 {
++my ($mnemonic,$rs1,$rs2,$rd)=@_;
++my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
++my ($ref,$opf);
++my %visopf = (	"addxc"		=> 0x011,
++		"addxccc"	=> 0x013,
++		"umulxhi"	=> 0x016,
++		"alignaddr"	=> 0x018,
++		"bmask"		=> 0x019,
++		"alignaddrl"	=> 0x01a	);
++
++    $ref = "$mnemonic\t$rs1,$rs2,$rd";
++
++    if ($opf=$visopf{$mnemonic}) {
++	foreach ($rs1,$rs2,$rd) {
++	    return $ref if (!/%([goli])([0-9])/);
++	    $_=$bias{$1}+$2;
++	}
++
++	return	sprintf ".word\t0x%08x !%s",
++			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
++			$ref;
++    } else {
++	return $ref;
++    }
++}
++
++sub unaes_round {	# 4-argument instructions
++my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
++my ($ref,$opf);
++my %aesopf = (	"aes_eround01"	=> 0,
++		"aes_eround23"	=> 1,
++		"aes_dround01"	=> 2,
++		"aes_dround23"	=> 3,
++		"aes_eround01_l"=> 4,
++		"aes_eround23_l"=> 5,
++		"aes_dround01_l"=> 6,
++		"aes_dround23_l"=> 7,
++		"aes_kexpand1"	=> 8	);
++
++    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
++
++    if (defined($opf=$aesopf{$mnemonic})) {
++	$rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
++	foreach ($rs1,$rs2,$rd) {
++	    return $ref if (!/%f([0-9]{1,2})/);
++	    $_=$1;
++	    if ($1>=32) {
++		return $ref if ($1&1);
++		# re-encode for upper double register addressing
++		$_=($1|$1>>5)&31;
++	    }
++	}
++
++	return	sprintf ".word\t0x%08x !%s",
++			2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
++			$ref;
++    } else {
++	return $ref;
++    }
++}
++
++sub unaes_kexpand {	# 3-argument instructions
++my ($mnemonic,$rs1,$rs2,$rd)=@_;
++my ($ref,$opf);
++my %aesopf = (	"aes_kexpand0"	=> 0x130,
++		"aes_kexpand2"	=> 0x131	);
++
++    $ref = "$mnemonic\t$rs1,$rs2,$rd";
++
++    if (defined($opf=$aesopf{$mnemonic})) {
++	foreach ($rs1,$rs2,$rd) {
++	    return $ref if (!/%f([0-9]{1,2})/);
++	    $_=$1;
++	    if ($1>=32) {
++		return $ref if ($1&1);
++		# re-encode for upper double register addressing
++		$_=($1|$1>>5)&31;
++	    }
++	}
++
++	return	sprintf ".word\t0x%08x !%s",
++			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
++			$ref;
++    } else {
++	return $ref;
++    }
++}
++
++sub uncamellia_f {	# 4-argument instructions
++my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
++my ($ref,$opf);
++
++    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
++
++    if (1) {
++	$rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
++	foreach ($rs1,$rs2,$rd) {
++	    return $ref if (!/%f([0-9]{1,2})/);
++	    $_=$1;
++	    if ($1>=32) {
++		return $ref if ($1&1);
++		# re-encode for upper double register addressing
++		$_=($1|$1>>5)&31;
++	    }
++	}
++
++	return	sprintf ".word\t0x%08x !%s",
++			2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
++			$ref;
++    } else {
++	return $ref;
++    }
++}
++
++sub uncamellia3 {	# 3-argument instructions
++my ($mnemonic,$rs1,$rs2,$rd)=@_;
++my ($ref,$opf);
++my %cmllopf = (	"camellia_fl"	=> 0x13c,
++		"camellia_fli"	=> 0x13d	);
++
++    $ref = "$mnemonic\t$rs1,$rs2,$rd";
++
++    if (defined($opf=$cmllopf{$mnemonic})) {
++	foreach ($rs1,$rs2,$rd) {
++	    return $ref if (!/%f([0-9]{1,2})/);
++	    $_=$1;
++	    if ($1>=32) {
++		return $ref if ($1&1);
++		# re-encode for upper double register addressing
++		$_=($1|$1>>5)&31;
++	    }
++	}
++
++	return	sprintf ".word\t0x%08x !%s",
++			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
++			$ref;
++    } else {
++	return $ref;
++    }
++}
++
++sub unmovxtox {		# 2-argument instructions
++my ($mnemonic,$rs,$rd)=@_;
++my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
++my ($ref,$opf);
++my %movxopf = (	"movdtox"	=> 0x110,
++		"movstouw"	=> 0x111,
++		"movstosw"	=> 0x113,
++		"movxtod"	=> 0x118,
++		"movwtos"	=> 0x119	);
++
++    $ref = "$mnemonic\t$rs,$rd";
++
++    if (defined($opf=$movxopf{$mnemonic})) {
++	foreach ($rs,$rd) {
++	    return $ref if (!/%([fgoli])([0-9]{1,2})/);
++	    $_=$bias{$1}+$2;
++	    if ($2>=32) {
++		return $ref if ($2&1);
++		# re-encode for upper double register addressing
++		$_=($2|$2>>5)&31;
++	    }
++	}
++
++	return	sprintf ".word\t0x%08x !%s",
++			2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
++			$ref;
++    } else {
++	return $ref;
++    }
++}
++
++sub undes {
++my ($mnemonic)=shift;
++my @args=@_;
++my ($ref,$opf);
++my %desopf = (	"des_round"	=> 0b1001,
++		"des_ip"	=> 0b100110100,
++		"des_iip"	=> 0b100110101,
++		"des_kexpand"	=> 0b100110110	);
++
++    $ref = "$mnemonic\t".join(",", at _);
++
++    if (defined($opf=$desopf{$mnemonic})) {	# 4-arg
++	if ($mnemonic eq "des_round") {
++	    foreach (@args[0..3]) {
++		return $ref if (!/%f([0-9]{1,2})/);
++		$_=$1;
++		if ($1>=32) {
++		    return $ref if ($1&1);
++		    # re-encode for upper double register addressing
++		    $_=($1|$1>>5)&31;
++		}
++	    }
++	    return  sprintf ".word\t0x%08x !%s",
++			    2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
++			    $ref;
++	} elsif ($mnemonic eq "des_kexpand") {	# 3-arg
++	    foreach (@args[0..2]) {
++		return $ref if (!/(%f)?([0-9]{1,2})/);
++		$_=$2;
++		if ($2>=32) {
++		    return $ref if ($2&1);
++		    # re-encode for upper double register addressing
++		    $_=($2|$2>>5)&31;
++		}
++	    }
++	    return  sprintf ".word\t0x%08x !%s",
++			    2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
++			    $ref;
++	} else {				# 2-arg
++	    foreach (@args[0..1]) {
++		return $ref if (!/%f([0-9]{1,2})/);
++		$_=$1;
++		if ($1>=32) {
++		    return $ref if ($1&1);
++		    # re-encode for upper double register addressing
++		    $_=($1|$1>>5)&31;
++		}
++	    }
++	    return  sprintf ".word\t0x%08x !%s",
++			    2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
++			    $ref;
++	}
++    } else {
++	return $ref;
++    }
++}
++
++sub emit_assembler {
++    foreach (split("\n",$::code)) {
++	s/\`([^\`]*)\`/eval $1/ge;
++
++	s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;
++
++	s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
++		&unaes_round($1,$2,$3,$4,$5)
++	 /geo or
++	s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
++		&unaes_kexpand($1,$2,$3,$4)
++	 /geo or
++	s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
++		&uncamellia_f($1,$2,$3,$4,$5)
++	 /geo or
++	s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
++		&uncamellia3($1,$2,$3,$4)
++	 /geo or
++	s/\b(des_\w+)\s+(?<rs1>%f[0-9]{1,2}),\s*(?<rs2>[%fx0-9]+)(,\s*(?<rs3>%f[0-9]{1,2})(,\s*(?<rs4>%f[0-9]{1,2}))?)?/
++		&undes($1,$+{rs1},$+{rs2},$+{rs3},$+{rs4})
++	 /geo or
++	s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
++		&unmovxtox($1,$2,$3)
++	 /geo or
++	s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
++		&unmovxtox($1,$2,$3)
++	 /geo or
++	s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
++		&unvis($1,$2,$3,$4)
++	 /geo or
++	s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
++		&unvis3($1,$2,$3,$4)
++	 /geo;
++
++	print $_,"\n";
++    }
++}
++
++1;
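
The unvis(), unvis3(), unaes_*, uncamellia* and undes() helpers above all
follow the same pattern: parse the register operands, look up a
per-mnemonic opf value, and emit a raw .word so that no VIS-capable
assembler flags are needed. A minimal standalone sketch of that encoding
for one VIS3 mnemonic, reusing the opf table and register biasing from
unvis3() above (illustrative only, not part of the patch; example
operands made up):

    #!/usr/bin/env perl
    use strict;
    use warnings;

    my %bias   = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
    my %visopf = ( "addxc" => 0x011, "umulxhi" => 0x016, "bmask" => 0x019 );

    sub encode_vis3 {
        my ($mnemonic, $rs1, $rs2, $rd) = @_;
        my $opf = $visopf{$mnemonic} or return "$mnemonic\t$rs1,$rs2,$rd";
        my @r;
        for ($rs1, $rs2, $rd) {
            /%([goli])([0-9])/ or return "$mnemonic\t$rs1,$rs2,$rd";
            push @r, $bias{$1} + $2;     # integer register number 0..31
        }
        # rd<<25 | rs1<<14 | opf<<5 | rs2, same layout as in unvis3()
        return sprintf ".word\t0x%08x\t! %s %s,%s,%s",
                       0x81b00000 | $r[2]<<25 | $r[0]<<14 | $opf<<5 | $r[1],
                       $mnemonic, $rs1, $rs2, $rd;
    }

    print encode_vis3("addxc", "%g2", "%g3", "%g3"), "\n";
    # prints: .word 0x87b08223 ! addxc %g2,%g3,%g3

This is the same fallback form that emit_assembler() substitutes into the
generated output when it rewrites those mnemonics.
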
+Index: crypto/bn/asm/vis3-mont.pl
+===================================================================
+diff -uNr openssl-1.0.1m/crypto/bn/asm/vis3-mont.pl openssl-1.0.1m/crypto/bn/asm/vis3-mont.pl
+--- openssl-1.0.1m/crypto/bn/asm/vis3-mont.pl 1970-01-01 01:00:00.000000000 +0100
++++ openssl-1.0.1m/crypto/bn/asm/vis3-mont.pl 2015-06-02 09:52:11.809148396 +0200
+@@ -0,0 +1,373 @@
++#!/usr/bin/env perl
++
++# ====================================================================
++# Written by Andy Polyakov <appro at openssl.org> for the OpenSSL
++# project. The module is, however, dual licensed under OpenSSL and
++# CRYPTOGAMS licenses depending on where you obtain it. For further
++# details see http://www.openssl.org/~appro/cryptogams/.
++# ====================================================================
++
++# October 2012.
++#
++# SPARCv9 VIS3 Montgomery multiplication procedure suitable for T3 and
++# onward. Three new instructions are used here: umulxhi,
++# addxc[cc] and the initializing store. On T3, RSA private key operations
++# are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key
++# lengths. This is without a dedicated squaring procedure. On T4 the
++# corresponding coefficients are 1.47/2.10/2.80/2.90x, which is mostly
++# for reference purposes, because T4 has dedicated Montgomery
++# multiplication and squaring *instructions* that deliver even more.
++
++$bits=32;
++for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
++if ($bits==64)  { $bias=2047; $frame=192; }
++else            { $bias=0;    $frame=112; }
++
++$code.=<<___ if ($bits==64);
++.register	%g2,#scratch
++.register	%g3,#scratch
++___
++$code.=<<___;
++.section	".text",#alloc,#execinstr
++___
++
++($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
++	(map("%g$_",(1..5)),map("%o$_",(0..5,7)));
++
++# int bn_mul_mont(
++$rp="%o0";	# BN_ULONG *rp,
++$ap="%o1";	# const BN_ULONG *ap,
++$bp="%o2";	# const BN_ULONG *bp,
++$np="%o3";	# const BN_ULONG *np,
++$n0p="%o4";	# const BN_ULONG *n0,
++$num="%o5";	# int num);	# caller ensures that num is even
++				# and >=6
++$code.=<<___;
++.globl	bn_mul_mont_vis3
++.align	32
++bn_mul_mont_vis3:
++	add	%sp,	$bias,	%g4	! real top of stack
++	sll	$num,	2,	$num	! size in bytes
++	add	$num,	63,	%g5
++	andn	%g5,	63,	%g5	! buffer size rounded up to 64 bytes
++	add	%g5,	%g5,	%g1
++	add	%g5,	%g1,	%g1	! 3*buffer size
++	sub	%g4,	%g1,	%g1
++	andn	%g1,	63,	%g1	! align at 64 byte
++	sub	%g1,	$frame,	%g1	! new top of stack
++	sub	%g1,	%g4,	%g1
++
++	save	%sp,	%g1,	%sp
++___
++
++#	+-------------------------------+<-----	%sp
++#	.				.
++#	+-------------------------------+<-----	aligned at 64 bytes
++#	| __int64 tmp[0]		|
++#	+-------------------------------+
++#	.				.
++#	.				.
++#	+-------------------------------+<----- aligned at 64 bytes
++#	| __int64 ap[1..0]		|	converted ap[]
++#	+-------------------------------+
++#	| __int64 np[1..0]		|	converted np[]
++#	+-------------------------------+
++#	| __int64 ap[3..2]		|
++#	.				.
++#	.				.
++#	+-------------------------------+
++($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
++($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$anp)=map("%l$_",(0..7));
++($ovf,$i)=($t0,$t1);
++$code.=<<___;
++	ld	[$n0p+0],	$t0	! pull n0[0..1] value
++	add	%sp, $bias+$frame, $tp
++	ld	[$n0p+4],	$t1
++	add	$tp,	%g5,	$anp
++	ld	[$bp+0],	$t2	! m0=bp[0]
++	sllx	$t1,	32,	$n0
++	ld	[$bp+4],	$t3
++	or	$t0,	$n0,	$n0
++	add	$bp,	8,	$bp
++
++	ld	[$ap+0],	$t0	! ap[0]
++	sllx	$t3,	32,	$m0
++	ld	[$ap+4],	$t1
++	or	$t2,	$m0,	$m0
++
++	ld	[$ap+8],	$t2	! ap[1]
++	sllx	$t1,	32,	$aj
++	ld	[$ap+12],	$t3
++	or	$t0,	$aj,	$aj
++	add	$ap,	16,	$ap
++	stxa	$aj,	[$anp]0xe2	! converted ap[0]
++
++	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
++	umulxhi	$aj,	$m0,	$hi0
++
++	ld	[$np+0],	$t0	! np[0]
++	sllx	$t3,	32,	$aj
++	ld	[$np+4],	$t1
++	or	$t2,	$aj,	$aj
++
++	ld	[$np+8],	$t2	! np[1]
++	sllx	$t1,	32,	$nj
++	ld	[$np+12],	$t3
++	or	$t0, $nj,	$nj
++	add	$np,	16,	$np
++	stx	$nj,	[$anp+8]	! converted np[0]
++
++	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0
++	stx	$aj,	[$anp+16]	! converted ap[1]
++
++	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
++	umulxhi	$aj,	$m0,	$aj	! ahi=aj
++
++	mulx	$nj,	$m1,	$lo1	! np[0]*m1
++	umulxhi	$nj,	$m1,	$hi1
++
++	sllx	$t3,	32,	$nj
++	or	$t2,	$nj,	$nj
++	stx	$nj,	[$anp+24]	! converted np[1]
++	add	$anp,	32,	$anp
++
++	addcc	$lo0,	$lo1,	$lo1
++	addxc	%g0,	$hi1,	$hi1
++
++	mulx	$nj,	$m1,	$nlo	! np[1]*m1
++	umulxhi	$nj,	$m1,	$nj	! nhi=nj
++
++	ba	.L1st
++	sub	$num,	24,	$cnt	! cnt=num-3
++
++.align	16
++.L1st:
++	ld	[$ap+0],	$t0	! ap[j]
++	addcc	$alo,	$hi0,	$lo0
++	ld	[$ap+4],	$t1
++	addxc	$aj,	%g0,	$hi0
++
++	sllx	$t1,	32,	$aj
++	add	$ap,	8,	$ap
++	or	$t0,	$aj,	$aj
++	stxa	$aj,	[$anp]0xe2	! converted ap[j]
++
++	ld	[$np+0],	$t2	! np[j]
++	addcc	$nlo,	$hi1,	$lo1
++	ld	[$np+4],	$t3
++	addxc	$nj,	%g0,	$hi1	! nhi=nj
++
++	sllx	$t3,	32,	$nj
++	add	$np,	8,	$np
++	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
++	or	$t2,	$nj,	$nj
++	umulxhi	$aj,	$m0,	$aj	! ahi=aj
++	stx	$nj,	[$anp+8]	! converted np[j]
++	add	$anp,	16,	$anp	! anp++
++

@@ Diff output truncated at 100000 characters. @@
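
The sequences commented "! next tweak value" in the XTS routines above
implement the standard XTS tweak update: the 128-bit tweak is multiplied
by x in GF(2^128), i.e. shifted left one bit, and the reduction constant
0x87 is xored into the low word when bit 127 was set (%g2 carries the low
64 bits, %g3 the high 64 bits). A standalone Perl sketch of that update,
illustrative only and not part of the patch (made-up tweak, 64-bit perl
assumed):

    #!/usr/bin/env perl
    use strict;
    use warnings;

    sub xts_next_tweak {
        my ($lo, $hi) = @_;                 # (%g2, %g3)
        my $poly  = ($hi >> 63) ? 0x87 : 0; # srax %g3, 63 ; and 0x87
        my $carry = ($lo >> 63) & 1;        # carry out of addcc %g2, %g2, %g2
        $lo = (($lo << 1) & 0xffffffffffffffff) ^ $poly;   # addcc + xor
        $hi = (($hi << 1) & 0xffffffffffffffff) | $carry;  # addxc %g3, %g3, %g3
        return ($lo, $hi);
    }

    my ($lo, $hi) = (0x0123456789abcdef, 0x8000000000000000);
    ($lo, $hi) = xts_next_tweak($lo, $hi);
    printf "next tweak: %016x:%016x\n", $hi, $lo;
    # prints: next tweak: 0000000000000000:02468acf13579b59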