SF.net SVN: gar:[25051] csw/mgar/pkg/openssl1/trunk/files
janholzh at users.sourceforge.net
Tue Jun 2 09:53:16 CEST 2015
Revision: 25051
http://sourceforge.net/p/gar/code/25051
Author: janholzh
Date: 2015-06-02 07:53:16 +0000 (Tue, 02 Jun 2015)
Log Message:
-----------
openssl1/trunk: Oracle moved patches around
Modified Paths:
--------------
csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-t4-engine.sparc.5.11.patch
csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-wanboot.patch
csw/mgar/pkg/openssl1/trunk/files/update-t4-patch.sh
csw/mgar/pkg/openssl1/trunk/files/update-wanboot-patch.sh
Modified: csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-t4-engine.sparc.5.11.patch
===================================================================
--- csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-t4-engine.sparc.5.11.patch 2015-06-02 07:41:26 UTC (rev 25050)
+++ csw/mgar/pkg/openssl1/trunk/files/openssl-1.0.1m-t4-engine.sparc.5.11.patch 2015-06-02 07:53:16 UTC (rev 25051)
@@ -2227,3 +2227,5563 @@
{ERR_FUNC(EVP_F_AES_XTS), "AES_XTS"},
{ERR_FUNC(EVP_F_AES_XTS_CIPHER), "AES_XTS_CIPHER"},
{ERR_FUNC(EVP_F_ALG_MODULE_INIT), "ALG_MODULE_INIT"},
+Index: crypto/sparc_arch.h
+===================================================================
+diff -uNr openssl-1.0.1m/crypto/sparc_arch.h openssl-1.0.1m/crypto/sparc_arch.h
+--- openssl-1.0.1m/crypto/sparc_arch.h 1970-01-01 01:00:00.000000000 +0100
++++ openssl-1.0.1m/crypto/sparc_arch.h 2015-06-02 09:52:11.809148396 +0200
+@@ -0,0 +1,101 @@
++#ifndef __SPARC_ARCH_H__
++#define __SPARC_ARCH_H__
++
++#define SPARCV9_TICK_PRIVILEGED (1<<0)
++#define SPARCV9_PREFER_FPU (1<<1)
++#define SPARCV9_VIS1 (1<<2)
++#define SPARCV9_VIS2 (1<<3) /* reserved */
++#define SPARCV9_FMADD (1<<4) /* reserved for SPARC64 V */
++#define SPARCV9_BLK (1<<5) /* VIS1 block copy */
++#define SPARCV9_VIS3 (1<<6)
++#define SPARCV9_RANDOM (1<<7)
++#define SPARCV9_64BIT_STACK (1<<8)
++
++/*
++ * OPENSSL_sparcv9cap_P[1] is copy of Compatibility Feature Register,
++ * %asr26, SPARC-T4 and later. There is no SPARCV9_CFR bit in
++ * OPENSSL_sparcv9cap_P[0], as %cfr copy is sufficient...
++ */
++#define CFR_AES 0x00000001 /* Supports AES opcodes */
++#define CFR_DES 0x00000002 /* Supports DES opcodes */
++#define CFR_KASUMI 0x00000004 /* Supports KASUMI opcodes */
++#define CFR_CAMELLIA 0x00000008 /* Supports CAMELLIA opcodes */
++#define CFR_MD5 0x00000010 /* Supports MD5 opcodes */
++#define CFR_SHA1 0x00000020 /* Supports SHA1 opcodes */
++#define CFR_SHA256 0x00000040 /* Supports SHA256 opcodes */
++#define CFR_SHA512 0x00000080 /* Supports SHA512 opcodes */
++#define CFR_MPMUL 0x00000100 /* Supports MPMUL opcodes */
++#define CFR_MONTMUL 0x00000200 /* Supports MONTMUL opcodes */
++#define CFR_MONTSQR 0x00000400 /* Supports MONTSQR opcodes */
++#define CFR_CRC32C 0x00000800 /* Supports CRC32C opcodes */
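++
++/*
++ * Illustrative use: T4 code paths are gated on these bits; e.g. the
++ * MD5 module tests (OPENSSL_sparcv9cap_P[1] & CFR_MD5) before taking
++ * its hardware path.
++ */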
++
++#if defined(OPENSSL_PIC) && !defined(__PIC__)
++#define __PIC__
++#endif
++
++#if defined(__SUNPRO_C) && defined(__sparcv9) && !defined(__arch64__)
++#define __arch64__
++#endif
++
++#define SPARC_PIC_THUNK(reg) \
++ .align 32; \
++.Lpic_thunk: \
++ jmp %o7 + 8; \
++ add %o7, reg, reg;
++
++#define SPARC_PIC_THUNK_CALL(reg) \
++ sethi %hi(_GLOBAL_OFFSET_TABLE_-4), reg; \
++ call .Lpic_thunk; \
++ or reg, %lo(_GLOBAL_OFFSET_TABLE_+4), reg;
++
++#if 1
++#define SPARC_SETUP_GOT_REG(reg) SPARC_PIC_THUNK_CALL(reg)
++#else
++#define SPARC_SETUP_GOT_REG(reg) \
++ sethi %hi(_GLOBAL_OFFSET_TABLE_-4), reg; \
++ call .+8; \
++ or reg, %lo(_GLOBAL_OFFSET_TABLE_+4), reg; \
++ add %o7, reg, reg
++#endif
++
++#if defined(__arch64__)
++
++#define SPARC_LOAD_ADDRESS(SYM, reg) \
++ setx SYM, %o7, reg;
++#define LDPTR ldx
++#define SIZE_T_CC %xcc
++#define STACK_FRAME 192
++#define STACK_BIAS 2047
++#define STACK_7thARG (STACK_BIAS+176)
++
++#else
++
++#define SPARC_LOAD_ADDRESS(SYM, reg) \
++ set SYM, reg;
++#define LDPTR ld
++#define SIZE_T_CC %icc
++#define STACK_FRAME 112
++#define STACK_BIAS 0
++#define STACK_7thARG 92
++#define SPARC_LOAD_ADDRESS_LEAF(SYM, reg, tmp) SPARC_LOAD_ADDRESS(SYM, reg)
++
++#endif
++
++#ifdef __PIC__
++#undef SPARC_LOAD_ADDRESS
++#undef SPARC_LOAD_ADDRESS_LEAF
++#define SPARC_LOAD_ADDRESS(SYM, reg) \
++ SPARC_SETUP_GOT_REG(reg); \
++ sethi %hi(SYM), %o7; \
++ or %o7, %lo(SYM), %o7; \
++ LDPTR [reg + %o7], reg;
++#endif
++
++#ifndef SPARC_LOAD_ADDRESS_LEAF
++#define SPARC_LOAD_ADDRESS_LEAF(SYM, reg, tmp) \
++ mov %o7, tmp; \
++ SPARC_LOAD_ADDRESS(SYM, reg) \
++ mov tmp, %o7;
++#endif
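++
++/*
++ * For example, in the absolute (non-PIC) case SPARC_LOAD_ADDRESS(SYM, reg)
++ * is a single set/setx of SYM into reg, while under __PIC__ it sets up
++ * the GOT pointer, forms SYM's GOT offset in %o7 and LDPTRs the final
++ * address from [reg + %o7]; the _LEAF variant also preserves %o7 around
++ * the thunk call so leaf routines stay leaf-safe.
++ */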
++
++#endif /* __SPARC_ARCH_H__ */
+Index: crypto/md5/asm/md5-sparcv9.pl
+===================================================================
+diff -uNr openssl-1.0.1m/crypto/md5/asm/md5-sparcv9.pl openssl-1.0.1m/crypto/md5/asm/md5-sparcv9.pl
+--- openssl-1.0.1m/crypto/md5/asm/md5-sparcv9.pl 1970-01-01 01:00:00.000000000 +0100
++++ openssl-1.0.1m/crypto/md5/asm/md5-sparcv9.pl 2015-06-02 09:52:11.809148396 +0200
+@@ -0,0 +1,434 @@
++#!/usr/bin/env perl
++
++# ====================================================================
++# Written by Andy Polyakov <appro at openssl.org> for the OpenSSL
++# project. The module is, however, dual licensed under OpenSSL and
++# CRYPTOGAMS licenses depending on where you obtain it. For further
++# details see http://www.openssl.org/~appro/cryptogams/.
++#
++# Hardware SPARC T4 support by David S. Miller <davem at davemloft.net>.
++# ====================================================================
++
++# MD5 for SPARCv9, 6.9 cycles per byte on UltraSPARC, >40% faster than
++# code generated by Sun C 5.2.
++
++# SPARC T4 MD5 hardware achieves 3.20 cycles per byte, which is 2.1x
++# faster than software. Multi-process benchmark saturates at 12x
++# single-process result on 8-core processor, or ~11GBps per 2.85GHz
++# socket.
++
++$bits=32;
++for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
++if ($bits==64) { $bias=2047; $frame=192; }
++else { $bias=0; $frame=112; }
++
++$output=shift;
++open STDOUT,">$output";
++
++use integer;
++
++($ctx,$inp,$len)=("%i0","%i1","%i2"); # input arguments
++
++# 64-bit values
++@X=("%o0","%o1","%o2","%o3","%o4","%o5","%o7","%g1","%g2");
++$tx="%g3";
++($AB,$CD)=("%g4","%g5");
++
++# 32-bit values
++@V=($A,$B,$C,$D)=map("%l$_",(0..3));
++($t1,$t2,$t3,$saved_asi)=map("%l$_",(4..7));
++($shr,$shl1,$shl2)=("%i3","%i4","%i5");
++
++my @K=( 0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
++ 0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
++ 0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
++ 0x6b901122,0xfd987193,0xa679438e,0x49b40821,
++
++ 0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
++ 0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
++ 0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
++ 0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,
++
++ 0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
++ 0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
++ 0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
++ 0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,
++
++ 0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
++ 0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
++ 0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
++ 0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391, 0 );
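++# (These are the standard MD5 constants, K[i] = floor(abs(sin(i+1)) * 2^32);
++# the trailing 0 backs the @K[$i+1] look-ahead on the final round.)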
++
++sub R0 {
++ my ($i,$a,$b,$c,$d) = @_;
++ my $rot = (7,12,17,22)[$i%4];
++ my $j = ($i+1)/2;
++
++ if ($i&1) {
++ $code.=<<___;
++ srlx @X[$j],$shr,@X[$j] ! align X[`$i+1`]
++ and $b,$t1,$t1 ! round $i
++ sllx @X[$j+1],$shl1,$tx
++ add $t2,$a,$a
++ sllx $tx,$shl2,$tx
++ xor $d,$t1,$t1
++ or $tx,@X[$j],@X[$j]
++ sethi %hi(@K[$i+1]),$t2
++ add $t1,$a,$a
++ or $t2,%lo(@K[$i+1]),$t2
++ sll $a,$rot,$t3
++ add @X[$j],$t2,$t2 ! X[`$i+1`]+K[`$i+1`]
++ srl $a,32-$rot,$a
++ add $b,$t3,$t3
++ xor $b,$c,$t1
++ add $t3,$a,$a
++___
++ } else {
++ $code.=<<___;
++ srlx @X[$j],32,$tx ! extract X[`2*$j+1`]
++ and $b,$t1,$t1 ! round $i
++ add $t2,$a,$a
++ xor $d,$t1,$t1
++ sethi %hi(@K[$i+1]),$t2
++ add $t1,$a,$a
++ or $t2,%lo(@K[$i+1]),$t2
++ sll $a,$rot,$t3
++ add $tx,$t2,$t2 ! X[`2*$j+1`]+K[`$i+1`]
++ srl $a,32-$rot,$a
++ add $b,$t3,$t3
++ xor $b,$c,$t1
++ add $t3,$a,$a
++___
++ }
++}
++
++sub R0_1 {
++ my ($i,$a,$b,$c,$d) = @_;
++ my $rot = (7,12,17,22)[$i%4];
++
++$code.=<<___;
++ srlx @X[0],32,$tx ! extract X[1]
++ and $b,$t1,$t1 ! round $i
++ add $t2,$a,$a
++ xor $d,$t1,$t1
++ sethi %hi(@K[$i+1]),$t2
++ add $t1,$a,$a
++ or $t2,%lo(@K[$i+1]),$t2
++ sll $a,$rot,$t3
++ add $tx,$t2,$t2 ! X[1]+K[`$i+1`]
++ srl $a,32-$rot,$a
++ add $b,$t3,$t3
++ andn $b,$c,$t1
++ add $t3,$a,$a
++___
++}
++
++sub R1 {
++ my ($i,$a,$b,$c,$d) = @_;
++ my $rot = (5,9,14,20)[$i%4];
++ my $j = $i<31 ? (1+5*($i+1))%16 : (5+3*($i+1))%16;
++ my $xi = @X[$j/2];
++
++$code.=<<___ if ($j&1 && ($xi=$tx));
++ srlx @X[$j/2],32,$xi ! extract X[$j]
++___
++$code.=<<___;
++ and $b,$d,$t3 ! round $i
++ add $t2,$a,$a
++ or $t3,$t1,$t1
++ sethi %hi(@K[$i+1]),$t2
++ add $t1,$a,$a
++ or $t2,%lo(@K[$i+1]),$t2
++ sll $a,$rot,$t3
++ add $xi,$t2,$t2 ! X[$j]+K[`$i+1`]
++ srl $a,32-$rot,$a
++ add $b,$t3,$t3
++ `$i<31?"andn":"xor"` $b,$c,$t1
++ add $t3,$a,$a
++___
++}
++
++sub R2 {
++ my ($i,$a,$b,$c,$d) = @_;
++ my $rot = (4,11,16,23)[$i%4];
++ my $j = $i<47 ? (5+3*($i+1))%16 : (0+7*($i+1))%16;
++ my $xi = @X[$j/2];
++
++$code.=<<___ if ($j&1 && ($xi=$tx));
++ srlx @X[$j/2],32,$xi ! extract X[$j]
++___
++$code.=<<___;
++ add $t2,$a,$a ! round $i
++ xor $b,$t1,$t1
++ sethi %hi(@K[$i+1]),$t2
++ add $t1,$a,$a
++ or $t2,%lo(@K[$i+1]),$t2
++ sll $a,$rot,$t3
++ add $xi,$t2,$t2 ! X[$j]+K[`$i+1`]
++ srl $a,32-$rot,$a
++ add $b,$t3,$t3
++ xor $b,$c,$t1
++ add $t3,$a,$a
++___
++}
++
++sub R3 {
++ my ($i,$a,$b,$c,$d) = @_;
++ my $rot = (6,10,15,21)[$i%4];
++ my $j = (0+7*($i+1))%16;
++ my $xi = @X[$j/2];
++
++$code.=<<___;
++ add $t2,$a,$a ! round $i
++___
++$code.=<<___ if ($j&1 && ($xi=$tx));
++ srlx @X[$j/2],32,$xi ! extract X[$j]
++___
++$code.=<<___;
++ orn $b,$d,$t1
++ sethi %hi(@K[$i+1]),$t2
++ xor $c,$t1,$t1
++ or $t2,%lo(@K[$i+1]),$t2
++ add $t1,$a,$a
++ sll $a,$rot,$t3
++ add $xi,$t2,$t2 ! X[$j]+K[`$i+1`]
++ srl $a,32-$rot,$a
++ add $b,$t3,$t3
++ add $t3,$a,$a
++___
++}
++
++$code.=<<___ if ($bits==64);
++.register %g2,#scratch
++.register %g3,#scratch
++___
++$code.=<<___;
++#include "sparc_arch.h"
++
++.section ".text",#alloc,#execinstr
++
++#ifdef __PIC__
++SPARC_PIC_THUNK(%g1)
++#endif
++
++.globl md5_block_asm_data_order
++.align 32
++md5_block_asm_data_order:
++ SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
++ ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]
++
++ andcc %g1, CFR_MD5, %g0
++ be .Lsoftware
++ nop
++
++ mov 4, %g1
++ andcc %o1, 0x7, %g0
++ lda [%o0 + %g0]0x88, %f0 ! load context
++ lda [%o0 + %g1]0x88, %f1
++ add %o0, 8, %o0
++ lda [%o0 + %g0]0x88, %f2
++ lda [%o0 + %g1]0x88, %f3
++ bne,pn %icc, .Lhwunaligned
++ sub %o0, 8, %o0
++
++.Lhw_loop:
++ ldd [%o1 + 0x00], %f8
++ ldd [%o1 + 0x08], %f10
++ ldd [%o1 + 0x10], %f12
++ ldd [%o1 + 0x18], %f14
++ ldd [%o1 + 0x20], %f16
++ ldd [%o1 + 0x28], %f18
++ ldd [%o1 + 0x30], %f20
++ subcc %o2, 1, %o2 ! done yet?
++ ldd [%o1 + 0x38], %f22
++ add %o1, 0x40, %o1
++ prefetch [%o1 + 63], 20
++
++ .word 0x81b02800 ! MD5
++
++ bne,pt `$bits==64?"%xcc":"%icc"`, .Lhw_loop
++ nop
++
++.Lhwfinish:
++ sta %f0, [%o0 + %g0]0x88 ! store context
++ sta %f1, [%o0 + %g1]0x88
++ add %o0, 8, %o0
++ sta %f2, [%o0 + %g0]0x88
++ sta %f3, [%o0 + %g1]0x88
++ retl
++ nop
++
++.align 8
++.Lhwunaligned:
++ alignaddr %o1, %g0, %o1
++
++ ldd [%o1 + 0x00], %f10
++.Lhwunaligned_loop:
++ ldd [%o1 + 0x08], %f12
++ ldd [%o1 + 0x10], %f14
++ ldd [%o1 + 0x18], %f16
++ ldd [%o1 + 0x20], %f18
++ ldd [%o1 + 0x28], %f20
++ ldd [%o1 + 0x30], %f22
++ ldd [%o1 + 0x38], %f24
++ subcc %o2, 1, %o2 ! done yet?
++ ldd [%o1 + 0x40], %f26
++ add %o1, 0x40, %o1
++ prefetch [%o1 + 63], 20
++
++ faligndata %f10, %f12, %f8
++ faligndata %f12, %f14, %f10
++ faligndata %f14, %f16, %f12
++ faligndata %f16, %f18, %f14
++ faligndata %f18, %f20, %f16
++ faligndata %f20, %f22, %f18
++ faligndata %f22, %f24, %f20
++ faligndata %f24, %f26, %f22
++
++ .word 0x81b02800 ! MD5
++
++ bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
++ for %f26, %f26, %f10 ! %f10=%f26
++
++ ba .Lhwfinish
++ nop
++
++.align 16
++.Lsoftware:
++ save %sp,-$frame,%sp
++
++ rd %asi,$saved_asi
++ wr %g0,0x88,%asi ! ASI_PRIMARY_LITTLE
++ and $inp,7,$shr
++ andn $inp,7,$inp
++
++ sll $shr,3,$shr ! *=8
++ mov 56,$shl2
++ ld [$ctx+0],$A
++ sub $shl2,$shr,$shl2
++ ld [$ctx+4],$B
++ and $shl2,32,$shl1
++ add $shl2,8,$shl2
++ ld [$ctx+8],$C
++ sub $shl2,$shl1,$shl2 ! shr+shl1+shl2==64
++ ld [$ctx+12],$D
++ nop
++
++.Loop:
++ cmp $shr,0 ! was inp aligned?
++ ldxa [$inp+0]%asi,@X[0] ! load little-endian input
++ ldxa [$inp+8]%asi,@X[1]
++ ldxa [$inp+16]%asi,@X[2]
++ ldxa [$inp+24]%asi,@X[3]
++ ldxa [$inp+32]%asi,@X[4]
++ sllx $A,32,$AB ! pack A,B
++ ldxa [$inp+40]%asi,@X[5]
++ sllx $C,32,$CD ! pack C,D
++ ldxa [$inp+48]%asi,@X[6]
++ or $B,$AB,$AB
++ ldxa [$inp+56]%asi,@X[7]
++ or $D,$CD,$CD
++ bnz,a,pn %icc,.+8
++ ldxa [$inp+64]%asi,@X[8]
++
++ srlx @X[0],$shr,@X[0] ! align X[0]
++ sllx @X[1],$shl1,$tx
++ sethi %hi(@K[0]),$t2
++ sllx $tx,$shl2,$tx
++ or $t2,%lo(@K[0]),$t2
++ or $tx,@X[0],@X[0]
++ xor $C,$D,$t1
++ add @X[0],$t2,$t2 ! X[0]+K[0]
++___
++ for ($i=0;$i<15;$i++) { &R0($i, at V); unshift(@V,pop(@V)); }
++ for (;$i<16;$i++) { &R0_1($i, at V); unshift(@V,pop(@V)); }
++ for (;$i<32;$i++) { &R1($i, at V); unshift(@V,pop(@V)); }
++ for (;$i<48;$i++) { &R2($i, at V); unshift(@V,pop(@V)); }
++ for (;$i<64;$i++) { &R3($i, at V); unshift(@V,pop(@V)); }
++$code.=<<___;
++ srlx $AB,32,$t1 ! unpack A,B,C,D and accumulate
++ add $inp,64,$inp ! advance inp
++ srlx $CD,32,$t2
++ add $t1,$A,$A
++ subcc $len,1,$len ! done yet?
++ add $AB,$B,$B
++ add $t2,$C,$C
++ add $CD,$D,$D
++ srl $B,0,$B ! clruw $B
++ bne `$bits==64?"%xcc":"%icc"`,.Loop
++ srl $D,0,$D ! clruw $D
++
++ st $A,[$ctx+0] ! write out ctx
++ st $B,[$ctx+4]
++ st $C,[$ctx+8]
++ st $D,[$ctx+12]
++
++ wr %g0,$saved_asi,%asi
++ ret
++ restore
++.type md5_block_asm_data_order,#function
++.size md5_block_asm_data_order,(.-md5_block_asm_data_order)
++
++.asciz "MD5 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
++.align 4
++___
++
++# The purpose of these subroutines is to explicitly encode VIS
++# instructions, so that the module can be compiled without having to
++# specify VIS extensions on the compiler command line, e.g. -xarch=v9
++# vs. -xarch=v9a. The idea is to keep open the option of producing a
++# "universal" binary and let the program detect at run-time whether
++# the current CPU is VIS-capable.
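++#
++# For instance, per the opcode table below, "faligndata %f0,%f2,%f4"
++# (opf 0x048) is emitted as .word 0x81b00000|4<<25|0<<14|0x048<<5|2,
++# i.e. 0x89b00902.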
++sub unvis {
++my ($mnemonic,$rs1,$rs2,$rd)=@_;
++my ($ref,$opf);
++my %visopf = ( "faligndata" => 0x048,
++ "for" => 0x07c );
++
++ $ref = "$mnemonic\t$rs1,$rs2,$rd";
++
++ if ($opf=$visopf{$mnemonic}) {
++ foreach ($rs1,$rs2,$rd) {
++ return $ref if (!/%f([0-9]{1,2})/);
++ $_=$1;
++ if ($1>=32) {
++ return $ref if ($1&1);
++ # re-encode for upper double register addressing
++ $_=($1|$1>>5)&31;
++ }
++ }
++
++ return sprintf ".word\t0x%08x !%s",
++ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
++ $ref;
++ } else {
++ return $ref;
++ }
++}
++sub unalignaddr {
++my ($mnemonic,$rs1,$rs2,$rd)=@_;
++my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
++my $ref="$mnemonic\t$rs1,$rs2,$rd";
++
++ foreach ($rs1,$rs2,$rd) {
++ if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
++ else { return $ref; }
++ }
++ return sprintf ".word\t0x%08x !%s",
++ 0x81b00300|$rd<<25|$rs1<<14|$rs2,
++ $ref;
++}
++
++foreach (split("\n",$code)) {
++ s/\`([^\`]*)\`/eval $1/ge;
++
++ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
++ &unvis($1,$2,$3,$4)
++ /ge;
++ s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
++ &unalignaddr($1,$2,$3,$4)
++ /ge;
++
++ print $_,"\n";
++}
++
++close STDOUT;
+Index: crypto/aes/asm/aest4-sparcv9.pl
+===================================================================
+diff -uNr openssl-1.0.1m/crypto/aes/asm/aest4-sparcv9.pl openssl-1.0.1m/crypto/aes/asm/aest4-sparcv9.pl
+--- openssl-1.0.1m/crypto/aes/asm/aest4-sparcv9.pl 1970-01-01 01:00:00.000000000 +0100
++++ openssl-1.0.1m/crypto/aes/asm/aest4-sparcv9.pl 2015-06-02 09:52:11.809148396 +0200
+@@ -0,0 +1,902 @@
++#!/usr/bin/env perl
++
++# ====================================================================
++# Written by David S. Miller <davem at devemloft.net> and Andy Polyakov
++# <appro at openssl.org>. The module is licensed under 2-clause BSD
++# license. October 2012. All rights reserved.
++# ====================================================================
++
++######################################################################
++# AES for SPARC T4.
++#
++# AES round instructions complete in 3 cycles and can be issued every
++# cycle. This means that the round calculations should take 4*rounds
++# cycles, because any given round instruction depends on the results
++# of *both* previous instructions:
++#
++# |0 |1 |2 |3 |4
++# |01|01|01|
++# |  |23|23|23|
++# |           |01|01|...
++# |              |23|...
++#
++# Provided that fxor [with IV] takes 3 cycles to complete, the
++# critical path length for CBC encrypt would be 3+4*rounds; in other
++# words, it should process one byte in at least (3+4*rounds)/16
++# cycles. This estimate doesn't account for "collateral" instructions,
++# such as fetching input from memory, xor-ing it with the zero-round
++# key and storing the result. Yet, *measured* performance [for data
++# aligned at a 64-bit boundary!] deviates from this equation by less
++# than 0.5%:
++#
++#                 128-bit key    192-         256-
++# CBC encrypt     2.70/2.90(*)   3.20/3.40    3.70/3.90
++#                 (*) numbers after slash are for
++#                     misaligned data.
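++#
++# (Worked check: AES-128 has 10 rounds, so the estimate above gives
++# (3+4*10)/16 = 43/16 ~ 2.69 cycles per byte against the measured 2.70.)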
++#
++# Out-of-order execution logic managed to fully overlap "collateral"
++# instructions with those on critical path. Amazing!
++#
++# As with Intel AES-NI, the question is whether it's possible to
++# improve performance of parallelizable modes by interleaving round
++# instructions. Given round-instruction latency and throughput, the
++# optimal interleave factor is 2. But can we expect a 2x performance
++# improvement? Well, as round instructions can be issued one per
++# cycle, they don't saturate the 2-way issue pipeline and therefore
++# there is room for "collateral" calculations... Yet, a 2x speed-up
++# over CBC encrypt remains unattainable:
++#
++#                 128-bit key    192-         256-
++# CBC decrypt     1.64/2.11      1.89/2.37    2.23/2.61
++# CTR             1.64/2.08(*)   1.89/2.33    2.23/2.61
++#                 (*) numbers after slash are for
++#                     misaligned data.
++#
++# Instruction-count estimates, made under the assumption that round
++# instructions are not pairable with any other instruction, suggest
++# that the latter is indeed the case and the pipeline runs
++# underutilized. It should be noted that the T4 out-of-order execution
++# logic is so capable that the performance gain from 2x interleave is
++# not even impressive, ~7-13% over non-interleaved code, largest
++# for 256-bit keys.
++
++# To anchor to something else, the software implementation processes
++# one byte in 29 cycles with a 128-bit key on the same processor.
++# Intel Sandy Bridge encrypts a byte in 5.07 cycles in CBC mode and
++# decrypts one in 0.93, naturally with AES-NI.
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++push(@INC,"${dir}","${dir}../../perlasm");
++require "sparcv9_modes.pl";
++
++&asm_init(@ARGV);
++
++$::evp=1; # if $evp is set to 0, the script generates a module with
++# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
++# points. These, however, are not fully compatible with openssl/aes.h,
++# because they expect AES_KEY to be aligned at a 64-bit boundary. When
++# used through EVP, alignment is arranged at the EVP layer, which also
++# arranges at least 32-bit alignment of the IV.
++
++######################################################################
++# single-round subroutines
++#
++{
++my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
++
++$code=<<___;
++.text
++
++.globl aes_t4_encrypt
++.align 32
++aes_t4_encrypt:
++ andcc $inp, 7, %g1 ! is input aligned?
++ andn $inp, 7, $inp
++
++ ldx [$key + 0], %g4
++ ldx [$key + 8], %g5
++
++ ldx [$inp + 0], %o4
++ bz,pt %icc, 1f
++ ldx [$inp + 8], %o5
++ ldx [$inp + 16], $inp
++ sll %g1, 3, %g1
++ sub %g0, %g1, %o3
++ sllx %o4, %g1, %o4
++ sllx %o5, %g1, %g1
++ srlx %o5, %o3, %o5
++ srlx $inp, %o3, %o3
++ or %o5, %o4, %o4
++ or %o3, %g1, %o5
++1:
++ ld [$key + 240], $rounds
++ ldd [$key + 16], %f12
++ ldd [$key + 24], %f14
++ xor %g4, %o4, %o4
++ xor %g5, %o5, %o5
++ movxtod %o4, %f0
++ movxtod %o5, %f2
++ srl $rounds, 1, $rounds
++ ldd [$key + 32], %f16
++ sub $rounds, 1, $rounds
++ ldd [$key + 40], %f18
++ add $key, 48, $key
++
++.Lenc:
++ aes_eround01 %f12, %f0, %f2, %f4
++ aes_eround23 %f14, %f0, %f2, %f2
++ ldd [$key + 0], %f12
++ ldd [$key + 8], %f14
++ sub $rounds,1,$rounds
++ aes_eround01 %f16, %f4, %f2, %f0
++ aes_eround23 %f18, %f4, %f2, %f2
++ ldd [$key + 16], %f16
++ ldd [$key + 24], %f18
++ brnz,pt $rounds, .Lenc
++ add $key, 32, $key
++
++ andcc $out, 7, $tmp ! is output aligned?
++ aes_eround01 %f12, %f0, %f2, %f4
++ aes_eround23 %f14, %f0, %f2, %f2
++ aes_eround01_l %f16, %f4, %f2, %f0
++ aes_eround23_l %f18, %f4, %f2, %f2
++
++ bnz,pn %icc, 2f
++ nop
++
++ std %f0, [$out + 0]
++ retl
++ std %f2, [$out + 8]
++
++2: alignaddrl $out, %g0, $out
++ mov 0xff, $mask
++ srl $mask, $tmp, $mask
++
++ faligndata %f0, %f0, %f4
++ faligndata %f0, %f2, %f6
++ faligndata %f2, %f2, %f8
++
++ stda %f4, [$out + $mask]0xc0 ! partial store
++ std %f6, [$out + 8]
++ add $out, 16, $out
++ orn %g0, $mask, $mask
++ retl
++ stda %f8, [$out + $mask]0xc0 ! partial store
++.type aes_t4_encrypt,#function
++.size aes_t4_encrypt,.-aes_t4_encrypt
++
++.globl aes_t4_decrypt
++.align 32
++aes_t4_decrypt:
++ andcc $inp, 7, %g1 ! is input aligned?
++ andn $inp, 7, $inp
++
++ ldx [$key + 0], %g4
++ ldx [$key + 8], %g5
++
++ ldx [$inp + 0], %o4
++ bz,pt %icc, 1f
++ ldx [$inp + 8], %o5
++ ldx [$inp + 16], $inp
++ sll %g1, 3, %g1
++ sub %g0, %g1, %o3
++ sllx %o4, %g1, %o4
++ sllx %o5, %g1, %g1
++ srlx %o5, %o3, %o5
++ srlx $inp, %o3, %o3
++ or %o5, %o4, %o4
++ or %o3, %g1, %o5
++1:
++ ld [$key + 240], $rounds
++ ldd [$key + 16], %f12
++ ldd [$key + 24], %f14
++ xor %g4, %o4, %o4
++ xor %g5, %o5, %o5
++ movxtod %o4, %f0
++ movxtod %o5, %f2
++ srl $rounds, 1, $rounds
++ ldd [$key + 32], %f16
++ sub $rounds, 1, $rounds
++ ldd [$key + 40], %f18
++ add $key, 48, $key
++
++.Ldec:
++ aes_dround01 %f12, %f0, %f2, %f4
++ aes_dround23 %f14, %f0, %f2, %f2
++ ldd [$key + 0], %f12
++ ldd [$key + 8], %f14
++ sub $rounds,1,$rounds
++ aes_dround01 %f16, %f4, %f2, %f0
++ aes_dround23 %f18, %f4, %f2, %f2
++ ldd [$key + 16], %f16
++ ldd [$key + 24], %f18
++ brnz,pt $rounds, .Ldec
++ add $key, 32, $key
++
++ andcc $out, 7, $tmp ! is output aligned?
++ aes_dround01 %f12, %f0, %f2, %f4
++ aes_dround23 %f14, %f0, %f2, %f2
++ aes_dround01_l %f16, %f4, %f2, %f0
++ aes_dround23_l %f18, %f4, %f2, %f2
++
++ bnz,pn %icc, 2f
++ nop
++
++ std %f0, [$out + 0]
++ retl
++ std %f2, [$out + 8]
++
++2: alignaddrl $out, %g0, $out
++ mov 0xff, $mask
++ srl $mask, $tmp, $mask
++
++ faligndata %f0, %f0, %f4
++ faligndata %f0, %f2, %f6
++ faligndata %f2, %f2, %f8
++
++ stda %f4, [$out + $mask]0xc0 ! partial store
++ std %f6, [$out + 8]
++ add $out, 16, $out
++ orn %g0, $mask, $mask
++ retl
++ stda %f8, [$out + $mask]0xc0 ! partial store
++.type aes_t4_decrypt,#function
++.size aes_t4_decrypt,.-aes_t4_decrypt
++___
++}
++
++######################################################################
++# key setup subroutines
++#
++{
++my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
++$code.=<<___;
++.globl aes_t4_set_encrypt_key
++.align 32
++aes_t4_set_encrypt_key:
++.Lset_encrypt_key:
++ and $inp, 7, $tmp
++ alignaddr $inp, %g0, $inp
++ cmp $bits, 192
++ ldd [$inp + 0], %f0
++ bl,pt %icc,.L128
++ ldd [$inp + 8], %f2
++
++ be,pt %icc,.L192
++ ldd [$inp + 16], %f4
++ brz,pt $tmp, .L256aligned
++ ldd [$inp + 24], %f6
++
++ ldd [$inp + 32], %f8
++ faligndata %f0, %f2, %f0
++ faligndata %f2, %f4, %f2
++ faligndata %f4, %f6, %f4
++ faligndata %f6, %f8, %f6
++.L256aligned:
++___
++for ($i=0; $i<6; $i++) {
++ $code.=<<___;
++ std %f0, [$out + `32*$i+0`]
++ aes_kexpand1 %f0, %f6, $i, %f0
++ std %f2, [$out + `32*$i+8`]
++ aes_kexpand2 %f2, %f0, %f2
++ std %f4, [$out + `32*$i+16`]
++ aes_kexpand0 %f4, %f2, %f4
++ std %f6, [$out + `32*$i+24`]
++ aes_kexpand2 %f6, %f4, %f6
++___
++}
++$code.=<<___;
++ std %f0, [$out + `32*$i+0`]
++ aes_kexpand1 %f0, %f6, $i, %f0
++ std %f2, [$out + `32*$i+8`]
++ aes_kexpand2 %f2, %f0, %f2
++ std %f4, [$out + `32*$i+16`]
++ std %f6, [$out + `32*$i+24`]
++ std %f0, [$out + `32*$i+32`]
++ std %f2, [$out + `32*$i+40`]
++
++ mov 14, $tmp
++ st $tmp, [$out + 240]
++ retl
++ xor %o0, %o0, %o0
++
++.align 16
++.L192:
++ brz,pt $tmp, .L192aligned
++ nop
++
++ ldd [$inp + 24], %f6
++ faligndata %f0, %f2, %f0
++ faligndata %f2, %f4, %f2
++ faligndata %f4, %f6, %f4
++.L192aligned:
++___
++for ($i=0; $i<7; $i++) {
++ $code.=<<___;
++ std %f0, [$out + `24*$i+0`]
++ aes_kexpand1 %f0, %f4, $i, %f0
++ std %f2, [$out + `24*$i+8`]
++ aes_kexpand2 %f2, %f0, %f2
++ std %f4, [$out + `24*$i+16`]
++ aes_kexpand2 %f4, %f2, %f4
++___
++}
++$code.=<<___;
++ std %f0, [$out + `24*$i+0`]
++ aes_kexpand1 %f0, %f4, $i, %f0
++ std %f2, [$out + `24*$i+8`]
++ aes_kexpand2 %f2, %f0, %f2
++ std %f4, [$out + `24*$i+16`]
++ std %f0, [$out + `24*$i+24`]
++ std %f2, [$out + `24*$i+32`]
++
++ mov 12, $tmp
++ st $tmp, [$out + 240]
++ retl
++ xor %o0, %o0, %o0
++
++.align 16
++.L128:
++ brz,pt $tmp, .L128aligned
++ nop
++
++ ldd [$inp + 16], %f4
++ faligndata %f0, %f2, %f0
++ faligndata %f2, %f4, %f2
++.L128aligned:
++___
++for ($i=0; $i<10; $i++) {
++ $code.=<<___;
++ std %f0, [$out + `16*$i+0`]
++ aes_kexpand1 %f0, %f2, $i, %f0
++ std %f2, [$out + `16*$i+8`]
++ aes_kexpand2 %f2, %f0, %f2
++___
++}
++$code.=<<___;
++ std %f0, [$out + `16*$i+0`]
++ std %f2, [$out + `16*$i+8`]
++
++ mov 10, $tmp
++ st $tmp, [$out + 240]
++ retl
++ xor %o0, %o0, %o0
++.type aes_t4_set_encrypt_key,#function
++.size aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key
++
++.globl aes_t4_set_decrypt_key
++.align 32
++aes_t4_set_decrypt_key:
++ mov %o7, %o5
++ call .Lset_encrypt_key
++ nop
++
++ mov %o5, %o7
++ sll $tmp, 4, $inp ! $tmp is number of rounds
++ add $tmp, 2, $tmp
++ add $out, $inp, $inp ! $inp=$out+16*rounds
++ srl $tmp, 2, $tmp ! $tmp=(rounds+2)/4
++
++.Lkey_flip:
++ ldd [$out + 0], %f0
++ ldd [$out + 8], %f2
++ ldd [$out + 16], %f4
++ ldd [$out + 24], %f6
++ ldd [$inp + 0], %f8
++ ldd [$inp + 8], %f10
++ ldd [$inp - 16], %f12
++ ldd [$inp - 8], %f14
++ sub $tmp, 1, $tmp
++ std %f0, [$inp + 0]
++ std %f2, [$inp + 8]
++ std %f4, [$inp - 16]
++ std %f6, [$inp - 8]
++ std %f8, [$out + 0]
++ std %f10, [$out + 8]
++ std %f12, [$out + 16]
++ std %f14, [$out + 24]
++ add $out, 32, $out
++ brnz $tmp, .Lkey_flip
++ sub $inp, 32, $inp
++
++ retl
++ xor %o0, %o0, %o0
++.type aes_t4_set_decrypt_key,#function
++.size aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
++___
++}
++
++{{{
++my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
++my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
++
++$code.=<<___;
++.align 32
++_aes128_loadkey:
++ ldx [$key + 0], %g4
++ ldx [$key + 8], %g5
++___
++for ($i=2; $i<22;$i++) { # load key schedule
++ $code.=<<___;
++ ldd [$key + `8*$i`], %f`12+2*$i`
++___
++}
++$code.=<<___;
++ retl
++ nop
++.type _aes128_loadkey,#function
++.size _aes128_loadkey,.-_aes128_loadkey
++_aes128_load_enckey=_aes128_loadkey
++_aes128_load_deckey=_aes128_loadkey
++
++.align 32
++_aes128_encrypt_1x:
++___
++for ($i=0; $i<4; $i++) {
++ $code.=<<___;
++ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
++ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
++ aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
++ aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
++___
++}
++$code.=<<___;
++ aes_eround01 %f48, %f0, %f2, %f4
++ aes_eround23 %f50, %f0, %f2, %f2
++ aes_eround01_l %f52, %f4, %f2, %f0
++ retl
++ aes_eround23_l %f54, %f4, %f2, %f2
++.type _aes128_encrypt_1x,#function
++.size _aes128_encrypt_1x,.-_aes128_encrypt_1x
++
++.align 32
++_aes128_encrypt_2x:
++___
++for ($i=0; $i<4; $i++) {
++ $code.=<<___;
++ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
++ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
++ aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
++ aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
++ aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
++ aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
++ aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
++ aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
++___
++}
++$code.=<<___;
++ aes_eround01 %f48, %f0, %f2, %f8
++ aes_eround23 %f50, %f0, %f2, %f2
++ aes_eround01 %f48, %f4, %f6, %f10
++ aes_eround23 %f50, %f4, %f6, %f6
++ aes_eround01_l %f52, %f8, %f2, %f0
++ aes_eround23_l %f54, %f8, %f2, %f2
++ aes_eround01_l %f52, %f10, %f6, %f4
++ retl
++ aes_eround23_l %f54, %f10, %f6, %f6
++.type _aes128_encrypt_2x,#function
++.size _aes128_encrypt_2x,.-_aes128_encrypt_2x
++
++.align 32
++_aes128_decrypt_1x:
++___
++for ($i=0; $i<4; $i++) {
++ $code.=<<___;
++ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
++ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
++ aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
++ aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
++___
++}
++$code.=<<___;
++ aes_dround01 %f48, %f0, %f2, %f4
++ aes_dround23 %f50, %f0, %f2, %f2
++ aes_dround01_l %f52, %f4, %f2, %f0
++ retl
++ aes_dround23_l %f54, %f4, %f2, %f2
++.type _aes128_decrypt_1x,#function
++.size _aes128_decrypt_1x,.-_aes128_decrypt_1x
++
++.align 32
++_aes128_decrypt_2x:
++___
++for ($i=0; $i<4; $i++) {
++ $code.=<<___;
++ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
++ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
++ aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
++ aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
++ aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
++ aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
++ aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
++ aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
++___
++}
++$code.=<<___;
++ aes_dround01 %f48, %f0, %f2, %f8
++ aes_dround23 %f50, %f0, %f2, %f2
++ aes_dround01 %f48, %f4, %f6, %f10
++ aes_dround23 %f50, %f4, %f6, %f6
++ aes_dround01_l %f52, %f8, %f2, %f0
++ aes_dround23_l %f54, %f8, %f2, %f2
++ aes_dround01_l %f52, %f10, %f6, %f4
++ retl
++ aes_dround23_l %f54, %f10, %f6, %f6
++.type _aes128_decrypt_2x,#function
++.size _aes128_decrypt_2x,.-_aes128_decrypt_2x
++
++.align 32
++_aes192_loadkey:
++_aes256_loadkey:
++ ldx [$key + 0], %g4
++ ldx [$key + 8], %g5
++___
++for ($i=2; $i<26;$i++) { # load key schedule
++ $code.=<<___;
++ ldd [$key + `8*$i`], %f`12+2*$i`
++___
++}
++$code.=<<___;
++ retl
++ nop
++.type _aes192_loadkey,#function
++.size _aes192_loadkey,.-_aes192_loadkey
++_aes192_load_enckey=_aes192_loadkey
++_aes192_load_deckey=_aes192_loadkey
++_aes256_load_enckey=_aes192_loadkey
++_aes256_load_deckey=_aes192_loadkey
++
++.align 32
++_aes192_encrypt_1x:
++___
++for ($i=0; $i<5; $i++) {
++ $code.=<<___;
++ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
++ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
++ aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
++ aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
++___
++}
++$code.=<<___;
++ aes_eround01 %f56, %f0, %f2, %f4
++ aes_eround23 %f58, %f0, %f2, %f2
++ aes_eround01_l %f60, %f4, %f2, %f0
++ retl
++ aes_eround23_l %f62, %f4, %f2, %f2
++.type _aes192_encrypt_1x,#function
++.size _aes192_encrypt_1x,.-_aes192_encrypt_1x
++
++.align 32
++_aes192_encrypt_2x:
++___
++for ($i=0; $i<5; $i++) {
++ $code.=<<___;
++ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
++ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
++ aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
++ aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
++ aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
++ aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
++ aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
++ aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
++___
++}
++$code.=<<___;
++ aes_eround01 %f56, %f0, %f2, %f8
++ aes_eround23 %f58, %f0, %f2, %f2
++ aes_eround01 %f56, %f4, %f6, %f10
++ aes_eround23 %f58, %f4, %f6, %f6
++ aes_eround01_l %f60, %f8, %f2, %f0
++ aes_eround23_l %f62, %f8, %f2, %f2
++ aes_eround01_l %f60, %f10, %f6, %f4
++ retl
++ aes_eround23_l %f62, %f10, %f6, %f6
++.type _aes192_encrypt_2x,#function
++.size _aes192_encrypt_2x,.-_aes192_encrypt_2x
++
++.align 32
++_aes192_decrypt_1x:
++___
++for ($i=0; $i<5; $i++) {
++ $code.=<<___;
++ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
++ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
++ aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
++ aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
++___
++}
++$code.=<<___;
++ aes_dround01 %f56, %f0, %f2, %f4
++ aes_dround23 %f58, %f0, %f2, %f2
++ aes_dround01_l %f60, %f4, %f2, %f0
++ retl
++ aes_dround23_l %f62, %f4, %f2, %f2
++.type _aes192_decrypt_1x,#function
++.size _aes192_decrypt_1x,.-_aes192_decrypt_1x
++
++.align 32
++_aes192_decrypt_2x:
++___
++for ($i=0; $i<5; $i++) {
++ $code.=<<___;
++ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
++ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
++ aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
++ aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
++ aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
++ aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
++ aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
++ aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
++___
++}
++$code.=<<___;
++ aes_dround01 %f56, %f0, %f2, %f8
++ aes_dround23 %f58, %f0, %f2, %f2
++ aes_dround01 %f56, %f4, %f6, %f10
++ aes_dround23 %f58, %f4, %f6, %f6
++ aes_dround01_l %f60, %f8, %f2, %f0
++ aes_dround23_l %f62, %f8, %f2, %f2
++ aes_dround01_l %f60, %f10, %f6, %f4
++ retl
++ aes_dround23_l %f62, %f10, %f6, %f6
++.type _aes192_decrypt_2x,#function
++.size _aes192_decrypt_2x,.-_aes192_decrypt_2x
++
++.align 32
++_aes256_encrypt_1x:
++ aes_eround01 %f16, %f0, %f2, %f4
++ aes_eround23 %f18, %f0, %f2, %f2
++ ldd [$key + 208], %f16
++ ldd [$key + 216], %f18
++ aes_eround01 %f20, %f4, %f2, %f0
++ aes_eround23 %f22, %f4, %f2, %f2
++ ldd [$key + 224], %f20
++ ldd [$key + 232], %f22
++___
++for ($i=1; $i<6; $i++) {
++ $code.=<<___;
++ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
++ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
++ aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
++ aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
++___
++}
++$code.=<<___;
++ aes_eround01 %f16, %f0, %f2, %f4
++ aes_eround23 %f18, %f0, %f2, %f2
++ ldd [$key + 16], %f16
++ ldd [$key + 24], %f18
++ aes_eround01_l %f20, %f4, %f2, %f0
++ aes_eround23_l %f22, %f4, %f2, %f2
++ ldd [$key + 32], %f20
++ retl
++ ldd [$key + 40], %f22
++.type _aes256_encrypt_1x,#function
++.size _aes256_encrypt_1x,.-_aes256_encrypt_1x
++
++.align 32
++_aes256_encrypt_2x:
++ aes_eround01 %f16, %f0, %f2, %f8
++ aes_eround23 %f18, %f0, %f2, %f2
++ aes_eround01 %f16, %f4, %f6, %f10
++ aes_eround23 %f18, %f4, %f6, %f6
++ ldd [$key + 208], %f16
++ ldd [$key + 216], %f18
++ aes_eround01 %f20, %f8, %f2, %f0
++ aes_eround23 %f22, %f8, %f2, %f2
++ aes_eround01 %f20, %f10, %f6, %f4
++ aes_eround23 %f22, %f10, %f6, %f6
++ ldd [$key + 224], %f20
++ ldd [$key + 232], %f22
++___
++for ($i=1; $i<6; $i++) {
++ $code.=<<___;
++ aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
++ aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
++ aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
++ aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
++ aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
++ aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
++ aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
++ aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
++___
++}
++$code.=<<___;
++ aes_eround01 %f16, %f0, %f2, %f8
++ aes_eround23 %f18, %f0, %f2, %f2
++ aes_eround01 %f16, %f4, %f6, %f10
++ aes_eround23 %f18, %f4, %f6, %f6
++ ldd [$key + 16], %f16
++ ldd [$key + 24], %f18
++ aes_eround01_l %f20, %f8, %f2, %f0
++ aes_eround23_l %f22, %f8, %f2, %f2
++ aes_eround01_l %f20, %f10, %f6, %f4
++ aes_eround23_l %f22, %f10, %f6, %f6
++ ldd [$key + 32], %f20
++ retl
++ ldd [$key + 40], %f22
++.type _aes256_encrypt_2x,#function
++.size _aes256_encrypt_2x,.-_aes256_encrypt_2x
++
++.align 32
++_aes256_decrypt_1x:
++ aes_dround01 %f16, %f0, %f2, %f4
++ aes_dround23 %f18, %f0, %f2, %f2
++ ldd [$key + 208], %f16
++ ldd [$key + 216], %f18
++ aes_dround01 %f20, %f4, %f2, %f0
++ aes_dround23 %f22, %f4, %f2, %f2
++ ldd [$key + 224], %f20
++ ldd [$key + 232], %f22
++___
++for ($i=1; $i<6; $i++) {
++ $code.=<<___;
++ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
++ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
++ aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
++ aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
++___
++}
++$code.=<<___;
++ aes_dround01 %f16, %f0, %f2, %f4
++ aes_dround23 %f18, %f0, %f2, %f2
++ ldd [$key + 16], %f16
++ ldd [$key + 24], %f18
++ aes_dround01_l %f20, %f4, %f2, %f0
++ aes_dround23_l %f22, %f4, %f2, %f2
++ ldd [$key + 32], %f20
++ retl
++ ldd [$key + 40], %f22
++.type _aes256_decrypt_1x,#function
++.size _aes256_decrypt_1x,.-_aes256_decrypt_1x
++
++.align 32
++_aes256_decrypt_2x:
++ aes_dround01 %f16, %f0, %f2, %f8
++ aes_dround23 %f18, %f0, %f2, %f2
++ aes_dround01 %f16, %f4, %f6, %f10
++ aes_dround23 %f18, %f4, %f6, %f6
++ ldd [$key + 208], %f16
++ ldd [$key + 216], %f18
++ aes_dround01 %f20, %f8, %f2, %f0
++ aes_dround23 %f22, %f8, %f2, %f2
++ aes_dround01 %f20, %f10, %f6, %f4
++ aes_dround23 %f22, %f10, %f6, %f6
++ ldd [$key + 224], %f20
++ ldd [$key + 232], %f22
++___
++for ($i=1; $i<6; $i++) {
++ $code.=<<___;
++ aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
++ aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
++ aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
++ aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
++ aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
++ aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
++ aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
++ aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
++___
++}
++$code.=<<___;
++ aes_dround01 %f16, %f0, %f2, %f8
++ aes_dround23 %f18, %f0, %f2, %f2
++ aes_dround01 %f16, %f4, %f6, %f10
++ aes_dround23 %f18, %f4, %f6, %f6
++ ldd [$key + 16], %f16
++ ldd [$key + 24], %f18
++ aes_dround01_l %f20, %f8, %f2, %f0
++ aes_dround23_l %f22, %f8, %f2, %f2
++ aes_dround01_l %f20, %f10, %f6, %f4
++ aes_dround23_l %f22, %f10, %f6, %f6
++ ldd [$key + 32], %f20
++ retl
++ ldd [$key + 40], %f22
++.type _aes256_decrypt_2x,#function
++.size _aes256_decrypt_2x,.-_aes256_decrypt_2x
++___
++
++&alg_cbc_encrypt_implement("aes",128);
++&alg_cbc_encrypt_implement("aes",192);
++&alg_cbc_encrypt_implement("aes",256);
++
++&alg_cbc_decrypt_implement("aes",128);
++&alg_cbc_decrypt_implement("aes",192);
++&alg_cbc_decrypt_implement("aes",256);
++
++if ($::evp) {
++ &alg_ctr32_implement("aes",128);
++ &alg_ctr32_implement("aes",192);
++ &alg_ctr32_implement("aes",256);
++}
++}}}
++
++if (!$::evp) {
++$code.=<<___;
++.global AES_encrypt
++AES_encrypt=aes_t4_encrypt
++.global AES_decrypt
++AES_decrypt=aes_t4_decrypt
++.global AES_set_encrypt_key
++.align 32
++AES_set_encrypt_key:
++ andcc %o2, 7, %g0 ! check alignment
++ bnz,a,pn %icc, 1f
++ mov -1, %o0
++ brz,a,pn %o0, 1f
++ mov -1, %o0
++ brz,a,pn %o2, 1f
++ mov -1, %o0
++ andncc %o1, 0x1c0, %g0
++ bnz,a,pn %icc, 1f
++ mov -2, %o0
++ cmp %o1, 128
++ bl,a,pn %icc, 1f
++ mov -2, %o0
++ b aes_t4_set_encrypt_key
++ nop
++1: retl
++ nop
++.type AES_set_encrypt_key,#function
++.size AES_set_encrypt_key,.-AES_set_encrypt_key
++
++.global AES_set_decrypt_key
++.align 32
++AES_set_decrypt_key:
++ andcc %o2, 7, %g0 ! check alignment
++ bnz,a,pn %icc, 1f
++ mov -1, %o0
++ brz,a,pn %o0, 1f
++ mov -1, %o0
++ brz,a,pn %o2, 1f
++ mov -1, %o0
++ andncc %o1, 0x1c0, %g0
++ bnz,a,pn %icc, 1f
++ mov -2, %o0
++ cmp %o1, 128
++ bl,a,pn %icc, 1f
++ mov -2, %o0
++ b aes_t4_set_decrypt_key
++ nop
++1: retl
++ nop
++.type AES_set_decrypt_key,#function
++.size AES_set_decrypt_key,.-AES_set_decrypt_key
++___
++
++my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
++
++$code.=<<___;
++.globl AES_cbc_encrypt
++.align 32
++AES_cbc_encrypt:
++ ld [$key + 240], %g1
++ nop
++ brz $enc, .Lcbc_decrypt
++ cmp %g1, 12
++
++ bl,pt %icc, aes128_t4_cbc_encrypt
++ nop
++ be,pn %icc, aes192_t4_cbc_encrypt
++ nop
++ ba aes256_t4_cbc_encrypt
++ nop
++
++.Lcbc_decrypt:
++ bl,pt %icc, aes128_t4_cbc_decrypt
++ nop
++ be,pn %icc, aes192_t4_cbc_decrypt
++ nop
++ ba aes256_t4_cbc_decrypt
++ nop
++.type AES_cbc_encrypt,#function
++.size AES_cbc_encrypt,.-AES_cbc_encrypt
++___
++}
++$code.=<<___;
++.asciz "AES for SPARC T4, David S. Miller, Andy Polyakov"
++.align 4
++___
++
++&emit_assembler();
++
++close STDOUT;
+Index: crypto/des/asm/dest4-sparcv9.pl
+===================================================================
+diff -uNr openssl-1.0.1m/crypto/des/asm/dest4-sparcv9.pl openssl-1.0.1m/crypto/des/asm/dest4-sparcv9.pl
+--- openssl-1.0.1m/crypto/des/asm/dest4-sparcv9.pl 1970-01-01 01:00:00.000000000 +0100
++++ openssl-1.0.1m/crypto/des/asm/dest4-sparcv9.pl 2015-06-02 09:52:11.809148396 +0200
+@@ -0,0 +1,602 @@
++#!/usr/bin/env perl
++
++# ====================================================================
++# Written by David S. Miller <davem at devemloft.net> and Andy Polyakov
++# <appro at openssl.org>. The module is licensed under 2-clause BSD
++# license. March 2013. All rights reserved.
++# ====================================================================
++
++######################################################################
++# DES for SPARC T4.
++#
++# As with other hardware-assisted ciphers, CBC encrypt results [for
++# aligned data] are virtually identical to the critical path lengths:
++#
++#                 DES             Triple-DES
++# CBC encrypt     4.14/4.15(*)    11.7/11.7
++# CBC decrypt     1.77/4.11(**)   6.42/7.47
++#
++# (*)  numbers after slash are for
++#      misaligned data;
++# (**) this is the result for the largest
++#      block size; unlike all other
++#      cases, results for smaller
++#      blocks are better[?];
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++push(@INC,"${dir}","${dir}../../perlasm");
++require "sparcv9_modes.pl";
++
++&asm_init(@ARGV);
++
++$code.=<<___ if ($::abibits==64);
++.register %g2,#scratch
++.register %g3,#scratch
++___
++
++$code.=<<___;
++.text
++___
++
++{ my ($inp,$out)=("%o0","%o1");
++
++$code.=<<___;
++.align 32
++.globl des_t4_key_expand
++.type des_t4_key_expand,#function
++des_t4_key_expand:
++ andcc $inp, 0x7, %g0
++ alignaddr $inp, %g0, $inp
++ bz,pt %icc, 1f
++ ldd [$inp + 0x00], %f0
++ ldd [$inp + 0x08], %f2
++ faligndata %f0, %f2, %f0
++1: des_kexpand %f0, 0, %f0
++ des_kexpand %f0, 1, %f2
++ std %f0, [$out + 0x00]
++ des_kexpand %f2, 3, %f6
++ std %f2, [$out + 0x08]
++ des_kexpand %f2, 2, %f4
++ des_kexpand %f6, 3, %f10
++ std %f6, [$out + 0x18]
++ des_kexpand %f6, 2, %f8
++ std %f4, [$out + 0x10]
++ des_kexpand %f10, 3, %f14
++ std %f10, [$out + 0x28]
++ des_kexpand %f10, 2, %f12
++ std %f8, [$out + 0x20]
++ des_kexpand %f14, 1, %f16
++ std %f14, [$out + 0x38]
++ des_kexpand %f16, 3, %f20
++ std %f12, [$out + 0x30]
++ des_kexpand %f16, 2, %f18
++ std %f16, [$out + 0x40]
++ des_kexpand %f20, 3, %f24
++ std %f20, [$out + 0x50]
++ des_kexpand %f20, 2, %f22
++ std %f18, [$out + 0x48]
++ des_kexpand %f24, 3, %f28
++ std %f24, [$out + 0x60]
++ des_kexpand %f24, 2, %f26
++ std %f22, [$out + 0x58]
++ des_kexpand %f28, 1, %f30
++ std %f28, [$out + 0x70]
++ std %f26, [$out + 0x68]
++ retl
++ std %f30, [$out + 0x78]
++.size des_t4_key_expand,.-des_t4_key_expand
++___
++}
++{ my ($inp,$out,$len,$key,$ivec) = map("%o$_",(0..4));
++ my ($ileft,$iright,$omask) = map("%g$_",(1..3));
++
++$code.=<<___;
++.globl des_t4_cbc_encrypt
++.align 32
++des_t4_cbc_encrypt:
++ ld [$ivec + 0], %f0 ! load ivec
++ ld [$ivec + 4], %f1
++
++ and $inp, 7, $ileft
++ andn $inp, 7, $inp
++ sll $ileft, 3, $ileft
++ mov 0xff, $omask
++ prefetch [$inp], 20
++ prefetch [$inp + 63], 20
++ sub %g0, $ileft, $iright
++ and $out, 7, %g4
++ alignaddrl $out, %g0, $out
++ srl $omask, %g4, $omask
++ srlx $len, 3, $len
++ movrz %g4, 0, $omask
++ prefetch [$out], 22
++
++ ldd [$key + 0x00], %f4 ! load key schedule
++ ldd [$key + 0x08], %f6
++ ldd [$key + 0x10], %f8
++ ldd [$key + 0x18], %f10
++ ldd [$key + 0x20], %f12
++ ldd [$key + 0x28], %f14
++ ldd [$key + 0x30], %f16
++ ldd [$key + 0x38], %f18
++ ldd [$key + 0x40], %f20
++ ldd [$key + 0x48], %f22
++ ldd [$key + 0x50], %f24
++ ldd [$key + 0x58], %f26
++ ldd [$key + 0x60], %f28
++ ldd [$key + 0x68], %f30
++ ldd [$key + 0x70], %f32
++ ldd [$key + 0x78], %f34
++
++.Ldes_cbc_enc_loop:
++ ldx [$inp + 0], %g4
++ brz,pt $ileft, 4f
++ nop
++
++ ldx [$inp + 8], %g5
++ sllx %g4, $ileft, %g4
++ srlx %g5, $iright, %g5
++ or %g5, %g4, %g4
++4:
++ movxtod %g4, %f2
++ prefetch [$inp + 8+63], 20
++ add $inp, 8, $inp
++ fxor %f2, %f0, %f0 ! ^= ivec
++ prefetch [$out + 63], 22
++
++ des_ip %f0, %f0
++ des_round %f4, %f6, %f0, %f0
++ des_round %f8, %f10, %f0, %f0
++ des_round %f12, %f14, %f0, %f0
++ des_round %f16, %f18, %f0, %f0
++ des_round %f20, %f22, %f0, %f0
++ des_round %f24, %f26, %f0, %f0
++ des_round %f28, %f30, %f0, %f0
++ des_round %f32, %f34, %f0, %f0
++ des_iip %f0, %f0
++
++ brnz,pn $omask, 2f
++ sub $len, 1, $len
++
++ std %f0, [$out + 0]
++ brnz,pt $len, .Ldes_cbc_enc_loop
++ add $out, 8, $out
++
++ st %f0, [$ivec + 0] ! write out ivec
++ retl
++ st %f1, [$ivec + 4]
++
++.align 16
++2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
++ ! and ~4x deterioration
++ ! in inp==out case
++ faligndata %f0, %f0, %f2 ! handle unaligned output
++
++ stda %f2, [$out + $omask]0xc0 ! partial store
++ add $out, 8, $out
++ orn %g0, $omask, $omask
++ stda %f2, [$out + $omask]0xc0 ! partial store
++
++ brnz,pt $len, .Ldes_cbc_enc_loop+4
++ orn %g0, $omask, $omask
++
++ st %f0, [$ivec + 0] ! write out ivec
++ retl
++ st %f1, [$ivec + 4]
++.type des_t4_cbc_encrypt,#function
++.size des_t4_cbc_encrypt,.-des_t4_cbc_encrypt
++
++.globl des_t4_cbc_decrypt
++.align 32
++des_t4_cbc_decrypt:
++ ld [$ivec + 0], %f2 ! load ivec
++ ld [$ivec + 4], %f3
++
++ and $inp, 7, $ileft
++ andn $inp, 7, $inp
++ sll $ileft, 3, $ileft
++ mov 0xff, $omask
++ prefetch [$inp], 20
++ prefetch [$inp + 63], 20
++ sub %g0, $ileft, $iright
++ and $out, 7, %g4
++ alignaddrl $out, %g0, $out
++ srl $omask, %g4, $omask
++ srlx $len, 3, $len
++ movrz %g4, 0, $omask
++ prefetch [$out], 22
++
++ ldd [$key + 0x78], %f4 ! load key schedule
++ ldd [$key + 0x70], %f6
++ ldd [$key + 0x68], %f8
++ ldd [$key + 0x60], %f10
++ ldd [$key + 0x58], %f12
++ ldd [$key + 0x50], %f14
++ ldd [$key + 0x48], %f16
++ ldd [$key + 0x40], %f18
++ ldd [$key + 0x38], %f20
++ ldd [$key + 0x30], %f22
++ ldd [$key + 0x28], %f24
++ ldd [$key + 0x20], %f26
++ ldd [$key + 0x18], %f28
++ ldd [$key + 0x10], %f30
++ ldd [$key + 0x08], %f32
++ ldd [$key + 0x00], %f34
++
++.Ldes_cbc_dec_loop:
++ ldx [$inp + 0], %g4
++ brz,pt $ileft, 4f
++ nop
++
++ ldx [$inp + 8], %g5
++ sllx %g4, $ileft, %g4
++ srlx %g5, $iright, %g5
++ or %g5, %g4, %g4
++4:
++ movxtod %g4, %f0
++ prefetch [$inp + 8+63], 20
++ add $inp, 8, $inp
++ prefetch [$out + 63], 22
++
++ des_ip %f0, %f0
++ des_round %f4, %f6, %f0, %f0
++ des_round %f8, %f10, %f0, %f0
++ des_round %f12, %f14, %f0, %f0
++ des_round %f16, %f18, %f0, %f0
++ des_round %f20, %f22, %f0, %f0
++ des_round %f24, %f26, %f0, %f0
++ des_round %f28, %f30, %f0, %f0
++ des_round %f32, %f34, %f0, %f0
++ des_iip %f0, %f0
++
++ fxor %f2, %f0, %f0 ! ^= ivec
++ movxtod %g4, %f2
++
++ brnz,pn $omask, 2f
++ sub $len, 1, $len
++
++ std %f0, [$out + 0]
++ brnz,pt $len, .Ldes_cbc_dec_loop
++ add $out, 8, $out
++
++ st %f2, [$ivec + 0] ! write out ivec
++ retl
++ st %f3, [$ivec + 4]
++
++.align 16
++2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
++ ! and ~4x deterioration
++ ! in inp==out case
++ faligndata %f0, %f0, %f0 ! handle unaligned output
++
++ stda %f0, [$out + $omask]0xc0 ! partial store
++ add $out, 8, $out
++ orn %g0, $omask, $omask
++ stda %f0, [$out + $omask]0xc0 ! partial store
++
++ brnz,pt $len, .Ldes_cbc_dec_loop+4
++ orn %g0, $omask, $omask
++
++ st %f2, [$ivec + 0] ! write out ivec
++ retl
++ st %f3, [$ivec + 4]
++.type des_t4_cbc_decrypt,#function
++.size des_t4_cbc_decrypt,.-des_t4_cbc_decrypt
++___
++
++# One might wonder why there are back-to-back des_iip/des_ip pairs
++# between EDE passes. Indeed, aren't they inverses of each other?
++# Almost: the net outcome of the pair is that the 32-bit words in the
++# target register are swapped. Consider the des_iip/des_ip pair a way
++# to perform that due swap; it's actually the fastest way in this case.
++
++$code.=<<___;
++.globl des_t4_ede3_cbc_encrypt
++.align 32
++des_t4_ede3_cbc_encrypt:
++ ld [$ivec + 0], %f0 ! load ivec
++ ld [$ivec + 4], %f1
++
++ and $inp, 7, $ileft
++ andn $inp, 7, $inp
++ sll $ileft, 3, $ileft
++ mov 0xff, $omask
++ prefetch [$inp], 20
++ prefetch [$inp + 63], 20
++ sub %g0, $ileft, $iright
++ and $out, 7, %g4
++ alignaddrl $out, %g0, $out
++ srl $omask, %g4, $omask
++ srlx $len, 3, $len
++ movrz %g4, 0, $omask
++ prefetch [$out], 22
++
++ ldd [$key + 0x00], %f4 ! load key schedule
++ ldd [$key + 0x08], %f6
++ ldd [$key + 0x10], %f8
++ ldd [$key + 0x18], %f10
++ ldd [$key + 0x20], %f12
++ ldd [$key + 0x28], %f14
++ ldd [$key + 0x30], %f16
++ ldd [$key + 0x38], %f18
++ ldd [$key + 0x40], %f20
++ ldd [$key + 0x48], %f22
++ ldd [$key + 0x50], %f24
++ ldd [$key + 0x58], %f26
++ ldd [$key + 0x60], %f28
++ ldd [$key + 0x68], %f30
++ ldd [$key + 0x70], %f32
++ ldd [$key + 0x78], %f34
++
++.Ldes_ede3_cbc_enc_loop:
++ ldx [$inp + 0], %g4
++ brz,pt $ileft, 4f
++ nop
++
++ ldx [$inp + 8], %g5
++ sllx %g4, $ileft, %g4
++ srlx %g5, $iright, %g5
++ or %g5, %g4, %g4
++4:
++ movxtod %g4, %f2
++ prefetch [$inp + 8+63], 20
++ add $inp, 8, $inp
++ fxor %f2, %f0, %f0 ! ^= ivec
++ prefetch [$out + 63], 22
++
++ des_ip %f0, %f0
++ des_round %f4, %f6, %f0, %f0
++ des_round %f8, %f10, %f0, %f0
++ des_round %f12, %f14, %f0, %f0
++ des_round %f16, %f18, %f0, %f0
++ ldd [$key + 0x100-0x08], %f36
++ ldd [$key + 0x100-0x10], %f38
++ des_round %f20, %f22, %f0, %f0
++ ldd [$key + 0x100-0x18], %f40
++ ldd [$key + 0x100-0x20], %f42
++ des_round %f24, %f26, %f0, %f0
++ ldd [$key + 0x100-0x28], %f44
++ ldd [$key + 0x100-0x30], %f46
++ des_round %f28, %f30, %f0, %f0
++ ldd [$key + 0x100-0x38], %f48
++ ldd [$key + 0x100-0x40], %f50
++ des_round %f32, %f34, %f0, %f0
++ ldd [$key + 0x100-0x48], %f52
++ ldd [$key + 0x100-0x50], %f54
++ des_iip %f0, %f0
++
++ ldd [$key + 0x100-0x58], %f56
++ ldd [$key + 0x100-0x60], %f58
++ des_ip %f0, %f0
++ ldd [$key + 0x100-0x68], %f60
++ ldd [$key + 0x100-0x70], %f62
++ des_round %f36, %f38, %f0, %f0
++ ldd [$key + 0x100-0x78], %f36
++ ldd [$key + 0x100-0x80], %f38
++ des_round %f40, %f42, %f0, %f0
++ des_round %f44, %f46, %f0, %f0
++ des_round %f48, %f50, %f0, %f0
++ ldd [$key + 0x100+0x00], %f40
++ ldd [$key + 0x100+0x08], %f42
++ des_round %f52, %f54, %f0, %f0
++ ldd [$key + 0x100+0x10], %f44
++ ldd [$key + 0x100+0x18], %f46
++ des_round %f56, %f58, %f0, %f0
++ ldd [$key + 0x100+0x20], %f48
++ ldd [$key + 0x100+0x28], %f50
++ des_round %f60, %f62, %f0, %f0
++ ldd [$key + 0x100+0x30], %f52
++ ldd [$key + 0x100+0x38], %f54
++ des_round %f36, %f38, %f0, %f0
++ ldd [$key + 0x100+0x40], %f56
++ ldd [$key + 0x100+0x48], %f58
++ des_iip %f0, %f0
++
++ ldd [$key + 0x100+0x50], %f60
++ ldd [$key + 0x100+0x58], %f62
++ des_ip %f0, %f0
++ ldd [$key + 0x100+0x60], %f36
++ ldd [$key + 0x100+0x68], %f38
++ des_round %f40, %f42, %f0, %f0
++ ldd [$key + 0x100+0x70], %f40
++ ldd [$key + 0x100+0x78], %f42
++ des_round %f44, %f46, %f0, %f0
++ des_round %f48, %f50, %f0, %f0
++ des_round %f52, %f54, %f0, %f0
++ des_round %f56, %f58, %f0, %f0
++ des_round %f60, %f62, %f0, %f0
++ des_round %f36, %f38, %f0, %f0
++ des_round %f40, %f42, %f0, %f0
++ des_iip %f0, %f0
++
++ brnz,pn $omask, 2f
++ sub $len, 1, $len
++
++ std %f0, [$out + 0]
++ brnz,pt $len, .Ldes_ede3_cbc_enc_loop
++ add $out, 8, $out
++
++ st %f0, [$ivec + 0] ! write out ivec
++ retl
++ st %f1, [$ivec + 4]
++
++.align 16
++2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
++ ! and ~2x deterioration
++ ! in inp==out case
++ faligndata %f0, %f0, %f2 ! handle unaligned output
++
++ stda %f2, [$out + $omask]0xc0 ! partial store
++ add $out, 8, $out
++ orn %g0, $omask, $omask
++ stda %f2, [$out + $omask]0xc0 ! partial store
++
++ brnz,pt $len, .Ldes_ede3_cbc_enc_loop+4
++ orn %g0, $omask, $omask
++
++ st %f0, [$ivec + 0] ! write out ivec
++ retl
++ st %f1, [$ivec + 4]
++.type des_t4_ede3_cbc_encrypt,#function
++.size des_t4_ede3_cbc_encrypt,.-des_t4_ede3_cbc_encrypt
++
++.globl des_t4_ede3_cbc_decrypt
++.align 32
++des_t4_ede3_cbc_decrypt:
++ ld [$ivec + 0], %f2 ! load ivec
++ ld [$ivec + 4], %f3
++
++ and $inp, 7, $ileft
++ andn $inp, 7, $inp
++ sll $ileft, 3, $ileft
++ mov 0xff, $omask
++ prefetch [$inp], 20
++ prefetch [$inp + 63], 20
++ sub %g0, $ileft, $iright
++ and $out, 7, %g4
++ alignaddrl $out, %g0, $out
++ srl $omask, %g4, $omask
++ srlx $len, 3, $len
++ movrz %g4, 0, $omask
++ prefetch [$out], 22
++
++ ldd [$key + 0x100+0x78], %f4 ! load key schedule
++ ldd [$key + 0x100+0x70], %f6
++ ldd [$key + 0x100+0x68], %f8
++ ldd [$key + 0x100+0x60], %f10
++ ldd [$key + 0x100+0x58], %f12
++ ldd [$key + 0x100+0x50], %f14
++ ldd [$key + 0x100+0x48], %f16
++ ldd [$key + 0x100+0x40], %f18
++ ldd [$key + 0x100+0x38], %f20
++ ldd [$key + 0x100+0x30], %f22
++ ldd [$key + 0x100+0x28], %f24
++ ldd [$key + 0x100+0x20], %f26
++ ldd [$key + 0x100+0x18], %f28
++ ldd [$key + 0x100+0x10], %f30
++ ldd [$key + 0x100+0x08], %f32
++ ldd [$key + 0x100+0x00], %f34
++
++.Ldes_ede3_cbc_dec_loop:
++ ldx [$inp + 0], %g4
++ brz,pt $ileft, 4f
++ nop
++
++ ldx [$inp + 8], %g5
++ sllx %g4, $ileft, %g4
++ srlx %g5, $iright, %g5
++ or %g5, %g4, %g4
++4:
++ movxtod %g4, %f0
++ prefetch [$inp + 8+63], 20
++ add $inp, 8, $inp
++ prefetch [$out + 63], 22
++
++ des_ip %f0, %f0
++ des_round %f4, %f6, %f0, %f0
++ des_round %f8, %f10, %f0, %f0
++ des_round %f12, %f14, %f0, %f0
++ des_round %f16, %f18, %f0, %f0
++ ldd [$key + 0x80+0x00], %f36
++ ldd [$key + 0x80+0x08], %f38
++ des_round %f20, %f22, %f0, %f0
++ ldd [$key + 0x80+0x10], %f40
++ ldd [$key + 0x80+0x18], %f42
++ des_round %f24, %f26, %f0, %f0
++ ldd [$key + 0x80+0x20], %f44
++ ldd [$key + 0x80+0x28], %f46
++ des_round %f28, %f30, %f0, %f0
++ ldd [$key + 0x80+0x30], %f48
++ ldd [$key + 0x80+0x38], %f50
++ des_round %f32, %f34, %f0, %f0
++ ldd [$key + 0x80+0x40], %f52
++ ldd [$key + 0x80+0x48], %f54
++ des_iip %f0, %f0
++
++ ldd [$key + 0x80+0x50], %f56
++ ldd [$key + 0x80+0x58], %f58
++ des_ip %f0, %f0
++ ldd [$key + 0x80+0x60], %f60
++ ldd [$key + 0x80+0x68], %f62
++ des_round %f36, %f38, %f0, %f0
++ ldd [$key + 0x80+0x70], %f36
++ ldd [$key + 0x80+0x78], %f38
++ des_round %f40, %f42, %f0, %f0
++ des_round %f44, %f46, %f0, %f0
++ des_round %f48, %f50, %f0, %f0
++ ldd [$key + 0x80-0x08], %f40
++ ldd [$key + 0x80-0x10], %f42
++ des_round %f52, %f54, %f0, %f0
++ ldd [$key + 0x80-0x18], %f44
++ ldd [$key + 0x80-0x20], %f46
++ des_round %f56, %f58, %f0, %f0
++ ldd [$key + 0x80-0x28], %f48
++ ldd [$key + 0x80-0x30], %f50
++ des_round %f60, %f62, %f0, %f0
++ ldd [$key + 0x80-0x38], %f52
++ ldd [$key + 0x80-0x40], %f54
++ des_round %f36, %f38, %f0, %f0
++ ldd [$key + 0x80-0x48], %f56
++ ldd [$key + 0x80-0x50], %f58
++ des_iip %f0, %f0
++
++ ldd [$key + 0x80-0x58], %f60
++ ldd [$key + 0x80-0x60], %f62
++ des_ip %f0, %f0
++ ldd [$key + 0x80-0x68], %f36
++ ldd [$key + 0x80-0x70], %f38
++ des_round %f40, %f42, %f0, %f0
++ ldd [$key + 0x80-0x78], %f40
++ ldd [$key + 0x80-0x80], %f42
++ des_round %f44, %f46, %f0, %f0
++ des_round %f48, %f50, %f0, %f0
++ des_round %f52, %f54, %f0, %f0
++ des_round %f56, %f58, %f0, %f0
++ des_round %f60, %f62, %f0, %f0
++ des_round %f36, %f38, %f0, %f0
++ des_round %f40, %f42, %f0, %f0
++ des_iip %f0, %f0
++
++ fxor %f2, %f0, %f0 ! ^= ivec
++ movxtod %g4, %f2
++
++ brnz,pn $omask, 2f
++ sub $len, 1, $len
++
++ std %f0, [$out + 0]
++ brnz,pt $len, .Ldes_ede3_cbc_dec_loop
++ add $out, 8, $out
++
++ st %f2, [$ivec + 0] ! write out ivec
++ retl
++ st %f3, [$ivec + 4]
++
++.align 16
++2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
++ ! and ~3x deterioration
++ ! in inp==out case
++ faligndata %f0, %f0, %f0 ! handle unaligned output
++
++ stda %f0, [$out + $omask]0xc0 ! partial store
++ add $out, 8, $out
++ orn %g0, $omask, $omask
++ stda %f0, [$out + $omask]0xc0 ! partial store
++
++ brnz,pt $len, .Ldes_ede3_cbc_dec_loop+4
++ orn %g0, $omask, $omask
++
++ st %f2, [$ivec + 0] ! write out ivec
++ retl
++ st %f3, [$ivec + 4]
++.type des_t4_ede3_cbc_decrypt,#function
++.size des_t4_ede3_cbc_decrypt,.-des_t4_ede3_cbc_decrypt
++___
++}
++$code.=<<___;
++.asciz "DES for SPARC T4, David S. Miller, Andy Polyakov"
++.align 4
++___
++
++&emit_assembler();
++
++close STDOUT;
+Index: crypto/perlasm/sparcv9_modes.pl
+===================================================================
+diff -uNr openssl-1.0.1m/crypto/perlasm/sparcv9_modes.pl openssl-1.0.1m/crypto/perlasm/sparcv9_modes.pl
+--- openssl-1.0.1m/crypto/perlasm/sparcv9_modes.pl 1970-01-01 01:00:00.000000000 +0100
++++ openssl-1.0.1m/crypto/perlasm/sparcv9_modes.pl 2015-06-02 09:52:11.809148396 +0200
+@@ -0,0 +1,1680 @@
++#!/usr/bin/env perl
++
++# Mode-specific implementations for SPARC Architecture 2011. There is
++# a T4 dependency, though: an ASI value that is not specified in the
++# Architecture Manual. But as the SPARC universe is rather
++# monocultural, we assume that a processor capable of executing the
++# crypto instructions can handle the ASI in question as well. This
++# means we ought to keep our eyes open when new processors emerge...
++#
++# As for the above-mentioned ASI: it's the so-called "block
++# initializing store", which cancels the "read" in "read-update-write"
++# on cache lines. This is a "cooperative" optimization, as it reduces
++# overall pressure on the memory interface. The benefit can't be
++# observed/quantified with the usual benchmarks; on the contrary, you
++# can notice that single-thread performance for parallelizable modes
++# is ~1.5% worse for the largest block sizes [though a few percent
++# better for not-so-long ones]. All this is based on suggestions from
++# David Miller.
++
++sub asm_init { # to be called with @ARGV as argument
++ for (@_) { $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
++ if ($::abibits==64) { $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
++ else { $::bias=0; $::frame=112; $::size_t_cc="%icc"; }
++}
++
++# unified interface
++my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
++# local variables
++my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));
++
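++# A minimal sketch of how this module is expected to be driven (an
++# assumption inferred from the subs below, not copied from any actual
++# caller; the $::evp flag and the flow mirror what a t4-style perlasm
++# driver script would do):
++#
++#	$::evp = 1;			# choose the EVP-style ivec handling
++#	require "sparcv9_modes.pl";
++#	&asm_init(@ARGV);		# pick 32-/64-bit ABI from the flags
++#	&alg_cbc_encrypt_implement("aes", 128);
++#	&alg_cbc_decrypt_implement("aes", 128);
++#	&alg_ctr32_implement("aes", 128);
++#	&emit_assembler();		# post-process $::code and print it
++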
++sub alg_cbc_encrypt_implement {
++my ($alg,$bits) = @_;
++
++$::code.=<<___;
++.globl ${alg}${bits}_t4_cbc_encrypt
++.align 32
++${alg}${bits}_t4_cbc_encrypt:
++ save %sp, -$::frame, %sp
++ sub $inp, $out, $blk_init ! $inp!=$out
++___
++$::code.=<<___ if (!$::evp);
++ andcc $ivec, 7, $ivoff
++ alignaddr $ivec, %g0, $ivec
++
++ ldd [$ivec + 0], %f0 ! load ivec
++ bz,pt %icc, 1f
++ ldd [$ivec + 8], %f2
++ ldd [$ivec + 16], %f4
++ faligndata %f0, %f2, %f0
++ faligndata %f2, %f4, %f2
++1:
++___
++$::code.=<<___ if ($::evp);
++ ld [$ivec + 0], %f0
++ ld [$ivec + 4], %f1
++ ld [$ivec + 8], %f2
++ ld [$ivec + 12], %f3
++___
++$::code.=<<___;
++ prefetch [$inp], 20
++ prefetch [$inp + 63], 20
++ call _${alg}${bits}_load_enckey
++ and $inp, 7, $ileft
++ andn $inp, 7, $inp
++ sll $ileft, 3, $ileft
++ mov 64, $iright
++ mov 0xff, $omask
++ sub $iright, $ileft, $iright
++ and $out, 7, $ooff
++ cmp $len, 127
++ movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
++ movleu $::size_t_cc, 0, $blk_init ! $len<128 ||
++ brnz,pn $blk_init, .L${bits}cbc_enc_blk ! $inp==$out)
++ srl $omask, $ooff, $omask
++
++ alignaddrl $out, %g0, $out
++ srlx $len, 4, $len
++ prefetch [$out], 22
++
++.L${bits}_cbc_enc_loop:
++ ldx [$inp + 0], %o0
++ brz,pt $ileft, 4f
++ ldx [$inp + 8], %o1
++
++ ldx [$inp + 16], %o2
++ sllx %o0, $ileft, %o0
++ srlx %o1, $iright, %g1
++ sllx %o1, $ileft, %o1
++ or %g1, %o0, %o0
++ srlx %o2, $iright, %o2
++ or %o2, %o1, %o1
++4:
++ xor %g4, %o0, %o0 ! ^= rk[0]
++ xor %g5, %o1, %o1
++ movxtod %o0, %f12
++ movxtod %o1, %f14
++
++ fxor %f12, %f0, %f0 ! ^= ivec
++ fxor %f14, %f2, %f2
++ prefetch [$out + 63], 22
++ prefetch [$inp + 16+63], 20
++ call _${alg}${bits}_encrypt_1x
++ add $inp, 16, $inp
++
++ brnz,pn $ooff, 2f
++ sub $len, 1, $len
++
++ std %f0, [$out + 0]
++ std %f2, [$out + 8]
++ brnz,pt $len, .L${bits}_cbc_enc_loop
++ add $out, 16, $out
++___
++$::code.=<<___ if ($::evp);
++ st %f0, [$ivec + 0]
++ st %f1, [$ivec + 4]
++ st %f2, [$ivec + 8]
++ st %f3, [$ivec + 12]
++___
++$::code.=<<___ if (!$::evp);
++ brnz,pn $ivoff, 3f
++ nop
++
++ std %f0, [$ivec + 0] ! write out ivec
++ std %f2, [$ivec + 8]
++___
++$::code.=<<___;
++ ret
++ restore
++
++.align 16
++2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
++ ! and ~3x deterioration
++ ! in inp==out case
++ faligndata %f0, %f0, %f4 ! handle unaligned output
++ faligndata %f0, %f2, %f6
++ faligndata %f2, %f2, %f8
++
++ stda %f4, [$out + $omask]0xc0 ! partial store
++ std %f6, [$out + 8]
++ add $out, 16, $out
++ orn %g0, $omask, $omask
++ stda %f8, [$out + $omask]0xc0 ! partial store
++
++ brnz,pt $len, .L${bits}_cbc_enc_loop+4
++ orn %g0, $omask, $omask
++___
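++# Editorial sketch of the mask arithmetic used by the partial stores
++# above, assuming 0xc0 is the v9 primary partial-store ASI with one
++# mask bit per byte: for $out % 8 == 3,
++#
++#	perl -e 'printf "%08b\n", 0xff >> 3'	# prints 00011111
++#
++# so the first "stda .. [$out + $omask]0xc0" writes only the five
++# bytes selected by the mask, and "orn %g0, $omask, $omask"
++# complements the mask so the next partial store covers the
++# remaining three bytes of the misaligned block.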
++$::code.=<<___ if ($::evp);
++ st %f0, [$ivec + 0]
++ st %f1, [$ivec + 4]
++ st %f2, [$ivec + 8]
++ st %f3, [$ivec + 12]
++___
++$::code.=<<___ if (!$::evp);
++ brnz,pn $ivoff, 3f
++ nop
++
++ std %f0, [$ivec + 0] ! write out ivec
++ std %f2, [$ivec + 8]
++ ret
++ restore
++
++.align 16
++3: alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
++ mov 0xff, $omask
++ srl $omask, $ivoff, $omask
++ faligndata %f0, %f0, %f4
++ faligndata %f0, %f2, %f6
++ faligndata %f2, %f2, %f8
++ stda %f4, [$ivec + $omask]0xc0
++ std %f6, [$ivec + 8]
++ add $ivec, 16, $ivec
++ orn %g0, $omask, $omask
++ stda %f8, [$ivec + $omask]0xc0
++___
++$::code.=<<___;
++ ret
++ restore
++
++!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
++.align 32
++.L${bits}cbc_enc_blk:
++ add $out, $len, $blk_init
++ and $blk_init, 63, $blk_init ! tail
++ sub $len, $blk_init, $len
++ add $blk_init, 15, $blk_init ! round up to 16n
++ srlx $len, 4, $len
++ srl $blk_init, 4, $blk_init
++
++.L${bits}_cbc_enc_blk_loop:
++ ldx [$inp + 0], %o0
++ brz,pt $ileft, 5f
++ ldx [$inp + 8], %o1
++
++ ldx [$inp + 16], %o2
++ sllx %o0, $ileft, %o0
++ srlx %o1, $iright, %g1
++ sllx %o1, $ileft, %o1
++ or %g1, %o0, %o0
++ srlx %o2, $iright, %o2
++ or %o2, %o1, %o1
++5:
++ xor %g4, %o0, %o0 ! ^= rk[0]
++ xor %g5, %o1, %o1
++ movxtod %o0, %f12
++ movxtod %o1, %f14
++
++ fxor %f12, %f0, %f0 ! ^= ivec
++ fxor %f14, %f2, %f2
++ prefetch [$inp + 16+63], 20
++ call _${alg}${bits}_encrypt_1x
++ add $inp, 16, $inp
++ sub $len, 1, $len
++
++ stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
++ add $out, 8, $out
++ stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
++ brnz,pt $len, .L${bits}_cbc_enc_blk_loop
++ add $out, 8, $out
++
++ membar #StoreLoad|#StoreStore
++ brnz,pt $blk_init, .L${bits}_cbc_enc_loop
++ mov $blk_init, $len
++___
++$::code.=<<___ if ($::evp);
++ st %f0, [$ivec + 0]
++ st %f1, [$ivec + 4]
++ st %f2, [$ivec + 8]
++ st %f3, [$ivec + 12]
++___
++$::code.=<<___ if (!$::evp);
++ brnz,pn $ivoff, 3b
++ nop
++
++ std %f0, [$ivec + 0] ! write out ivec
++ std %f2, [$ivec + 8]
++___
++$::code.=<<___;
++ ret
++ restore
++.type ${alg}${bits}_t4_cbc_encrypt,#function
++.size ${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
++___
++}
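++
++# Editorial note: throughout these subs the instruction following a
++# branch sits in the SPARC branch delay slot and executes whether or
++# not the branch is taken (no annul bit), e.g. in
++# "brnz,pt $len, .L*_cbc_enc_loop / add $out, 16, $out" the add bumps
++# $out on every iteration, including the final fall-through one.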
++
++sub alg_cbc_decrypt_implement {
++my ($alg,$bits) = @_;
++
++$::code.=<<___;
++.globl ${alg}${bits}_t4_cbc_decrypt
++.align 32
++${alg}${bits}_t4_cbc_decrypt:
++ save %sp, -$::frame, %sp
++ sub $inp, $out, $blk_init ! $inp!=$out
++___
++$::code.=<<___ if (!$::evp);
++ andcc $ivec, 7, $ivoff
++ alignaddr $ivec, %g0, $ivec
++
++ ldd [$ivec + 0], %f12 ! load ivec
++ bz,pt %icc, 1f
++ ldd [$ivec + 8], %f14
++ ldd [$ivec + 16], %f0
++ faligndata %f12, %f14, %f12
++ faligndata %f14, %f0, %f14
++1:
++___
++$::code.=<<___ if ($::evp);
++ ld [$ivec + 0], %f12 ! load ivec
++ ld [$ivec + 4], %f13
++ ld [$ivec + 8], %f14
++ ld [$ivec + 12], %f15
++___
++$::code.=<<___;
++ prefetch [$inp], 20
++ prefetch [$inp + 63], 20
++ call _${alg}${bits}_load_deckey
++ and $inp, 7, $ileft
++ andn $inp, 7, $inp
++ sll $ileft, 3, $ileft
++ mov 64, $iright
++ mov 0xff, $omask
++ sub $iright, $ileft, $iright
++ and $out, 7, $ooff
++ cmp $len, 255
++ movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
++ movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
++ brnz,pn $blk_init, .L${bits}cbc_dec_blk ! $inp==$out)
++ srl $omask, $ooff, $omask
++
++ andcc $len, 16, %g0 ! is number of blocks even?
++ srlx $len, 4, $len
++ alignaddrl $out, %g0, $out
++ bz %icc, .L${bits}_cbc_dec_loop2x
++ prefetch [$out], 22
++.L${bits}_cbc_dec_loop:
++ ldx [$inp + 0], %o0
++ brz,pt $ileft, 4f
++ ldx [$inp + 8], %o1
++
++ ldx [$inp + 16], %o2
++ sllx %o0, $ileft, %o0
++ srlx %o1, $iright, %g1
++ sllx %o1, $ileft, %o1
++ or %g1, %o0, %o0
++ srlx %o2, $iright, %o2
++ or %o2, %o1, %o1
++4:
++ xor %g4, %o0, %o2 ! ^= rk[0]
++ xor %g5, %o1, %o3
++ movxtod %o2, %f0
++ movxtod %o3, %f2
++
++ prefetch [$out + 63], 22
++ prefetch [$inp + 16+63], 20
++ call _${alg}${bits}_decrypt_1x
++ add $inp, 16, $inp
++
++ fxor %f12, %f0, %f0 ! ^= ivec
++ fxor %f14, %f2, %f2
++ movxtod %o0, %f12
++ movxtod %o1, %f14
++
++ brnz,pn $ooff, 2f
++ sub $len, 1, $len
++
++ std %f0, [$out + 0]
++ std %f2, [$out + 8]
++ brnz,pt $len, .L${bits}_cbc_dec_loop2x
++ add $out, 16, $out
++___
++$::code.=<<___ if ($::evp);
++ st %f12, [$ivec + 0]
++ st %f13, [$ivec + 4]
++ st %f14, [$ivec + 8]
++ st %f15, [$ivec + 12]
++___
++$::code.=<<___ if (!$::evp);
++ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
++ nop
++
++ std %f12, [$ivec + 0] ! write out ivec
++ std %f14, [$ivec + 8]
++___
++$::code.=<<___;
++ ret
++ restore
++
++.align 16
++2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
++ ! and ~3x deterioration
++ ! in inp==out case
++ faligndata %f0, %f0, %f4 ! handle unaligned output
++ faligndata %f0, %f2, %f6
++ faligndata %f2, %f2, %f8
++
++ stda %f4, [$out + $omask]0xc0 ! partial store
++ std %f6, [$out + 8]
++ add $out, 16, $out
++ orn %g0, $omask, $omask
++ stda %f8, [$out + $omask]0xc0 ! partial store
++
++ brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
++ orn %g0, $omask, $omask
++___
++$::code.=<<___ if ($::evp);
++ st %f12, [$ivec + 0]
++ st %f13, [$ivec + 4]
++ st %f14, [$ivec + 8]
++ st %f15, [$ivec + 12]
++___
++$::code.=<<___ if (!$::evp);
++ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
++ nop
++
++ std %f12, [$ivec + 0] ! write out ivec
++ std %f14, [$ivec + 8]
++___
++$::code.=<<___;
++ ret
++ restore
++
++!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
++.align 32
++.L${bits}_cbc_dec_loop2x:
++ ldx [$inp + 0], %o0
++ ldx [$inp + 8], %o1
++ ldx [$inp + 16], %o2
++ brz,pt $ileft, 4f
++ ldx [$inp + 24], %o3
++
++ ldx [$inp + 32], %o4
++ sllx %o0, $ileft, %o0
++ srlx %o1, $iright, %g1
++ or %g1, %o0, %o0
++ sllx %o1, $ileft, %o1
++ srlx %o2, $iright, %g1
++ or %g1, %o1, %o1
++ sllx %o2, $ileft, %o2
++ srlx %o3, $iright, %g1
++ or %g1, %o2, %o2
++ sllx %o3, $ileft, %o3
++ srlx %o4, $iright, %o4
++ or %o4, %o3, %o3
++4:
++ xor %g4, %o0, %o4 ! ^= rk[0]
++ xor %g5, %o1, %o5
++ movxtod %o4, %f0
++ movxtod %o5, %f2
++ xor %g4, %o2, %o4
++ xor %g5, %o3, %o5
++ movxtod %o4, %f4
++ movxtod %o5, %f6
++
++ prefetch [$out + 63], 22
++ prefetch [$inp + 32+63], 20
++ call _${alg}${bits}_decrypt_2x
++ add $inp, 32, $inp
++
++ movxtod %o0, %f8
++ movxtod %o1, %f10
++ fxor %f12, %f0, %f0 ! ^= ivec
++ fxor %f14, %f2, %f2
++ movxtod %o2, %f12
++ movxtod %o3, %f14
++ fxor %f8, %f4, %f4
++ fxor %f10, %f6, %f6
++
++ brnz,pn $ooff, 2f
++ sub $len, 2, $len
++
++ std %f0, [$out + 0]
++ std %f2, [$out + 8]
++ std %f4, [$out + 16]
++ std %f6, [$out + 24]
++ brnz,pt $len, .L${bits}_cbc_dec_loop2x
++ add $out, 32, $out
++___
++$::code.=<<___ if ($::evp);
++ st %f12, [$ivec + 0]
++ st %f13, [$ivec + 4]
++ st %f14, [$ivec + 8]
++ st %f15, [$ivec + 12]
++___
++$::code.=<<___ if (!$::evp);
++ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
++ nop
++
++ std %f12, [$ivec + 0] ! write out ivec
++ std %f14, [$ivec + 8]
++___
++$::code.=<<___;
++ ret
++ restore
++
++.align 16
++2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
++ ! and ~3x deterioration
++ ! in inp==out case
++ faligndata %f0, %f0, %f8 ! handle unaligned output
++ faligndata %f0, %f2, %f0
++ faligndata %f2, %f4, %f2
++ faligndata %f4, %f6, %f4
++ faligndata %f6, %f6, %f6
++ stda %f8, [$out + $omask]0xc0 ! partial store
++ std %f0, [$out + 8]
++ std %f2, [$out + 16]
++ std %f4, [$out + 24]
++ add $out, 32, $out
++ orn %g0, $omask, $omask
++ stda %f6, [$out + $omask]0xc0 ! partial store
++
++ brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
++ orn %g0, $omask, $omask
++___
++$::code.=<<___ if ($::evp);
++ st %f12, [$ivec + 0]
++ st %f13, [$ivec + 4]
++ st %f14, [$ivec + 8]
++ st %f15, [$ivec + 12]
++___
++$::code.=<<___ if (!$::evp);
++ brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
++ nop
++
++ std %f12, [$ivec + 0] ! write out ivec
++ std %f14, [$ivec + 8]
++ ret
++ restore
++
++.align 16
++.L${bits}_cbc_dec_unaligned_ivec:
++ alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
++ mov 0xff, $omask
++ srl $omask, $ivoff, $omask
++ faligndata %f12, %f12, %f0
++ faligndata %f12, %f14, %f2
++ faligndata %f14, %f14, %f4
++ stda %f0, [$ivec + $omask]0xc0
++ std %f2, [$ivec + 8]
++ add $ivec, 16, $ivec
++ orn %g0, $omask, $omask
++ stda %f4, [$ivec + $omask]0xc0
++___
++$::code.=<<___;
++ ret
++ restore
++
++!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
++.align 32
++.L${bits}cbc_dec_blk:
++ add $out, $len, $blk_init
++ and $blk_init, 63, $blk_init ! tail
++ sub $len, $blk_init, $len
++ add $blk_init, 15, $blk_init ! round up to 16n
++ srlx $len, 4, $len
++ srl $blk_init, 4, $blk_init
++ sub $len, 1, $len
++ add $blk_init, 1, $blk_init
++
++.L${bits}_cbc_dec_blk_loop2x:
++ ldx [$inp + 0], %o0
++ ldx [$inp + 8], %o1
++ ldx [$inp + 16], %o2
++ brz,pt $ileft, 5f
++ ldx [$inp + 24], %o3
++
++ ldx [$inp + 32], %o4
++ sllx %o0, $ileft, %o0
++ srlx %o1, $iright, %g1
++ or %g1, %o0, %o0
++ sllx %o1, $ileft, %o1
++ srlx %o2, $iright, %g1
++ or %g1, %o1, %o1
++ sllx %o2, $ileft, %o2
++ srlx %o3, $iright, %g1
++ or %g1, %o2, %o2
++ sllx %o3, $ileft, %o3
++ srlx %o4, $iright, %o4
++ or %o4, %o3, %o3
++5:
++ xor %g4, %o0, %o4 ! ^= rk[0]
++ xor %g5, %o1, %o5
++ movxtod %o4, %f0
++ movxtod %o5, %f2
++ xor %g4, %o2, %o4
++ xor %g5, %o3, %o5
++ movxtod %o4, %f4
++ movxtod %o5, %f6
++
++ prefetch [$inp + 32+63], 20
++ call _${alg}${bits}_decrypt_2x
++ add $inp, 32, $inp
++ subcc $len, 2, $len
++
++ movxtod %o0, %f8
++ movxtod %o1, %f10
++ fxor %f12, %f0, %f0 ! ^= ivec
++ fxor %f14, %f2, %f2
++ movxtod %o2, %f12
++ movxtod %o3, %f14
++ fxor %f8, %f4, %f4
++ fxor %f10, %f6, %f6
++
++ stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
++ add $out, 8, $out
++ stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
++ add $out, 8, $out
++ stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
++ add $out, 8, $out
++ stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
++ bgu,pt $::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
++ add $out, 8, $out
++
++ add $blk_init, $len, $len
++ andcc $len, 1, %g0 ! is number of blocks even?
++ membar #StoreLoad|#StoreStore
++ bnz,pt %icc, .L${bits}_cbc_dec_loop
++ srl $len, 0, $len
++ brnz,pn $len, .L${bits}_cbc_dec_loop2x
++ nop
++___
++$::code.=<<___ if ($::evp);
++ st %f12, [$ivec + 0] ! write out ivec
++ st %f13, [$ivec + 4]
++ st %f14, [$ivec + 8]
++ st %f15, [$ivec + 12]
++___
++$::code.=<<___ if (!$::evp);
++ brnz,pn $ivoff, 3b
++ nop
++
++ std %f12, [$ivec + 0] ! write out ivec
++ std %f14, [$ivec + 8]
++___
++$::code.=<<___;
++ ret
++ restore
++.type ${alg}${bits}_t4_cbc_decrypt,#function
++.size ${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
++___
++}
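++
++# Editorial note: unlike CBC encryption above, CBC decryption has no
++# block-to-block chaining dependency (each plaintext only XORs the
++# previous ciphertext), which is presumably why this sub adds the
++# _decrypt_2x two-blocks-per-iteration paths and first checks whether
++# the block count is even.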
++
++sub alg_ctr32_implement {
++my ($alg,$bits) = @_;
++
++$::code.=<<___;
++.globl ${alg}${bits}_t4_ctr32_encrypt
++.align 32
++${alg}${bits}_t4_ctr32_encrypt:
++ save %sp, -$::frame, %sp
++
++ prefetch [$inp], 20
++ prefetch [$inp + 63], 20
++ call _${alg}${bits}_load_enckey
++ sllx $len, 4, $len
++
++ ld [$ivec + 0], %l4 ! counter
++ ld [$ivec + 4], %l5
++ ld [$ivec + 8], %l6
++ ld [$ivec + 12], %l7
++
++ sllx %l4, 32, %o5
++ or %l5, %o5, %o5
++ sllx %l6, 32, %g1
++ xor %o5, %g4, %g4 ! ^= rk[0]
++ xor %g1, %g5, %g5
++ movxtod %g4, %f14 ! most significant 64 bits
++
++ sub $inp, $out, $blk_init ! $inp!=$out
++ and $inp, 7, $ileft
++ andn $inp, 7, $inp
++ sll $ileft, 3, $ileft
++ mov 64, $iright
++ mov 0xff, $omask
++ sub $iright, $ileft, $iright
++ and $out, 7, $ooff
++ cmp $len, 255
++ movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
++ movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
++ brnz,pn $blk_init, .L${bits}_ctr32_blk ! $inp==$out)
++ srl $omask, $ooff, $omask
++
++ andcc $len, 16, %g0 ! is number of blocks even?
++ alignaddrl $out, %g0, $out
++ bz %icc, .L${bits}_ctr32_loop2x
++ srlx $len, 4, $len
++.L${bits}_ctr32_loop:
++ ldx [$inp + 0], %o0
++ brz,pt $ileft, 4f
++ ldx [$inp + 8], %o1
++
++ ldx [$inp + 16], %o2
++ sllx %o0, $ileft, %o0
++ srlx %o1, $iright, %g1
++ sllx %o1, $ileft, %o1
++ or %g1, %o0, %o0
++ srlx %o2, $iright, %o2
++ or %o2, %o1, %o1
++4:
++ xor %g5, %l7, %g1 ! ^= rk[0]
++ add %l7, 1, %l7
++ movxtod %g1, %f2
++ srl %l7, 0, %l7 ! clruw
++ prefetch [$out + 63], 22
++ prefetch [$inp + 16+63], 20
++___
++$::code.=<<___ if ($alg eq "aes");
++ aes_eround01 %f16, %f14, %f2, %f4
++ aes_eround23 %f18, %f14, %f2, %f2
++___
++$::code.=<<___ if ($alg eq "cmll");
++ camellia_f %f16, %f2, %f14, %f2
++ camellia_f %f18, %f14, %f2, %f0
++___
++$::code.=<<___;
++ call _${alg}${bits}_encrypt_1x+8
++ add $inp, 16, $inp
++
++ movxtod %o0, %f10
++ movxtod %o1, %f12
++ fxor %f10, %f0, %f0 ! ^= inp
++ fxor %f12, %f2, %f2
++
++ brnz,pn $ooff, 2f
++ sub $len, 1, $len
++
++ std %f0, [$out + 0]
++ std %f2, [$out + 8]
++ brnz,pt $len, .L${bits}_ctr32_loop2x
++ add $out, 16, $out
++
++ ret
++ restore
++
++.align 16
++2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
++ ! and ~3x deterioration
++ ! in inp==out case
++ faligndata %f0, %f0, %f4 ! handle unaligned output
++ faligndata %f0, %f2, %f6
++ faligndata %f2, %f2, %f8
++ stda %f4, [$out + $omask]0xc0 ! partial store
++ std %f6, [$out + 8]
++ add $out, 16, $out
++ orn %g0, $omask, $omask
++ stda %f8, [$out + $omask]0xc0 ! partial store
++
++ brnz,pt $len, .L${bits}_ctr32_loop2x+4
++ orn %g0, $omask, $omask
++
++ ret
++ restore
++
++!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
++.align 32
++.L${bits}_ctr32_loop2x:
++ ldx [$inp + 0], %o0
++ ldx [$inp + 8], %o1
++ ldx [$inp + 16], %o2
++ brz,pt $ileft, 4f
++ ldx [$inp + 24], %o3
++
++ ldx [$inp + 32], %o4
++ sllx %o0, $ileft, %o0
++ srlx %o1, $iright, %g1
++ or %g1, %o0, %o0
++ sllx %o1, $ileft, %o1
++ srlx %o2, $iright, %g1
++ or %g1, %o1, %o1
++ sllx %o2, $ileft, %o2
++ srlx %o3, $iright, %g1
++ or %g1, %o2, %o2
++ sllx %o3, $ileft, %o3
++ srlx %o4, $iright, %o4
++ or %o4, %o3, %o3
++4:
++ xor %g5, %l7, %g1 ! ^= rk[0]
++ add %l7, 1, %l7
++ movxtod %g1, %f2
++ srl %l7, 0, %l7 ! clruw
++ xor %g5, %l7, %g1
++ add %l7, 1, %l7
++ movxtod %g1, %f6
++ srl %l7, 0, %l7 ! clruw
++ prefetch [$out + 63], 22
++ prefetch [$inp + 32+63], 20
++___
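++# Editorial note: the inline aes_eround01/aes_eround23 (or camellia_f)
++# pairs above and below perform the first cipher round on the freshly
++# built counter block; the "call _*_encrypt_1x+8" and
++# "call _*_encrypt_2x+16" entry offsets presumably skip the matching
++# first-round instructions inside those helpers, so round work
++# overlaps with counter setup.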
++$::code.=<<___ if ($alg eq "aes");
++ aes_eround01 %f16, %f14, %f2, %f8
++ aes_eround23 %f18, %f14, %f2, %f2
++ aes_eround01 %f16, %f14, %f6, %f10
++ aes_eround23 %f18, %f14, %f6, %f6
++___
++$::code.=<<___ if ($alg eq "cmll");
++ camellia_f %f16, %f2, %f14, %f2
++ camellia_f %f16, %f6, %f14, %f6
++ camellia_f %f18, %f14, %f2, %f0
++ camellia_f %f18, %f14, %f6, %f4
++___
++$::code.=<<___;
++ call _${alg}${bits}_encrypt_2x+16
++ add $inp, 32, $inp
++
++ movxtod %o0, %f8
++ movxtod %o1, %f10
++ movxtod %o2, %f12
++ fxor %f8, %f0, %f0 ! ^= inp
++ movxtod %o3, %f8
++ fxor %f10, %f2, %f2
++ fxor %f12, %f4, %f4
++ fxor %f8, %f6, %f6
++
++ brnz,pn $ooff, 2f
++ sub $len, 2, $len
++
++ std %f0, [$out + 0]
++ std %f2, [$out + 8]
++ std %f4, [$out + 16]
++ std %f6, [$out + 24]
++ brnz,pt $len, .L${bits}_ctr32_loop2x
++ add $out, 32, $out
++
++ ret
++ restore
++
++.align 16
++2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
++ ! and ~3x deterioration
++ ! in inp==out case
++ faligndata %f0, %f0, %f8 ! handle unaligned output
++ faligndata %f0, %f2, %f0
++ faligndata %f2, %f4, %f2
++ faligndata %f4, %f6, %f4
++ faligndata %f6, %f6, %f6
++
++ stda %f8, [$out + $omask]0xc0 ! partial store
++ std %f0, [$out + 8]
++ std %f2, [$out + 16]
++ std %f4, [$out + 24]
++ add $out, 32, $out
++ orn %g0, $omask, $omask
++ stda %f6, [$out + $omask]0xc0 ! partial store
++
++ brnz,pt $len, .L${bits}_ctr32_loop2x+4
++ orn %g0, $omask, $omask
++
++ ret
++ restore
++
++!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
++.align 32
++.L${bits}_ctr32_blk:
++ add $out, $len, $blk_init
++ and $blk_init, 63, $blk_init ! tail
++ sub $len, $blk_init, $len
++ add $blk_init, 15, $blk_init ! round up to 16n
++ srlx $len, 4, $len
++ srl $blk_init, 4, $blk_init
++ sub $len, 1, $len
++ add $blk_init, 1, $blk_init
++
++.L${bits}_ctr32_blk_loop2x:
++ ldx [$inp + 0], %o0
++ ldx [$inp + 8], %o1
++ ldx [$inp + 16], %o2
++ brz,pt $ileft, 5f
++ ldx [$inp + 24], %o3
++
++ ldx [$inp + 32], %o4
++ sllx %o0, $ileft, %o0
++ srlx %o1, $iright, %g1
++ or %g1, %o0, %o0
++ sllx %o1, $ileft, %o1
++ srlx %o2, $iright, %g1
++ or %g1, %o1, %o1
++ sllx %o2, $ileft, %o2
++ srlx %o3, $iright, %g1
++ or %g1, %o2, %o2
++ sllx %o3, $ileft, %o3
++ srlx %o4, $iright, %o4
++ or %o4, %o3, %o3
++5:
++ xor %g5, %l7, %g1 ! ^= rk[0]
++ add %l7, 1, %l7
++ movxtod %g1, %f2
++ srl %l7, 0, %l7 ! clruw
++ xor %g5, %l7, %g1
++ add %l7, 1, %l7
++ movxtod %g1, %f6
++ srl %l7, 0, %l7 ! clruw
++ prefetch [$inp + 32+63], 20
++___
++$::code.=<<___ if ($alg eq "aes");
++ aes_eround01 %f16, %f14, %f2, %f8
++ aes_eround23 %f18, %f14, %f2, %f2
++ aes_eround01 %f16, %f14, %f6, %f10
++ aes_eround23 %f18, %f14, %f6, %f6
++___
++$::code.=<<___ if ($alg eq "cmll");
++ camellia_f %f16, %f2, %f14, %f2
++ camellia_f %f16, %f6, %f14, %f6
++ camellia_f %f18, %f14, %f2, %f0
++ camellia_f %f18, %f14, %f6, %f4
++___
++$::code.=<<___;
++ call _${alg}${bits}_encrypt_2x+16
++ add $inp, 32, $inp
++ subcc $len, 2, $len
++
++ movxtod %o0, %f8
++ movxtod %o1, %f10
++ movxtod %o2, %f12
++ fxor %f8, %f0, %f0 ! ^= inp
++ movxtod %o3, %f8
++ fxor %f10, %f2, %f2
++ fxor %f12, %f4, %f4
++ fxor %f8, %f6, %f6
++
++ stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
++ add $out, 8, $out
++ stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
++ add $out, 8, $out
++ stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
++ add $out, 8, $out
++ stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
++ bgu,pt $::size_t_cc, .L${bits}_ctr32_blk_loop2x
++ add $out, 8, $out
++
++ add $blk_init, $len, $len
++ andcc $len, 1, %g0 ! is number of blocks even?
++ membar #StoreLoad|#StoreStore
++ bnz,pt %icc, .L${bits}_ctr32_loop
++ srl $len, 0, $len
++ brnz,pn $len, .L${bits}_ctr32_loop2x
++ nop
++
++ ret
++ restore
++.type ${alg}${bits}_t4_ctr32_encrypt,#function
++.size ${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
++___
++}
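++
++# Editorial note: as the "clruw" comments above indicate, only the
++# low 32 bits of the counter block (%l7) are incremented, wrapping
++# mod 2^32; this matches the ctr32 convention implied by the
++# function name.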
++
++sub alg_xts_implement {
++my ($alg,$bits,$dir) = @_;
++my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
++my $rem=$ivec;
++
++$::code.=<<___;
++.globl ${alg}${bits}_t4_xts_${dir}crypt
++.align 32
++${alg}${bits}_t4_xts_${dir}crypt:
++ save %sp, -$::frame-16, %sp
++
++ mov $ivec, %o0
++ add %fp, $::bias-16, %o1
++ call ${alg}_t4_encrypt
++ mov $key2, %o2
++
++ add %fp, $::bias-16, %l7
++ ldxa [%l7]0x88, %g2
++ add %fp, $::bias-8, %l7
++ ldxa [%l7]0x88, %g3 ! %g3:%g2 is tweak
++
++ sethi %hi(0x76543210), %l7
++ or %l7, %lo(0x76543210), %l7
++ bmask %l7, %g0, %g0 ! byte swap mask
++
++ prefetch [$inp], 20
++ prefetch [$inp + 63], 20
++ call _${alg}${bits}_load_${dir}ckey
++ and $len, 15, $rem
++ and $len, -16, $len
++___
++$code.=<<___ if ($dir eq "de");
++ mov 0, %l7
++ movrnz $rem, 16, %l7
++ sub $len, %l7, $len
++___
++$code.=<<___;
++
++ sub $inp, $out, $blk_init ! $inp!=$out
++ and $inp, 7, $ileft
++ andn $inp, 7, $inp
++ sll $ileft, 3, $ileft
++ mov 64, $iright
++ mov 0xff, $omask
++ sub $iright, $ileft, $iright
++ and $out, 7, $ooff
++ cmp $len, 255
++ movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
++ movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
++ brnz,pn $blk_init, .L${bits}_xts_${dir}blk ! $inp==$out)
++ srl $omask, $ooff, $omask
++
++ andcc $len, 16, %g0 ! is number of blocks even?
++___
++$code.=<<___ if ($dir eq "de");
++ brz,pn $len, .L${bits}_xts_${dir}steal
++___
++$code.=<<___;
++ alignaddrl $out, %g0, $out
++ bz %icc, .L${bits}_xts_${dir}loop2x
++ srlx $len, 4, $len
++.L${bits}_xts_${dir}loop:
++ ldx [$inp + 0], %o0
++ brz,pt $ileft, 4f
++ ldx [$inp + 8], %o1
++
++ ldx [$inp + 16], %o2
++ sllx %o0, $ileft, %o0
++ srlx %o1, $iright, %g1
++ sllx %o1, $ileft, %o1
++ or %g1, %o0, %o0
++ srlx %o2, $iright, %o2
++ or %o2, %o1, %o1
++4:
++ movxtod %g2, %f12
++ movxtod %g3, %f14
++ bshuffle %f12, %f12, %f12
++ bshuffle %f14, %f14, %f14
++
++ xor %g4, %o0, %o0 ! ^= rk[0]
++ xor %g5, %o1, %o1
++ movxtod %o0, %f0
++ movxtod %o1, %f2
++
++ fxor %f12, %f0, %f0 ! ^= tweak[0]
++ fxor %f14, %f2, %f2
++
++ prefetch [$out + 63], 22
++ prefetch [$inp + 16+63], 20
++ call _${alg}${bits}_${dir}crypt_1x
++ add $inp, 16, $inp
++
++ fxor %f12, %f0, %f0 ! ^= tweak[0]
++ fxor %f14, %f2, %f2
++
++ srax %g3, 63, %l7 ! next tweak value
++ addcc %g2, %g2, %g2
++ and %l7, 0x87, %l7
++ addxc %g3, %g3, %g3
++ xor %l7, %g2, %g2
++
++ brnz,pn $ooff, 2f
++ sub $len, 1, $len
++
++ std %f0, [$out + 0]
++ std %f2, [$out + 8]
++ brnz,pt $len, .L${bits}_xts_${dir}loop2x
++ add $out, 16, $out
++
++ brnz,pn $rem, .L${bits}_xts_${dir}steal
++ nop
++
++ ret
++ restore
++
++.align 16
++2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
++ ! and ~3x deterioration
++ ! in inp==out case
++ faligndata %f0, %f0, %f4 ! handle unaligned output
++ faligndata %f0, %f2, %f6
++ faligndata %f2, %f2, %f8
++ stda %f4, [$out + $omask]0xc0 ! partial store
++ std %f6, [$out + 8]
++ add $out, 16, $out
++ orn %g0, $omask, $omask
++ stda %f8, [$out + $omask]0xc0 ! partial store
++
++ brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
++ orn %g0, $omask, $omask
++
++ brnz,pn $rem, .L${bits}_xts_${dir}steal
++ nop
++
++ ret
++ restore
++
++!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
++.align 32
++.L${bits}_xts_${dir}loop2x:
++ ldx [$inp + 0], %o0
++ ldx [$inp + 8], %o1
++ ldx [$inp + 16], %o2
++ brz,pt $ileft, 4f
++ ldx [$inp + 24], %o3
++
++ ldx [$inp + 32], %o4
++ sllx %o0, $ileft, %o0
++ srlx %o1, $iright, %g1
++ or %g1, %o0, %o0
++ sllx %o1, $ileft, %o1
++ srlx %o2, $iright, %g1
++ or %g1, %o1, %o1
++ sllx %o2, $ileft, %o2
++ srlx %o3, $iright, %g1
++ or %g1, %o2, %o2
++ sllx %o3, $ileft, %o3
++ srlx %o4, $iright, %o4
++ or %o4, %o3, %o3
++4:
++ movxtod %g2, %f12
++ movxtod %g3, %f14
++ bshuffle %f12, %f12, %f12
++ bshuffle %f14, %f14, %f14
++
++ srax %g3, 63, %l7 ! next tweak value
++ addcc %g2, %g2, %g2
++ and %l7, 0x87, %l7
++ addxc %g3, %g3, %g3
++ xor %l7, %g2, %g2
++
++ movxtod %g2, %f8
++ movxtod %g3, %f10
++ bshuffle %f8, %f8, %f8
++ bshuffle %f10, %f10, %f10
++
++ xor %g4, %o0, %o0 ! ^= rk[0]
++ xor %g5, %o1, %o1
++ xor %g4, %o2, %o2 ! ^= rk[0]
++ xor %g5, %o3, %o3
++ movxtod %o0, %f0
++ movxtod %o1, %f2
++ movxtod %o2, %f4
++ movxtod %o3, %f6
++
++ fxor %f12, %f0, %f0 ! ^= tweak[0]
++ fxor %f14, %f2, %f2
++ fxor %f8, %f4, %f4 ! ^= tweak[0]
++ fxor %f10, %f6, %f6
++
++ prefetch [$out + 63], 22
++ prefetch [$inp + 32+63], 20
++ call _${alg}${bits}_${dir}crypt_2x
++ add $inp, 32, $inp
++
++ movxtod %g2, %f8
++ movxtod %g3, %f10
++
++ srax %g3, 63, %l7 ! next tweak value
++ addcc %g2, %g2, %g2
++ and %l7, 0x87, %l7
++ addxc %g3, %g3, %g3
++ xor %l7, %g2, %g2
++
++ bshuffle %f8, %f8, %f8
++ bshuffle %f10, %f10, %f10
++
++ fxor %f12, %f0, %f0 ! ^= tweak[0]
++ fxor %f14, %f2, %f2
++ fxor %f8, %f4, %f4
++ fxor %f10, %f6, %f6
++
++ brnz,pn $ooff, 2f
++ sub $len, 2, $len
++
++ std %f0, [$out + 0]
++ std %f2, [$out + 8]
++ std %f4, [$out + 16]
++ std %f6, [$out + 24]
++ brnz,pt $len, .L${bits}_xts_${dir}loop2x
++ add $out, 32, $out
++
++ fsrc2 %f4, %f0
++ fsrc2 %f6, %f2
++ brnz,pn $rem, .L${bits}_xts_${dir}steal
++ nop
++
++ ret
++ restore
++
++.align 16
++2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
++ ! and ~3x deterioration
++ ! in inp==out case
++ faligndata %f0, %f0, %f8 ! handle unaligned output
++ faligndata %f0, %f2, %f10
++ faligndata %f2, %f4, %f12
++ faligndata %f4, %f6, %f14
++ faligndata %f6, %f6, %f0
++
++ stda %f8, [$out + $omask]0xc0 ! partial store
++ std %f10, [$out + 8]
++ std %f12, [$out + 16]
++ std %f14, [$out + 24]
++ add $out, 32, $out
++ orn %g0, $omask, $omask
++ stda %f0, [$out + $omask]0xc0 ! partial store
++
++ brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
++ orn %g0, $omask, $omask
++
++ fsrc2 %f4, %f0
++ fsrc2 %f6, %f2
++ brnz,pn $rem, .L${bits}_xts_${dir}steal
++ nop
++
++ ret
++ restore
++
++!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
++.align 32
++.L${bits}_xts_${dir}blk:
++ add $out, $len, $blk_init
++ and $blk_init, 63, $blk_init ! tail
++ sub $len, $blk_init, $len
++ add $blk_init, 15, $blk_init ! round up to 16n
++ srlx $len, 4, $len
++ srl $blk_init, 4, $blk_init
++ sub $len, 1, $len
++ add $blk_init, 1, $blk_init
++
++.L${bits}_xts_${dir}blk2x:
++ ldx [$inp + 0], %o0
++ ldx [$inp + 8], %o1
++ ldx [$inp + 16], %o2
++ brz,pt $ileft, 5f
++ ldx [$inp + 24], %o3
++
++ ldx [$inp + 32], %o4
++ sllx %o0, $ileft, %o0
++ srlx %o1, $iright, %g1
++ or %g1, %o0, %o0
++ sllx %o1, $ileft, %o1
++ srlx %o2, $iright, %g1
++ or %g1, %o1, %o1
++ sllx %o2, $ileft, %o2
++ srlx %o3, $iright, %g1
++ or %g1, %o2, %o2
++ sllx %o3, $ileft, %o3
++ srlx %o4, $iright, %o4
++ or %o4, %o3, %o3
++5:
++ movxtod %g2, %f12
++ movxtod %g3, %f14
++ bshuffle %f12, %f12, %f12
++ bshuffle %f14, %f14, %f14
++
++ srax %g3, 63, %l7 ! next tweak value
++ addcc %g2, %g2, %g2
++ and %l7, 0x87, %l7
++ addxc %g3, %g3, %g3
++ xor %l7, %g2, %g2
++
++ movxtod %g2, %f8
++ movxtod %g3, %f10
++ bshuffle %f8, %f8, %f8
++ bshuffle %f10, %f10, %f10
++
++ xor %g4, %o0, %o0 ! ^= rk[0]
++ xor %g5, %o1, %o1
++ xor %g4, %o2, %o2 ! ^= rk[0]
++ xor %g5, %o3, %o3
++ movxtod %o0, %f0
++ movxtod %o1, %f2
++ movxtod %o2, %f4
++ movxtod %o3, %f6
++
++ fxor %f12, %f0, %f0 ! ^= tweak[0]
++ fxor %f14, %f2, %f2
++ fxor %f8, %f4, %f4 ! ^= tweak[0]
++ fxor %f10, %f6, %f6
++
++ prefetch [$inp + 32+63], 20
++ call _${alg}${bits}_${dir}crypt_2x
++ add $inp, 32, $inp
++
++ movxtod %g2, %f8
++ movxtod %g3, %f10
++
++ srax %g3, 63, %l7 ! next tweak value
++ addcc %g2, %g2, %g2
++ and %l7, 0x87, %l7
++ addxc %g3, %g3, %g3
++ xor %l7, %g2, %g2
++
++ bshuffle %f8, %f8, %f8
++ bshuffle %f10, %f10, %f10
++
++ fxor %f12, %f0, %f0 ! ^= tweak[0]
++ fxor %f14, %f2, %f2
++ fxor %f8, %f4, %f4
++ fxor %f10, %f6, %f6
++
++ stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
++ add $out, 8, $out
++ stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
++ add $out, 8, $out
++ stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
++ add $out, 8, $out
++ stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
++ bgu,pt $::size_t_cc, .L${bits}_xts_${dir}blk2x
++ add $out, 8, $out
++
++ add $blk_init, $len, $len
++ andcc $len, 1, %g0 ! is number of blocks even?
++ membar #StoreLoad|#StoreStore
++ bnz,pt %icc, .L${bits}_xts_${dir}loop
++ srl $len, 0, $len
++ brnz,pn $len, .L${bits}_xts_${dir}loop2x
++ nop
++
++ fsrc2 %f4, %f0
++ fsrc2 %f6, %f2
++ brnz,pn $rem, .L${bits}_xts_${dir}steal
++ nop
++
++ ret
++ restore
++!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
++___
++$code.=<<___ if ($dir eq "en");
++.align 32
++.L${bits}_xts_${dir}steal:
++ std %f0, [%fp + $::bias-16] ! copy of output
++ std %f2, [%fp + $::bias-8]
++
++ srl $ileft, 3, $ileft
++ add %fp, $::bias-16, %l7
++ add $inp, $ileft, $inp ! original $inp+$len&-15
++ add $out, $ooff, $out ! original $out+$len&-15
++ mov 0, $ileft
++ nop ! align
++
++.L${bits}_xts_${dir}stealing:
++ ldub [$inp + $ileft], %o0
++ ldub [%l7 + $ileft], %o1
++ dec $rem
++ stb %o0, [%l7 + $ileft]
++ stb %o1, [$out + $ileft]
++ brnz $rem, .L${bits}_xts_${dir}stealing
++ inc $ileft
++
++ mov %l7, $inp
++ sub $out, 16, $out
++ mov 0, $ileft
++ sub $out, $ooff, $out
++ ba .L${bits}_xts_${dir}loop ! one more time
++ mov 1, $len ! $rem is 0
++___
++$code.=<<___ if ($dir eq "de");
++.align 32
++.L${bits}_xts_${dir}steal:
++ ldx [$inp + 0], %o0
++ brz,pt $ileft, 8f
++ ldx [$inp + 8], %o1
++
++ ldx [$inp + 16], %o2
++ sllx %o0, $ileft, %o0
++ srlx %o1, $iright, %g1
++ sllx %o1, $ileft, %o1
++ or %g1, %o0, %o0
++ srlx %o2, $iright, %o2
++ or %o2, %o1, %o1
++8:
++ srax %g3, 63, %l7 ! next tweak value
++ addcc %g2, %g2, %o2
++ and %l7, 0x87, %l7
++ addxc %g3, %g3, %o3
++ xor %l7, %o2, %o2
++
++ movxtod %o2, %f12
++ movxtod %o3, %f14
++ bshuffle %f12, %f12, %f12
++ bshuffle %f14, %f14, %f14
++
++ xor %g4, %o0, %o0 ! ^= rk[0]
++ xor %g5, %o1, %o1
++ movxtod %o0, %f0
++ movxtod %o1, %f2
++
++ fxor %f12, %f0, %f0 ! ^= tweak[0]
++ fxor %f14, %f2, %f2
++
++ call _${alg}${bits}_${dir}crypt_1x
++ add $inp, 16, $inp
++
++ fxor %f12, %f0, %f0 ! ^= tweak[0]
++ fxor %f14, %f2, %f2
++
++ std %f0, [%fp + $::bias-16]
++ std %f2, [%fp + $::bias-8]
++
++ srl $ileft, 3, $ileft
++ add %fp, $::bias-16, %l7
++ add $inp, $ileft, $inp ! original $inp+$len&-15
++ add $out, $ooff, $out ! original $out+$len&-15
++ mov 0, $ileft
++ add $out, 16, $out
++ nop ! align
++
++.L${bits}_xts_${dir}stealing:
++ ldub [$inp + $ileft], %o0
++ ldub [%l7 + $ileft], %o1
++ dec $rem
++ stb %o0, [%l7 + $ileft]
++ stb %o1, [$out + $ileft]
++ brnz $rem, .L${bits}_xts_${dir}stealing
++ inc $ileft
++
++ mov %l7, $inp
++ sub $out, 16, $out
++ mov 0, $ileft
++ sub $out, $ooff, $out
++ ba .L${bits}_xts_${dir}loop ! one more time
++ mov 1, $len ! $rem is 0
++___
++$code.=<<___;
++ ret
++ restore
++.type ${alg}${bits}_t4_xts_${dir}crypt,#function
++.size ${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
++___
++}
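++
++# Editorial note: the "next tweak value" sequences above implement
++# GF(2^128) doubling for XTS; 0x87 is the reduction constant of the
++# polynomial x^128 + x^7 + x^2 + x + 1. A hypothetical Perl model of
++# one step, with ($lo,$hi) standing for the %g2/%g3 halves and the
++# bmask/bshuffle byte-order fixup ignored (assumes a 64-bit Perl):
++#
++#	sub xts_tweak_double {
++#	    my ($lo, $hi) = @_;
++#	    my $fb = ($hi >> 63) ? 0x87 : 0;		   # srax + and
++#	    $hi = (($hi << 1) | ($lo >> 63)) & 0xffffffffffffffff;
++#	    $lo = (($lo << 1) & 0xffffffffffffffff) ^ $fb; # addcc + xor
++#	    return ($lo, $hi);
++#	}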
++
++# The purpose of these subroutines is to explicitly encode VIS
++# instructions, so that one can compile the module without having to
++# specify VIS extensions on the compiler command line, e.g. -xarch=v9
++# vs. -xarch=v9a. The idea is to preserve the option of producing a
++# "universal" binary and letting the programmer detect at run-time
++# whether the current CPU is VIS-capable.
++sub unvis {
++my ($mnemonic,$rs1,$rs2,$rd)=@_;
++my ($ref,$opf);
++my %visopf = ( "faligndata" => 0x048,
++ "bshuffle" => 0x04c,
++ "fnot2" => 0x066,
++ "fxor" => 0x06c,
++ "fsrc2" => 0x078 );
++
++ $ref = "$mnemonic\t$rs1,$rs2,$rd";
++
++ if ($opf=$visopf{$mnemonic}) {
++ foreach ($rs1,$rs2,$rd) {
++ return $ref if (!/%f([0-9]{1,2})/);
++ $_=$1;
++ if ($1>=32) {
++ return $ref if ($1&1);
++ # re-encode for upper double register addressing
++ $_=($1|$1>>5)&31;
++ }
++ }
++
++ return sprintf ".word\t0x%08x !%s",
++ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
++ $ref;
++ } else {
++ return $ref;
++ }
++}
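++
++# For example, by the encoding above "faligndata %f0,%f2,%f4"
++# (opf 0x048, rs1=0, rs2=2, rd=4) comes out as
++# ".word 0x89b00902 !faligndata %f0,%f2,%f4".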
++
++sub unvis3 {
++my ($mnemonic,$rs1,$rs2,$rd)=@_;
++my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
++my ($ref,$opf);
++my %visopf = ( "addxc" => 0x011,
++ "addxccc" => 0x013,
++ "umulxhi" => 0x016,
++ "alignaddr" => 0x018,
++ "bmask" => 0x019,
++ "alignaddrl" => 0x01a );
++
++ $ref = "$mnemonic\t$rs1,$rs2,$rd";
++
++ if ($opf=$visopf{$mnemonic}) {
++ foreach ($rs1,$rs2,$rd) {
++ return $ref if (!/%([goli])([0-9])/);
++ $_=$bias{$1}+$2;
++ }
++
++ return sprintf ".word\t0x%08x !%s",
++ 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
++ $ref;
++ } else {
++ return $ref;
++ }
++}
++
++sub unaes_round { # 4-argument instructions
++my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
++my ($ref,$opf);
++my %aesopf = ( "aes_eround01" => 0,
++ "aes_eround23" => 1,
++ "aes_dround01" => 2,
++ "aes_dround23" => 3,
++ "aes_eround01_l"=> 4,
++ "aes_eround23_l"=> 5,
++ "aes_dround01_l"=> 6,
++ "aes_dround23_l"=> 7,
++ "aes_kexpand1" => 8 );
++
++ $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
++
++ if (defined($opf=$aesopf{$mnemonic})) {
++ $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
++ foreach ($rs1,$rs2,$rd) {
++ return $ref if (!/%f([0-9]{1,2})/);
++ $_=$1;
++ if ($1>=32) {
++ return $ref if ($1&1);
++ # re-encode for upper double register addressing
++ $_=($1|$1>>5)&31;
++ }
++ }
++
++ return sprintf ".word\t0x%08x !%s",
++ 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
++ $ref;
++ } else {
++ return $ref;
++ }
++}
++
++sub unaes_kexpand { # 3-argument instructions
++my ($mnemonic,$rs1,$rs2,$rd)=@_;
++my ($ref,$opf);
++my %aesopf = ( "aes_kexpand0" => 0x130,
++ "aes_kexpand2" => 0x131 );
++
++ $ref = "$mnemonic\t$rs1,$rs2,$rd";
++
++ if (defined($opf=$aesopf{$mnemonic})) {
++ foreach ($rs1,$rs2,$rd) {
++ return $ref if (!/%f([0-9]{1,2})/);
++ $_=$1;
++ if ($1>=32) {
++ return $ref if ($1&1);
++ # re-encode for upper double register addressing
++ $_=($1|$1>>5)&31;
++ }
++ }
++
++ return sprintf ".word\t0x%08x !%s",
++ 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
++ $ref;
++ } else {
++ return $ref;
++ }
++}
++
++sub uncamellia_f { # 4-argument instructions
++my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
++my ($ref,$opf);
++
++ $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
++
++ if (1) {
++ $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
++ foreach ($rs1,$rs2,$rd) {
++ return $ref if (!/%f([0-9]{1,2})/);
++ $_=$1;
++ if ($1>=32) {
++ return $ref if ($1&1);
++ # re-encode for upper double register addressing
++ $_=($1|$1>>5)&31;
++ }
++ }
++
++ return sprintf ".word\t0x%08x !%s",
++ 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
++ $ref;
++ } else {
++ return $ref;
++ }
++}
++
++sub uncamellia3 { # 3-argument instructions
++my ($mnemonic,$rs1,$rs2,$rd)=@_;
++my ($ref,$opf);
++my %cmllopf = ( "camellia_fl" => 0x13c,
++ "camellia_fli" => 0x13d );
++
++ $ref = "$mnemonic\t$rs1,$rs2,$rd";
++
++ if (defined($opf=$cmllopf{$mnemonic})) {
++ foreach ($rs1,$rs2,$rd) {
++ return $ref if (!/%f([0-9]{1,2})/);
++ $_=$1;
++ if ($1>=32) {
++ return $ref if ($1&1);
++ # re-encode for upper double register addressing
++ $_=($1|$1>>5)&31;
++ }
++ }
++
++ return sprintf ".word\t0x%08x !%s",
++ 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
++ $ref;
++ } else {
++ return $ref;
++ }
++}
++
++sub unmovxtox { # 2-argument instructions
++my ($mnemonic,$rs,$rd)=@_;
++my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
++my ($ref,$opf);
++my %movxopf = ( "movdtox" => 0x110,
++ "movstouw" => 0x111,
++ "movstosw" => 0x113,
++ "movxtod" => 0x118,
++ "movwtos" => 0x119 );
++
++ $ref = "$mnemonic\t$rs,$rd";
++
++ if (defined($opf=$movxopf{$mnemonic})) {
++ foreach ($rs,$rd) {
++ return $ref if (!/%([fgoli])([0-9]{1,2})/);
++ $_=$bias{$1}+$2;
++ if ($2>=32) {
++ return $ref if ($2&1);
++ # re-encode for upper double register addressing
++ $_=($2|$2>>5)&31;
++ }
++ }
++
++ return sprintf ".word\t0x%08x !%s",
++ 2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
++ $ref;
++ } else {
++ return $ref;
++ }
++}
++
++sub undes {
++my ($mnemonic)=shift;
++my @args=@_;
++my ($ref,$opf);
++my %desopf = ( "des_round" => 0b1001,
++ "des_ip" => 0b100110100,
++ "des_iip" => 0b100110101,
++ "des_kexpand" => 0b100110110 );
++
++	$ref = "$mnemonic\t".join(",",@_);
++
++ if (defined($opf=$desopf{$mnemonic})) { # 4-arg
++ if ($mnemonic eq "des_round") {
++ foreach (@args[0..3]) {
++ return $ref if (!/%f([0-9]{1,2})/);
++ $_=$1;
++ if ($1>=32) {
++ return $ref if ($1&1);
++ # re-encode for upper double register addressing
++ $_=($1|$1>>5)&31;
++ }
++ }
++ return sprintf ".word\t0x%08x !%s",
++ 2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
++ $ref;
++ } elsif ($mnemonic eq "des_kexpand") { # 3-arg
++ foreach (@args[0..2]) {
++ return $ref if (!/(%f)?([0-9]{1,2})/);
++ $_=$2;
++ if ($2>=32) {
++ return $ref if ($2&1);
++ # re-encode for upper double register addressing
++ $_=($2|$2>>5)&31;
++ }
++ }
++ return sprintf ".word\t0x%08x !%s",
++ 2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
++ $ref;
++ } else { # 2-arg
++ foreach (@args[0..1]) {
++ return $ref if (!/%f([0-9]{1,2})/);
++ $_=$1;
++ if ($1>=32) {
++			    return $ref if ($1&1);
++ # re-encode for upper double register addressing
++ $_=($1|$1>>5)&31;
++ }
++ }
++ return sprintf ".word\t0x%08x !%s",
++ 2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
++ $ref;
++ }
++ } else {
++ return $ref;
++ }
++}
++
++sub emit_assembler {
++ foreach (split("\n",$::code)) {
++ s/\`([^\`]*)\`/eval $1/ge;
++
++ s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;
++
++ s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
++ &unaes_round($1,$2,$3,$4,$5)
++ /geo or
++ s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
++ &unaes_kexpand($1,$2,$3,$4)
++ /geo or
++ s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
++ &uncamellia_f($1,$2,$3,$4,$5)
++ /geo or
++ s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
++ &uncamellia3($1,$2,$3,$4)
++ /geo or
++ s/\b(des_\w+)\s+(?<rs1>%f[0-9]{1,2}),\s*(?<rs2>[%fx0-9]+)(,\s*(?<rs3>%f[0-9]{1,2})(,\s*(?<rs4>%f[0-9]{1,2}))?)?/
++ &undes($1,$+{rs1},$+{rs2},$+{rs3},$+{rs4})
++ /geo or
++ s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
++ &unmovxtox($1,$2,$3)
++ /geo or
++ s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
++ &unmovxtox($1,$2,$3)
++ /geo or
++ s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
++ &unvis($1,$2,$3,$4)
++ /geo or
++ s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
++ &unvis3($1,$2,$3,$4)
++ /geo;
++
++ print $_,"\n";
++ }
++}
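++
++# Editorial note: emit_assembler() is a post-pass over $::code that
++# rewrites the T4 crypto and VIS mnemonics via the un* helpers above
++# into raw ".word" encodings, so the output assembles even with
++# toolchains whose assemblers predate these instructions.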
++
++1;
+Index: crypto/bn/asm/vis3-mont.pl
+===================================================================
+diff -uNr openssl-1.0.1m/crypto/bn/asm/vis3-mont.pl openssl-1.0.1m/crypto/bn/asm/vis3-mont.pl
+--- openssl-1.0.1m/crypto/bn/asm/vis3-mont.pl 1970-01-01 01:00:00.000000000 +0100
++++ openssl-1.0.1m/crypto/bn/asm/vis3-mont.pl 2015-06-02 09:52:11.809148396 +0200
+@@ -0,0 +1,373 @@
++#!/usr/bin/env perl
++
++# ====================================================================
++# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
++# project. The module is, however, dual licensed under OpenSSL and
++# CRYPTOGAMS licenses depending on where you obtain it. For further
++# details see http://www.openssl.org/~appro/cryptogams/.
++# ====================================================================
++
++# October 2012.
++#
++# SPARCv9 VIS3 Montgomery multiplication procedure suitable for T3 and
++# onward. There are three new instructions used here: umulxhi,
++# addxc[cc] and the initializing store. On T3 RSA private key
++# operations are 1.54/1.87/2.11/2.26 times faster for
++# 512/1024/2048/4096-bit key lengths. This is without a dedicated
++# squaring procedure. On T4 the corresponding coefficients are
++# 1.47/2.10/2.80/2.90x, which is mostly for reference purposes,
++# because T4 has dedicated Montgomery multiplication and squaring
++# *instructions* that deliver even more.
++
++$bits=32;
++for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
++if ($bits==64) { $bias=2047; $frame=192; }
++else { $bias=0; $frame=112; }
++
++$code.=<<___ if ($bits==64);
++.register %g2,#scratch
++.register %g3,#scratch
++___
++$code.=<<___;
++.section ".text",#alloc,#execinstr
++___
++
++($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
++ (map("%g$_",(1..5)),map("%o$_",(0..5,7)));
++
++# int bn_mul_mont_vis3(
++$rp="%o0"; # BN_ULONG *rp,
++$ap="%o1"; # const BN_ULONG *ap,
++$bp="%o2"; # const BN_ULONG *bp,
++$np="%o3"; # const BN_ULONG *np,
++$n0p="%o4"; # const BN_ULONG *n0,
++$num="%o5"; # int num); # caller ensures that num is even
++ # and >=6
++$code.=<<___;
++.globl bn_mul_mont_vis3
++.align 32
++bn_mul_mont_vis3:
++ add %sp, $bias, %g4 ! real top of stack
++ sll $num, 2, $num ! size in bytes
++ add $num, 63, %g5
++ andn %g5, 63, %g5 ! buffer size rounded up to 64 bytes
++ add %g5, %g5, %g1
++ add %g5, %g1, %g1 ! 3*buffer size
++ sub %g4, %g1, %g1
++ andn %g1, 63, %g1 ! align at 64 byte
++ sub %g1, $frame, %g1 ! new top of stack
++ sub %g1, %g4, %g1
++
++ save %sp, %g1, %sp
++___
++
++# +-------------------------------+<----- %sp
++# . .
++# +-------------------------------+<----- aligned at 64 bytes
++# | __int64 tmp[0] |
++# +-------------------------------+
++# . .
++# . .
++# +-------------------------------+<----- aligned at 64 bytes
++# | __int64 ap[1..0] | converted ap[]
++# +-------------------------------+
++# | __int64 np[1..0] | converted np[]
++# +-------------------------------+
++# | __int64 ap[3..2] |
++# . .
++# . .
++# +-------------------------------+
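++#
++# Editorial sketch of the recurrence the loops below implement: with
++# R = 2^64 per converted word, each outer iteration computes
++# m1 = tp[0]*n0 mod R (after adding ap[0]*bp[i]) and accumulates
++# tp = (tp + ap[]*bp[i] + np[]*m1)/R; adding np[]*m1 forces the low
++# word to zero, so the division by R is an exact word shift and the
++# final value is the Montgomery-reduced ap*bp/R^(num/2) mod np.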
++($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
++($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$anp)=map("%l$_",(0..7));
++($ovf,$i)=($t0,$t1);
++$code.=<<___;
++ ld [$n0p+0], $t0 ! pull n0[0..1] value
++ add %sp, $bias+$frame, $tp
++ ld [$n0p+4], $t1
++ add $tp, %g5, $anp
++ ld [$bp+0], $t2 ! m0=bp[0]
++ sllx $t1, 32, $n0
++ ld [$bp+4], $t3
++ or $t0, $n0, $n0
++ add $bp, 8, $bp
++
++ ld [$ap+0], $t0 ! ap[0]
++ sllx $t3, 32, $m0
++ ld [$ap+4], $t1
++ or $t2, $m0, $m0
++
++ ld [$ap+8], $t2 ! ap[1]
++ sllx $t1, 32, $aj
++ ld [$ap+12], $t3
++ or $t0, $aj, $aj
++ add $ap, 16, $ap
++ stxa $aj, [$anp]0xe2 ! converted ap[0]
++
++ mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
++ umulxhi $aj, $m0, $hi0
++
++ ld [$np+0], $t0 ! np[0]
++ sllx $t3, 32, $aj
++ ld [$np+4], $t1
++ or $t2, $aj, $aj
++
++ ld [$np+8], $t2 ! np[1]
++ sllx $t1, 32, $nj
++ ld [$np+12], $t3
++ or $t0, $nj, $nj
++ add $np, 16, $np
++ stx $nj, [$anp+8] ! converted np[0]
++
++ mulx $lo0, $n0, $m1 ! "tp[0]"*n0
++ stx $aj, [$anp+16] ! converted ap[1]
++
++ mulx $aj, $m0, $alo ! ap[1]*bp[0]
++ umulxhi $aj, $m0, $aj ! ahi=aj
++
++ mulx $nj, $m1, $lo1 ! np[0]*m1
++ umulxhi $nj, $m1, $hi1
++
++ sllx $t3, 32, $nj
++ or $t2, $nj, $nj
++ stx $nj, [$anp+24] ! converted np[1]
++ add $anp, 32, $anp
++
++ addcc $lo0, $lo1, $lo1
++ addxc %g0, $hi1, $hi1
++
++ mulx $nj, $m1, $nlo ! np[1]*m1
++ umulxhi $nj, $m1, $nj ! nhi=nj
++
++ ba .L1st
++ sub $num, 24, $cnt ! cnt=num-3
++
++.align 16
++.L1st:
++ ld [$ap+0], $t0 ! ap[j]
++ addcc $alo, $hi0, $lo0
++ ld [$ap+4], $t1
++ addxc $aj, %g0, $hi0
++
++ sllx $t1, 32, $aj
++ add $ap, 8, $ap
++ or $t0, $aj, $aj
++ stxa $aj, [$anp]0xe2 ! converted ap[j]
++
++ ld [$np+0], $t2 ! np[j]
++ addcc $nlo, $hi1, $lo1
++ ld [$np+4], $t3
++ addxc $nj, %g0, $hi1 ! nhi=nj
++
++ sllx $t3, 32, $nj
++ add $np, 8, $np
++ mulx $aj, $m0, $alo ! ap[j]*bp[0]
++ or $t2, $nj, $nj
++ umulxhi $aj, $m0, $aj ! ahi=aj
++ stx $nj, [$anp+8] ! converted np[j]
++ add $anp, 16, $anp ! anp++
++
@@ Diff output truncated at 100000 characters. @@