...

Text file src/crypto/sha512/sha512block_ppc64le.s

     1	// Copyright 2016 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Based on CRYPTOGAMS code with the following comment:
     6	// # ====================================================================
     7	// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
     8	// # project. The module is, however, dual licensed under OpenSSL and
     9	// # CRYPTOGAMS licenses depending on where you obtain it. For further
    10	// # details see http://www.openssl.org/~appro/cryptogams/.
    11	// # ====================================================================
    12	
    13	#include "textflag.h"
    14	
    15	// SHA512 block routine. See sha512block.go for Go equivalent.
    16	//
    17	// The algorithm is detailed in FIPS 180-4:
    18	//
    19	//  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
    20	//
    21	// Wt = Mt; for 0 <= t <= 15
    22	// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
    23	//
    24	// a = H0
    25	// b = H1
    26	// c = H2
    27	// d = H3
    28	// e = H4
    29	// f = H5
    30	// g = H6
    31	// h = H7
    32	//
    33	// for t = 0 to 79 {
    34	//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
    35	//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
    36	//    h = g
    37	//    g = f
    38	//    f = e
    39	//    e = d + T1
    40	//    d = c
    41	//    c = b
    42	//    b = a
    43	//    a = T1 + T2
    44	// }
    45	//
    46	// H0 = a + H0
    47	// H1 = b + H1
    48	// H2 = c + H2
    49	// H3 = d + H3
    50	// H4 = e + H4
    51	// H5 = f + H5
    52	// H6 = g + H6
    53	// H7 = h + H7
    54	
      	// Scalar register roles used by ·block.
    55	#define CTX	R3	// dig pointer: base for loading and storing the hash state
    56	#define INP	R4	// input cursor; starts at p_base and advances 16 bytes per load
    57	#define END	R5	// INP + LEN, the bound tested at the bottom of the block loop
    58	#define TBL	R6	// address of ·kcon
    59	#define IDX	R7	// byte offset of the next ·kcon entry to load into KI
    60	#define CNT	R8	// zeroed on entry to ·block and not read afterwards
    61	#define LEN	R9	// p_len, rounded down by the shift pair at the top of ·block
    62	#define OFFLOAD	R11	// copy of R1; base address where V0-V7 are saved for each block
    63	#define TEMP	R12	// scratch used to move the inner loop count into CTR
    64	
      	// Constant byte offsets held in registers for the indexed vector loads and
      	// stores. HEX10-HEX70 are initialised by ·block. HEX00 is R0, which nothing
      	// in this file writes; NOTE(review): presumably relies on R0 always holding
      	// zero under the Go ppc64 register convention - confirm.
    65	#define HEX00	R0
    66	#define HEX10	R10
    67	#define HEX20	R25
    68	#define HEX30	R26
    69	#define HEX40	R27
    70	#define HEX50	R28
    71	#define HEX60	R29
    72	#define HEX70	R31
    73	
    74	// V0-V7 are A-H
    75	// V8-V23 are used for the message schedule
    76	#define KI	V24	// current round-constant vector loaded from ·kcon
    77	#define FUNC	V25	// holds Ch(e,f,g), then a^b, then Maj(a,b,c) within a round
    78	#define S0	V26	// VSHASIGMAD result for a (ST=1, SIX=0)
    79	#define S1	V27	// VSHASIGMAD result for e (ST=1, SIX=15)
    80	#define s0	V28	// VSHASIGMAD result for a schedule word (ST=0, SIX=0)
    81	#define s1	V29	// VSHASIGMAD result for a schedule word (ST=0, SIX=15)
    82	#define LEMASK	V31	// Permutation control register for little endian
    83	
    84	// 2 copies of each Kt, to fill both doublewords of a vector register
      	//
      	// Layout: 80 round constants at 16-byte stride (0x000-0x4FF), followed by
      	// two extra 16-byte entries that the round macros also load into KI because
      	// every round fetches the entry two positions ahead of its own index:
      	//   0x500  all zero, so the "add next K to g" step of the last round is a no-op
      	//   0x510  VPERM control left in KI after the last round; ·block uses it to
      	//          pack the state registers before storing them back to dig
      	// Total size is 0x520 = 1312 bytes.
    85	DATA  ·kcon+0x000(SB)/8, $0x428a2f98d728ae22
    86	DATA  ·kcon+0x008(SB)/8, $0x428a2f98d728ae22
    87	DATA  ·kcon+0x010(SB)/8, $0x7137449123ef65cd
    88	DATA  ·kcon+0x018(SB)/8, $0x7137449123ef65cd
    89	DATA  ·kcon+0x020(SB)/8, $0xb5c0fbcfec4d3b2f
    90	DATA  ·kcon+0x028(SB)/8, $0xb5c0fbcfec4d3b2f
    91	DATA  ·kcon+0x030(SB)/8, $0xe9b5dba58189dbbc
    92	DATA  ·kcon+0x038(SB)/8, $0xe9b5dba58189dbbc
    93	DATA  ·kcon+0x040(SB)/8, $0x3956c25bf348b538
    94	DATA  ·kcon+0x048(SB)/8, $0x3956c25bf348b538
    95	DATA  ·kcon+0x050(SB)/8, $0x59f111f1b605d019
    96	DATA  ·kcon+0x058(SB)/8, $0x59f111f1b605d019
    97	DATA  ·kcon+0x060(SB)/8, $0x923f82a4af194f9b
    98	DATA  ·kcon+0x068(SB)/8, $0x923f82a4af194f9b
    99	DATA  ·kcon+0x070(SB)/8, $0xab1c5ed5da6d8118
   100	DATA  ·kcon+0x078(SB)/8, $0xab1c5ed5da6d8118
   101	DATA  ·kcon+0x080(SB)/8, $0xd807aa98a3030242
   102	DATA  ·kcon+0x088(SB)/8, $0xd807aa98a3030242
   103	DATA  ·kcon+0x090(SB)/8, $0x12835b0145706fbe
   104	DATA  ·kcon+0x098(SB)/8, $0x12835b0145706fbe
   105	DATA  ·kcon+0x0A0(SB)/8, $0x243185be4ee4b28c
   106	DATA  ·kcon+0x0A8(SB)/8, $0x243185be4ee4b28c
   107	DATA  ·kcon+0x0B0(SB)/8, $0x550c7dc3d5ffb4e2
   108	DATA  ·kcon+0x0B8(SB)/8, $0x550c7dc3d5ffb4e2
   109	DATA  ·kcon+0x0C0(SB)/8, $0x72be5d74f27b896f
   110	DATA  ·kcon+0x0C8(SB)/8, $0x72be5d74f27b896f
   111	DATA  ·kcon+0x0D0(SB)/8, $0x80deb1fe3b1696b1
   112	DATA  ·kcon+0x0D8(SB)/8, $0x80deb1fe3b1696b1
   113	DATA  ·kcon+0x0E0(SB)/8, $0x9bdc06a725c71235
   114	DATA  ·kcon+0x0E8(SB)/8, $0x9bdc06a725c71235
   115	DATA  ·kcon+0x0F0(SB)/8, $0xc19bf174cf692694
   116	DATA  ·kcon+0x0F8(SB)/8, $0xc19bf174cf692694
   117	DATA  ·kcon+0x100(SB)/8, $0xe49b69c19ef14ad2
   118	DATA  ·kcon+0x108(SB)/8, $0xe49b69c19ef14ad2
   119	DATA  ·kcon+0x110(SB)/8, $0xefbe4786384f25e3
   120	DATA  ·kcon+0x118(SB)/8, $0xefbe4786384f25e3
   121	DATA  ·kcon+0x120(SB)/8, $0x0fc19dc68b8cd5b5
   122	DATA  ·kcon+0x128(SB)/8, $0x0fc19dc68b8cd5b5
   123	DATA  ·kcon+0x130(SB)/8, $0x240ca1cc77ac9c65
   124	DATA  ·kcon+0x138(SB)/8, $0x240ca1cc77ac9c65
   125	DATA  ·kcon+0x140(SB)/8, $0x2de92c6f592b0275
   126	DATA  ·kcon+0x148(SB)/8, $0x2de92c6f592b0275
   127	DATA  ·kcon+0x150(SB)/8, $0x4a7484aa6ea6e483
   128	DATA  ·kcon+0x158(SB)/8, $0x4a7484aa6ea6e483
   129	DATA  ·kcon+0x160(SB)/8, $0x5cb0a9dcbd41fbd4
   130	DATA  ·kcon+0x168(SB)/8, $0x5cb0a9dcbd41fbd4
   131	DATA  ·kcon+0x170(SB)/8, $0x76f988da831153b5
   132	DATA  ·kcon+0x178(SB)/8, $0x76f988da831153b5
   133	DATA  ·kcon+0x180(SB)/8, $0x983e5152ee66dfab
   134	DATA  ·kcon+0x188(SB)/8, $0x983e5152ee66dfab
   135	DATA  ·kcon+0x190(SB)/8, $0xa831c66d2db43210
   136	DATA  ·kcon+0x198(SB)/8, $0xa831c66d2db43210
   137	DATA  ·kcon+0x1A0(SB)/8, $0xb00327c898fb213f
   138	DATA  ·kcon+0x1A8(SB)/8, $0xb00327c898fb213f
   139	DATA  ·kcon+0x1B0(SB)/8, $0xbf597fc7beef0ee4
   140	DATA  ·kcon+0x1B8(SB)/8, $0xbf597fc7beef0ee4
   141	DATA  ·kcon+0x1C0(SB)/8, $0xc6e00bf33da88fc2
   142	DATA  ·kcon+0x1C8(SB)/8, $0xc6e00bf33da88fc2
   143	DATA  ·kcon+0x1D0(SB)/8, $0xd5a79147930aa725
   144	DATA  ·kcon+0x1D8(SB)/8, $0xd5a79147930aa725
   145	DATA  ·kcon+0x1E0(SB)/8, $0x06ca6351e003826f
   146	DATA  ·kcon+0x1E8(SB)/8, $0x06ca6351e003826f
   147	DATA  ·kcon+0x1F0(SB)/8, $0x142929670a0e6e70
   148	DATA  ·kcon+0x1F8(SB)/8, $0x142929670a0e6e70
   149	DATA  ·kcon+0x200(SB)/8, $0x27b70a8546d22ffc
   150	DATA  ·kcon+0x208(SB)/8, $0x27b70a8546d22ffc
   151	DATA  ·kcon+0x210(SB)/8, $0x2e1b21385c26c926
   152	DATA  ·kcon+0x218(SB)/8, $0x2e1b21385c26c926
   153	DATA  ·kcon+0x220(SB)/8, $0x4d2c6dfc5ac42aed
   154	DATA  ·kcon+0x228(SB)/8, $0x4d2c6dfc5ac42aed
   155	DATA  ·kcon+0x230(SB)/8, $0x53380d139d95b3df
   156	DATA  ·kcon+0x238(SB)/8, $0x53380d139d95b3df
   157	DATA  ·kcon+0x240(SB)/8, $0x650a73548baf63de
   158	DATA  ·kcon+0x248(SB)/8, $0x650a73548baf63de
   159	DATA  ·kcon+0x250(SB)/8, $0x766a0abb3c77b2a8
   160	DATA  ·kcon+0x258(SB)/8, $0x766a0abb3c77b2a8
   161	DATA  ·kcon+0x260(SB)/8, $0x81c2c92e47edaee6
   162	DATA  ·kcon+0x268(SB)/8, $0x81c2c92e47edaee6
   163	DATA  ·kcon+0x270(SB)/8, $0x92722c851482353b
   164	DATA  ·kcon+0x278(SB)/8, $0x92722c851482353b
   165	DATA  ·kcon+0x280(SB)/8, $0xa2bfe8a14cf10364
   166	DATA  ·kcon+0x288(SB)/8, $0xa2bfe8a14cf10364
   167	DATA  ·kcon+0x290(SB)/8, $0xa81a664bbc423001
   168	DATA  ·kcon+0x298(SB)/8, $0xa81a664bbc423001
   169	DATA  ·kcon+0x2A0(SB)/8, $0xc24b8b70d0f89791
   170	DATA  ·kcon+0x2A8(SB)/8, $0xc24b8b70d0f89791
   171	DATA  ·kcon+0x2B0(SB)/8, $0xc76c51a30654be30
   172	DATA  ·kcon+0x2B8(SB)/8, $0xc76c51a30654be30
   173	DATA  ·kcon+0x2C0(SB)/8, $0xd192e819d6ef5218
   174	DATA  ·kcon+0x2C8(SB)/8, $0xd192e819d6ef5218
   175	DATA  ·kcon+0x2D0(SB)/8, $0xd69906245565a910
   176	DATA  ·kcon+0x2D8(SB)/8, $0xd69906245565a910
   177	DATA  ·kcon+0x2E0(SB)/8, $0xf40e35855771202a
   178	DATA  ·kcon+0x2E8(SB)/8, $0xf40e35855771202a
   179	DATA  ·kcon+0x2F0(SB)/8, $0x106aa07032bbd1b8
   180	DATA  ·kcon+0x2F8(SB)/8, $0x106aa07032bbd1b8
   181	DATA  ·kcon+0x300(SB)/8, $0x19a4c116b8d2d0c8
   182	DATA  ·kcon+0x308(SB)/8, $0x19a4c116b8d2d0c8
   183	DATA  ·kcon+0x310(SB)/8, $0x1e376c085141ab53
   184	DATA  ·kcon+0x318(SB)/8, $0x1e376c085141ab53
   185	DATA  ·kcon+0x320(SB)/8, $0x2748774cdf8eeb99
   186	DATA  ·kcon+0x328(SB)/8, $0x2748774cdf8eeb99
   187	DATA  ·kcon+0x330(SB)/8, $0x34b0bcb5e19b48a8
   188	DATA  ·kcon+0x338(SB)/8, $0x34b0bcb5e19b48a8
   189	DATA  ·kcon+0x340(SB)/8, $0x391c0cb3c5c95a63
   190	DATA  ·kcon+0x348(SB)/8, $0x391c0cb3c5c95a63
   191	DATA  ·kcon+0x350(SB)/8, $0x4ed8aa4ae3418acb
   192	DATA  ·kcon+0x358(SB)/8, $0x4ed8aa4ae3418acb
   193	DATA  ·kcon+0x360(SB)/8, $0x5b9cca4f7763e373
   194	DATA  ·kcon+0x368(SB)/8, $0x5b9cca4f7763e373
   195	DATA  ·kcon+0x370(SB)/8, $0x682e6ff3d6b2b8a3
   196	DATA  ·kcon+0x378(SB)/8, $0x682e6ff3d6b2b8a3
   197	DATA  ·kcon+0x380(SB)/8, $0x748f82ee5defb2fc
   198	DATA  ·kcon+0x388(SB)/8, $0x748f82ee5defb2fc
   199	DATA  ·kcon+0x390(SB)/8, $0x78a5636f43172f60
   200	DATA  ·kcon+0x398(SB)/8, $0x78a5636f43172f60
   201	DATA  ·kcon+0x3A0(SB)/8, $0x84c87814a1f0ab72
   202	DATA  ·kcon+0x3A8(SB)/8, $0x84c87814a1f0ab72
   203	DATA  ·kcon+0x3B0(SB)/8, $0x8cc702081a6439ec
   204	DATA  ·kcon+0x3B8(SB)/8, $0x8cc702081a6439ec
   205	DATA  ·kcon+0x3C0(SB)/8, $0x90befffa23631e28
   206	DATA  ·kcon+0x3C8(SB)/8, $0x90befffa23631e28
   207	DATA  ·kcon+0x3D0(SB)/8, $0xa4506cebde82bde9
   208	DATA  ·kcon+0x3D8(SB)/8, $0xa4506cebde82bde9
   209	DATA  ·kcon+0x3E0(SB)/8, $0xbef9a3f7b2c67915
   210	DATA  ·kcon+0x3E8(SB)/8, $0xbef9a3f7b2c67915
   211	DATA  ·kcon+0x3F0(SB)/8, $0xc67178f2e372532b
   212	DATA  ·kcon+0x3F8(SB)/8, $0xc67178f2e372532b
   213	DATA  ·kcon+0x400(SB)/8, $0xca273eceea26619c
   214	DATA  ·kcon+0x408(SB)/8, $0xca273eceea26619c
   215	DATA  ·kcon+0x410(SB)/8, $0xd186b8c721c0c207
   216	DATA  ·kcon+0x418(SB)/8, $0xd186b8c721c0c207
   217	DATA  ·kcon+0x420(SB)/8, $0xeada7dd6cde0eb1e
   218	DATA  ·kcon+0x428(SB)/8, $0xeada7dd6cde0eb1e
   219	DATA  ·kcon+0x430(SB)/8, $0xf57d4f7fee6ed178
   220	DATA  ·kcon+0x438(SB)/8, $0xf57d4f7fee6ed178
   221	DATA  ·kcon+0x440(SB)/8, $0x06f067aa72176fba
   222	DATA  ·kcon+0x448(SB)/8, $0x06f067aa72176fba
   223	DATA  ·kcon+0x450(SB)/8, $0x0a637dc5a2c898a6
   224	DATA  ·kcon+0x458(SB)/8, $0x0a637dc5a2c898a6
   225	DATA  ·kcon+0x460(SB)/8, $0x113f9804bef90dae
   226	DATA  ·kcon+0x468(SB)/8, $0x113f9804bef90dae
   227	DATA  ·kcon+0x470(SB)/8, $0x1b710b35131c471b
   228	DATA  ·kcon+0x478(SB)/8, $0x1b710b35131c471b
   229	DATA  ·kcon+0x480(SB)/8, $0x28db77f523047d84
   230	DATA  ·kcon+0x488(SB)/8, $0x28db77f523047d84
   231	DATA  ·kcon+0x490(SB)/8, $0x32caab7b40c72493
   232	DATA  ·kcon+0x498(SB)/8, $0x32caab7b40c72493
   233	DATA  ·kcon+0x4A0(SB)/8, $0x3c9ebe0a15c9bebc
   234	DATA  ·kcon+0x4A8(SB)/8, $0x3c9ebe0a15c9bebc
   235	DATA  ·kcon+0x4B0(SB)/8, $0x431d67c49c100d4c
   236	DATA  ·kcon+0x4B8(SB)/8, $0x431d67c49c100d4c
   237	DATA  ·kcon+0x4C0(SB)/8, $0x4cc5d4becb3e42b6
   238	DATA  ·kcon+0x4C8(SB)/8, $0x4cc5d4becb3e42b6
   239	DATA  ·kcon+0x4D0(SB)/8, $0x597f299cfc657e2a
   240	DATA  ·kcon+0x4D8(SB)/8, $0x597f299cfc657e2a
   241	DATA  ·kcon+0x4E0(SB)/8, $0x5fcb6fab3ad6faec
   242	DATA  ·kcon+0x4E8(SB)/8, $0x5fcb6fab3ad6faec
   243	DATA  ·kcon+0x4F0(SB)/8, $0x6c44198c4a475817
   244	DATA  ·kcon+0x4F8(SB)/8, $0x6c44198c4a475817
   245	DATA  ·kcon+0x500(SB)/8, $0x0000000000000000	// zero pad, loaded as KI by round 78
   246	DATA  ·kcon+0x508(SB)/8, $0x0000000000000000
   247	DATA  ·kcon+0x510(SB)/8, $0x1011121314151617	// VPERM control, loaded as KI by round 79
   248	DATA  ·kcon+0x518(SB)/8, $0x0001020304050607
   249	GLOBL ·kcon(SB), RODATA, $1312
   250	
      	// SHA512ROUND0 performs one compression round without touching the message
      	// schedule. a-h are the registers currently holding the working variables
      	// and xi is the register holding Wt.
      	//
      	// Kt is not added here. h is expected to contain it already: ·block adds the
      	// first constant to h before round 0, and every round adds KI (the constant
      	// for the following round) to g, which the next invocation receives as h.
      	//
      	//   FUNC = (g & ~e) | (f & e)              Ch(e,f,g)
      	//   S1   = VSHASIGMAD(e, ST=1, SIX=15)     BIGSIGMA1(e)
      	//   S0   = VSHASIGMAD(a, ST=1, SIX=0)      BIGSIGMA0(a)
      	//   h   += xi + FUNC + S1                  T1 (Kt was pre-added)
      	//   FUNC = select(a^b ? c : b)             Maj(a,b,c)
      	//   g   += KI                              pre-add the next round's constant
      	//   d   += h                               e = d + T1
      	//   h   += S0 + FUNC                       a = T1 + T2
      	//
      	// No register moves are done: the new a is left in h and the new e in d, and
      	// the call sites rotate the argument list by one register per round.
      	// Side effects besides a-h: FUNC, S0 and S1 are overwritten, KI is reloaded
      	// from (TBL)(IDX) and IDX is advanced by 16.
   251	#define SHA512ROUND0(a, b, c, d, e, f, g, h, xi) \
   252		VSEL		g, f, e, FUNC; \
   253		VSHASIGMAD	$15, e, $1, S1; \
   254		VADDUDM		xi, h, h; \
   255		VSHASIGMAD	$0, a, $1, S0; \
   256		VADDUDM		FUNC, h, h; \
   257		VXOR		b, a, FUNC; \
   258		VADDUDM		S1, h, h; \
   259		VSEL		b, c, FUNC, FUNC; \
   260		VADDUDM		KI, g, g; \
   261		VADDUDM		h, d, d; \
   262		VADDUDM		FUNC, S0, S0; \
   263		LVX		(TBL)(IDX), KI; \
   264		ADD		$16, IDX; \
   265		VADDUDM		S0, h, h
   266	
      	// SHA512ROUND1 performs the same compression round as SHA512ROUND0 on a-h
      	// with xi as Wt, and interleaves one step of the message schedule that
      	// updates xj in place:
      	//
      	//   s0  = VSHASIGMAD(xj_1,  ST=0, SIX=0)
      	//   s1  = VSHASIGMAD(xj_14, ST=0, SIX=15)
      	//   xj += xj_9 + s0 + s1
      	//
      	// With xj holding W[t-16], xj_1 = W[t-15], xj_9 = W[t-7] and xj_14 = W[t-2],
      	// this is Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16, so the 16
      	// schedule registers act as a ring buffer. xi and xj are different registers
      	// at every call site: the round consumes xi while the word for a later round
      	// is produced in xj.
      	//
      	// Side effects besides a-h and xj: FUNC, S0, S1, s0 and s1 are overwritten,
      	// KI is reloaded from (TBL)(IDX) and IDX is advanced by 16.
   267	#define SHA512ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14) \
   268		VSHASIGMAD	$0, xj_1, $0, s0; \
   269		VSEL		g, f, e, FUNC; \
   270		VSHASIGMAD	$15, e, $1, S1; \
   271		VADDUDM		xi, h, h; \
   272		VSHASIGMAD	$0, a, $1, S0; \
   273		VSHASIGMAD	$15, xj_14, $0, s1; \
   274		VADDUDM		FUNC, h, h; \
   275		VXOR		b, a, FUNC; \
   276		VADDUDM		xj_9, xj, xj; \
   277		VADDUDM		S1, h, h; \
   278		VSEL		b, c, FUNC, FUNC; \
   279		VADDUDM		KI, g, g; \
   280		VADDUDM		h, d, d; \
   281		VADDUDM		FUNC, S0, S0; \
   282		VADDUDM		s0, xj, xj; \
   283		LVX		(TBL)(IDX), KI; \
   284		ADD		$16, IDX; \
   285		VADDUDM		S0, h, h; \
   286		VADDUDM		s1, xj, xj
   287	
   288	// func block(dig *digest, p []byte)
      	//
      	// Folds every whole 128-byte block of p into the eight 64-bit state words
      	// stored at dig (read and written as four 16-byte vectors at dig+0x00..0x30).
      	// A trailing partial block is ignored. Returns immediately when p holds no
      	// whole block.
      	//
      	// Frame: 128 bytes of locals, used to save V0-V7 while a block is processed.
      	// Registers written: R3-R12 (not R10's alias excluded), R25-R29, R31, CTR,
      	// CR0, V0-V29 and V31; see the #defines at the top of the file for roles.
   289	TEXT ·block(SB),0,$128-32
   290		MOVD	dig+0(FP), CTX
   291		MOVD	p_base+8(FP), INP
   292		MOVD	p_len+16(FP), LEN
   293	
      		// Round LEN down to a multiple of 128, the SHA-512 block size. Each pass
      		// of loop consumes eight 16-byte loads, so rounding to anything smaller
      		// would let the last pass read and hash bytes beyond END.
   294		SRD	$7, LEN
   295		SLD	$7, LEN
   296	
   297		ADD	INP, LEN, END
   298	
      		// Nothing to do when there is no whole block.
   299		CMP	INP, END
   300		BEQ	end
   301	
   302		MOVD	$·kcon(SB), TBL
      		// NOTE(review): OFFLOAD is the raw R1, so the STVX/LVX pairs below touch
      		// R1+0x00..R1+0x7F. Confirm this range is the 128-byte local area declared
      		// by the TEXT directive and does not overlap the frame header that the
      		// Go ppc64 ABI keeps at the bottom of the frame.
   303		MOVD	R1, OFFLOAD
   304	
   305		MOVD	R0, CNT
   306		MOVWZ	$0x10, HEX10
   307		MOVWZ	$0x20, HEX20
   308		MOVWZ	$0x30, HEX30
   309		MOVWZ	$0x40, HEX40
   310		MOVWZ	$0x50, HEX50
   311		MOVWZ	$0x60, HEX60
   312		MOVWZ	$0x70, HEX70
   313	
      		// Build LEMASK: LVSL on address 8 yields bytes 0x08..0x17, and XOR with
      		// 0x0F in every byte turns that into 0x07..0x00, 0x1F..0x18. It is used
      		// as the VPERM control applied to every message vector after loading.
   314		MOVWZ	$8, IDX
   315		LVSL	(IDX)(R0), LEMASK
   316		VSPLTISB	$0x0F, KI
   317		VXOR	KI, LEMASK, LEMASK
   318	
      		// Load the state two words per vector into the even registers, then use
      		// VSLDOI $8 to rotate each by one doubleword into the odd register so
      		// that V0-V7 each carry one working variable.
   319		LXVD2X	(CTX)(HEX00), VS32	// v0 = vs32
   320		LXVD2X	(CTX)(HEX10), VS34	// v2 = vs34
   321		LXVD2X	(CTX)(HEX20), VS36	// v4 = vs36
   322		// unpack the input values into vector registers
   323		VSLDOI	$8, V0, V0, V1
   324		LXVD2X	(CTX)(HEX30), VS38	// v6 = vs38
   325		VSLDOI	$8, V2, V2, V3
   326		VSLDOI	$8, V4, V4, V5
   327		VSLDOI	$8, V6, V6, V7
   328	
      	// One pass per 128-byte block.
   329	loop:
      		// Restart the constant table: KI = first entry, IDX = offset of the second.
   330		LVX	(TBL)(HEX00), KI
   331		MOVWZ	$16, IDX
   332	
   333		LXVD2X	(INP)(R0), VS40	// load v8 (=vs40) in advance
   334		ADD	$16, INP
   335	
      		// Save the incoming state; it is added back after the 80 rounds.
   336		STVX	V0, (OFFLOAD+HEX00)
   337		STVX	V1, (OFFLOAD+HEX10)
   338		STVX	V2, (OFFLOAD+HEX20)
   339		STVX	V3, (OFFLOAD+HEX30)
   340		STVX	V4, (OFFLOAD+HEX40)
   341		STVX	V5, (OFFLOAD+HEX50)
   342		STVX	V6, (OFFLOAD+HEX60)
   343		STVX	V7, (OFFLOAD+HEX70)
   344	
      		// The round macros expect h to already include the round constant and
      		// KI to hold the constant for the following round.
   345		VADDUDM	KI, V7, V7	// h+K[i]
   346		LVX	(TBL)(IDX), KI
   347		ADD	$16, IDX
   348	
      		// Rounds 0-15. Each 16-byte load supplies two message words: VPERM with
      		// LEMASK fixes the byte order for the even word, VSLDOI $8 derives the
      		// odd one. The a-h argument list rotates by one register per round.
   349		VPERM	V8, V8, LEMASK, V8
   350		SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8)
   351		LXVD2X	(INP)(R0), VS42	// load v10 (=vs42) in advance
   352		ADD	$16, INP, INP
   353		VSLDOI	$8, V8, V8, V9
   354		SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9)
   355		VPERM	V10, V10, LEMASK, V10
   356		SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10)
   357		LXVD2X	(INP)(R0), VS44	// load v12 (=vs44) in advance
   358		ADD	$16, INP, INP
   359		VSLDOI	$8, V10, V10, V11
   360		SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11)
   361		VPERM	V12, V12, LEMASK, V12
   362		SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12)
   363		LXVD2X	(INP)(R0), VS46	// load v14 (=vs46) in advance
   364		ADD	$16, INP, INP
   365		VSLDOI	$8, V12, V12, V13
   366		SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13)
   367		VPERM	V14, V14, LEMASK, V14
   368		SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14)
   369		LXVD2X	(INP)(R0), VS48	// load v16 (=vs48) in advance
   370		ADD	$16, INP, INP
   371		VSLDOI	$8, V14, V14, V15
   372		SHA512ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15)
   373		VPERM	V16, V16, LEMASK, V16
   374		SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16)
   375		LXVD2X	(INP)(R0), VS50	// load v18 (=vs50) in advance
   376		ADD	$16, INP, INP
   377		VSLDOI	$8, V16, V16, V17
   378		SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17)
   379		VPERM	V18, V18, LEMASK, V18
   380		SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18)
   381		LXVD2X	(INP)(R0), VS52	// load v20 (=vs52) in advance
   382		ADD	$16, INP, INP
   383		VSLDOI	$8, V18, V18, V19
   384		SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19)
   385		VPERM	V20, V20, LEMASK, V20
   386		SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20)
   387		LXVD2X	(INP)(R0), VS54	// load v22 (=vs54) in advance
   388		ADD	$16, INP, INP
   389		VSLDOI	$8, V20, V20, V21
   390		SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21)
   391		VPERM	V22, V22, LEMASK, V22
   392		SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22)
   393		VSLDOI	$8, V22, V22, V23
      		// Round 15 also starts the schedule: V8 is advanced to the word for round 16.
   394		SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
   395	
      		// Rounds 16-79: four passes over the 16-round body, counted in CTR.
   396		MOVWZ	$4, TEMP
   397		MOVWZ	TEMP, CTR
   398	
   399	L16_xx:
   400		SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23)
   401		SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8)
   402		SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9)
   403		SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10)
   404		SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11)
   405		SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12)
   406		SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13)
   407		SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14)
   408		SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15)
   409		SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16)
   410		SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17)
   411		SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18)
   412		SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19)
   413		SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20)
   414		SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21)
   415		SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
   416	
   417		BC	0x10, 0, L16_xx		// bdnz
   418	
      		// Add the saved input state back in (H[i] += working variable). The
      		// schedule registers V10-V17 are free to reuse as temporaries here.
   419		LVX	(OFFLOAD)(HEX00), V10
   420	
   421		LVX	(OFFLOAD)(HEX10), V11
   422		VADDUDM	V10, V0, V0
   423		LVX	(OFFLOAD)(HEX20), V12
   424		VADDUDM	V11, V1, V1
   425		LVX	(OFFLOAD)(HEX30), V13
   426		VADDUDM	V12, V2, V2
   427		LVX	(OFFLOAD)(HEX40), V14
   428		VADDUDM	V13, V3, V3
   429		LVX	(OFFLOAD)(HEX50), V15
   430		VADDUDM	V14, V4, V4
   431		LVX	(OFFLOAD)(HEX60), V16
   432		VADDUDM	V15, V5, V5
   433		LVX	(OFFLOAD)(HEX70), V17
   434		VADDUDM	V16, V6, V6
   435		VADDUDM	V17, V7, V7
   436	
   437		CMPU	INP, END
   438		BLT	loop
   439	
      		// After the last round KI holds the ·kcon entry at offset 0x510, a VPERM
      		// control that takes eight bytes from each source. Use it to pack each
      		// register pair back into one vector and store the state to dig.
   440		VPERM	V0, V1, KI, V0
   441		VPERM	V2, V3, KI, V2
   442		VPERM	V4, V5, KI, V4
   443		VPERM	V6, V7, KI, V6
   444		STXVD2X	VS32, (CTX+HEX00)	// v0 = vs32
   445		STXVD2X	VS34, (CTX+HEX10)	// v2 = vs34
   446		STXVD2X	VS36, (CTX+HEX20)	// v4 = vs36
   447		STXVD2X	VS38, (CTX+HEX30)	// v6 = vs38
   448	
   449	end:
   450		RET
   451	

View as plain text