...

Text file src/vendor/golang.org/x/crypto/poly1305/sum_s390x.s

     1	// Copyright 2018 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// +build s390x,go1.11,!gccgo,!appengine
     6	
     7	#include "textflag.h"
     8	
     9	// Implementation of Poly1305 using the vector facility (vx).
    10	
    11	// constants
    12	#define MOD26 V0
    13	#define EX0   V1
    14	#define EX1   V2
    15	#define EX2   V3
    16	
    17	// temporaries
    18	#define T_0 V4
    19	#define T_1 V5
    20	#define T_2 V6
    21	#define T_3 V7
    22	#define T_4 V8
    23	
    24	// key (r)
    25	#define R_0  V9
    26	#define R_1  V10
    27	#define R_2  V11
    28	#define R_3  V12
    29	#define R_4  V13
    30	#define R5_1 V14
    31	#define R5_2 V15
    32	#define R5_3 V16
    33	#define R5_4 V17
    34	#define RSAVE_0 R5
    35	#define RSAVE_1 R6
    36	#define RSAVE_2 R7
    37	#define RSAVE_3 R8
    38	#define RSAVE_4 R9
    39	#define R5SAVE_1 V28
    40	#define R5SAVE_2 V29
    41	#define R5SAVE_3 V30
    42	#define R5SAVE_4 V31
    43	
    44	// message block
    45	#define F_0 V18
    46	#define F_1 V19
    47	#define F_2 V20
    48	#define F_3 V21
    49	#define F_4 V22
    50	
    51	// accumulator
    52	#define H_0 V23
    53	#define H_1 V24
    54	#define H_2 V25
    55	#define H_3 V26
    56	#define H_4 V27
    57	
    58	GLOBL ·keyMask<>(SB), RODATA, $16 // Poly1305 clamp mask for r; ANDed with key[0:16] as loaded from memory by poly1305vx
    59	DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
    60	DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f
    61	
    62	GLOBL ·bswapMask<>(SB), RODATA, $16 // VPERM selector (indices 15 down to 0) that reverses the 16 bytes of a vector
    63	DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
    64	DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100
    65	
    66	GLOBL ·constants<>(SB), RODATA, $64 // four 16-byte vectors, loaded together into MOD26, EX0, EX1, EX2 by one VLM
    67	// MOD26: mask selecting the low 26 bits of each doubleword
    68	DATA ·constants<>+0(SB)/8, $0x3ffffff
    69	DATA ·constants<>+8(SB)/8, $0x3ffffff
    70	// EX0: VPERM selector placing input bytes 0-6 (byte-reversed) in each doubleword; source of limbs 0 and 1
    71	DATA ·constants<>+16(SB)/8, $0x0006050403020100
    72	DATA ·constants<>+24(SB)/8, $0x1016151413121110
    73	// EX1: VPERM selector placing input bytes 6-12 (byte-reversed) in each doubleword; source of limbs 2 and 3
    74	DATA ·constants<>+32(SB)/8, $0x060c0b0a09080706
    75	DATA ·constants<>+40(SB)/8, $0x161c1b1a19181716
    76	// EX2: VPERM selector placing input bytes 13-15 (byte-reversed) in the low 3 bytes of each doubleword; source of limb 4
    77	DATA ·constants<>+48(SB)/8, $0x0d0d0d0d0d0f0e0d
    78	DATA ·constants<>+56(SB)/8, $0x1d1d1d1d1d1f1e1d
    79	
    80	// h = (f*g) % (2**130-5) [partial reduction]; g51-g54 must hold 5*g1-5*g4; clobbers T_0-T_4; h must not alias f or g (h0 is written before f and g are fully read); no carries are propagated here
    81	#define MULTIPLY(f0, f1, f2, f3, f4, g0, g1, g2, g3, g4, g51, g52, g53, g54, h0, h1, h2, h3, h4) \
    82		VMLOF  f0, g0, h0        \ // h0..h4  = f0 * (g0, g1, g2, g3, g4)
    83		VMLOF  f0, g1, h1        \
    84		VMLOF  f0, g2, h2        \
    85		VMLOF  f0, g3, h3        \
    86		VMLOF  f0, g4, h4        \
    87		VMLOF  f1, g54, T_0      \ // T_0..T_4  = f1 * (5*g4, g0, g1, g2, g3)
    88		VMLOF  f1, g0, T_1       \
    89		VMLOF  f1, g1, T_2       \
    90		VMLOF  f1, g2, T_3       \
    91		VMLOF  f1, g3, T_4       \
    92		VMALOF f2, g53, h0, h0   \ // h0..h4 += f2 * (5*g3, 5*g4, g0, g1, g2)
    93		VMALOF f2, g54, h1, h1   \
    94		VMALOF f2, g0, h2, h2    \
    95		VMALOF f2, g1, h3, h3    \
    96		VMALOF f2, g2, h4, h4    \
    97		VMALOF f3, g52, T_0, T_0 \ // T_0..T_4 += f3 * (5*g2, 5*g3, 5*g4, g0, g1)
    98		VMALOF f3, g53, T_1, T_1 \
    99		VMALOF f3, g54, T_2, T_2 \
   100		VMALOF f3, g0, T_3, T_3  \
   101		VMALOF f3, g1, T_4, T_4  \
   102		VMALOF f4, g51, h0, h0   \ // h0..h4 += f4 * (5*g1, 5*g2, 5*g3, 5*g4, g0)
   103		VMALOF f4, g52, h1, h1   \
   104		VMALOF f4, g53, h2, h2   \
   105		VMALOF f4, g54, h3, h3   \
   106		VMALOF f4, g0, h4, h4    \
   107		VAG    T_0, h0, h0       \ // h0..h4 += T_0..T_4 (merge the two partial sums)
   108		VAG    T_1, h1, h1       \
   109		VAG    T_2, h2, h2       \
   110		VAG    T_3, h3, h3       \
   111		VAG    T_4, h4, h4
   112	
   113	// carry h0->h1 h3->h4, h1->h2 h4->h0, h0->h1 h2->h3, h3->h4; clobbers T_0-T_4 and needs MOD26 loaded; h1 and h4 receive a final carry without being masked again
   114	#define REDUCE(h0, h1, h2, h3, h4) \
   115		VESRLG $26, h0, T_0  \ // T_0 = carry out of h0, T_1 = carry out of h3
   116		VESRLG $26, h3, T_1  \
   117		VN     MOD26, h0, h0 \
   118		VN     MOD26, h3, h3 \
   119		VAG    T_0, h1, h1   \
   120		VAG    T_1, h4, h4   \
   121		VESRLG $26, h1, T_2  \ // T_2 = carry out of h1, T_3 = carry out of h4
   122		VESRLG $26, h4, T_3  \
   123		VN     MOD26, h1, h1 \
   124		VN     MOD26, h4, h4 \
   125		VESLG  $2, T_3, T_4  \ // T_4 = 4 * (carry out of h4)
   126		VAG    T_3, T_4, T_4 \ // T_4 = 5 * (carry out of h4): 2**130 is congruent to 5 mod 2**130-5
   127		VAG    T_2, h2, h2   \
   128		VAG    T_4, h0, h0   \ // the h4 carry wraps around into h0
   129		VESRLG $26, h2, T_0  \ // T_0 = carry out of h2, T_1 = carry out of h0
   130		VESRLG $26, h0, T_1  \
   131		VN     MOD26, h2, h2 \
   132		VN     MOD26, h0, h0 \
   133		VAG    T_0, h3, h3   \
   134		VAG    T_1, h1, h1   \ // h1 is not masked after this add
   135		VESRLG $26, h3, T_2  \
   136		VN     MOD26, h3, h3 \
   137		VAG    T_2, h4, h4 // h4 is not masked after this add
   138	
   139	// expand in0 into d[0] and in1 into d[1]: each 16-byte input is split into limbs d0-d3 (26 bits each) and d4 (top 24 bits), one doubleword per limb; needs MOD26 and EX0-EX2 loaded
   140	#define EXPAND(in0, in1, d0, d1, d2, d3, d4) \
   141		VGBM   $0x0707, d1       \ // d1=tmp (byte mask keeping the low 3 bytes of each doubleword)
   142		VPERM  in0, in1, EX2, d4 \ // d4 = input bytes 13-15 (upper bytes are cleared by the mask below)
   143		VPERM  in0, in1, EX0, d0 \ // d0 = input bytes 0-6
   144		VPERM  in0, in1, EX1, d2 \ // d2 = input bytes 6-12
   145		VN     d1, d4, d4        \ // d4 = input bits 104-127
   146		VESRLG $26, d0, d1       \ // d1 = input bits 26 and up (masked below)
   147		VESRLG $30, d2, d3       \ // d3 = input bits 78 and up (d2 starts at bit 48)
   148		VESRLG $4, d2, d2        \ // d2 = input bits 52 and up
   149		VN     MOD26, d0, d0     \ // keep 26 bits per limb
   150		VN     MOD26, d1, d1     \
   151		VN     MOD26, d2, d2     \
   152		VN     MOD26, d3, d3
   153	
   154	// pack h4:h0 into h1:h0 (no carry): on exit h0 = bits 0-127 and h1 = bits 128 and up; h2 and h3 are clobbered, h4 is preserved
   155	#define PACK(h0, h1, h2, h3, h4) \
   156		VESLG $26, h1, h1  \
   157		VESLG $26, h3, h3  \
   158		VO    h0, h1, h0   \ // h0 = limb0 | limb1<<26
   159		VO    h2, h3, h2   \ // h2 = limb2 | limb3<<26
   160		VESLG $4, h2, h2   \ // 4 bits here + 48 bits below = 52 = 2*26
   161		VLEIB $7, $48, h1  \ // h1 is reused as the byte-shift count register (48 bits)
   162		VSLB  h1, h2, h2   \
   163		VO    h0, h2, h0   \ // h0 = limbs 0-3
   164		VLEIB $7, $104, h1 \ // shift count of 104 bits = 4*26
   165		VSLB  h1, h4, h3   \ // h3 = limb4<<104 (bits that fall below 2**128)
   166		VO    h3, h0, h0   \
   167		VLEIB $7, $24, h1  \ // shift count of 24 bits: 104+24 = 128
   168		VSRLB h1, h4, h1 // h1 = the part of limb4 at or above 2**128
   169	
   170	// if h >= 2**130-5 then h -= 2**130-5; h1:h0 is the packed value (see PACK), the low 128 bits of the result are left in h0; t0-t2 are scratch
   171	#define MOD(h0, h1, t0, t1, t2) \
   172		VZERO t0          \
   173		VLEIG $1, $5, t0  \ // t0 = 5
   174		VACCQ h0, t0, t1  \ // t1 = carry out of h0+5
   175		VAQ   h0, t0, t0  \ // t0 = low 128 bits of h+5
   176		VONE  t2          \
   177		VLEIG $1, $-4, t2 \ // t2 = -4 as a 128-bit value
   178		VAQ   t2, t1, t1  \ // t1 = carry - 4
   179		VACCQ h1, t1, t1  \ // t1 = 1 if h1 + carry >= 4, i.e. h+5 >= 2**130, else 0
   180		VONE  t2          \
   181		VAQ   t2, t1, t1  \ // t1 = 0 if reducing, all ones otherwise
   182		VN    h0, t1, t2  \ // t2 = h0 when not reducing
   183		VNC   t0, t1, t1  \ // t1 = h0+5 when reducing
   184		VO    t1, t2, h0 // select without branching
   185	
   186	// func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
   187	TEXT ·poly1305vx(SB), $0-32
   188		// Computes the Poly1305 tag of m[0:mlen] under key and stores it at out. This code processes up to 2 blocks (32 bytes) per iteration
   189		// using the algorithm described in:
   190		// NEON crypto, Daniel J. Bernstein & Peter Schwabe
   191		// https://cryptojedi.org/papers/neoncrypto-20120320.pdf
   192		LMG out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key
   193	
   194		// load MOD26, EX0, EX1 and EX2
   195		MOVD $·constants<>(SB), R5 // R5 is only a temporary here; it is overwritten as RSAVE_0 below
   196		VLM  (R5), MOD26, EX2
   197	
   198		// setup r: clamp key[0:16] with keyMask and expand it into both doublewords of R_0-R_4
   199		VL   (R4), T_0
   200		MOVD $·keyMask<>(SB), R6 // R6 is only a temporary here; it is overwritten as RSAVE_1 below
   201		VL   (R6), T_1
   202		VN   T_0, T_1, T_0
   203		EXPAND(T_0, T_0, R_0, R_1, R_2, R_3, R_4)
   204	
   205		// setup r*5 (T_0 = [5, 5])
   206		VLEIG $0, $5, T_0
   207		VLEIG $1, $5, T_0
   208	
   209		// store 5*r in R5SAVE_1-4 and r in RSAVE_0-4 (for final block)
   210		VMLOF T_0, R_1, R5SAVE_1
   211		VMLOF T_0, R_2, R5SAVE_2
   212		VMLOF T_0, R_3, R5SAVE_3
   213		VMLOF T_0, R_4, R5SAVE_4
   214		VLGVG $0, R_0, RSAVE_0
   215		VLGVG $0, R_1, RSAVE_1
   216		VLGVG $0, R_2, RSAVE_2
   217		VLGVG $0, R_3, RSAVE_3
   218		VLGVG $0, R_4, RSAVE_4
   219	
   220		// skip r**2 calculation when there is at most one block of input
   221		CMPBLE R3, $16, skip
   222	
   223		// calculate r**2 (into R_0-R_4) and 5*r**2 (into R5_1-R5_4), in both doublewords
   224		MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5SAVE_1, R5SAVE_2, R5SAVE_3, R5SAVE_4, H_0, H_1, H_2, H_3, H_4)
   225		REDUCE(H_0, H_1, H_2, H_3, H_4)
   226		VLEIG $0, $5, T_0 // T_0 was clobbered by MULTIPLY/REDUCE; rebuild [5, 5]
   227		VLEIG $1, $5, T_0
   228		VMLOF T_0, H_1, R5_1
   229		VMLOF T_0, H_2, R5_2
   230		VMLOF T_0, H_3, R5_3
   231		VMLOF T_0, H_4, R5_4
   232		VLR   H_0, R_0
   233		VLR   H_1, R_1
   234		VLR   H_2, R_2
   235		VLR   H_3, R_3
   236		VLR   H_4, R_4
   237	
   238		// initialize h
   239		VZERO H_0
   240		VZERO H_1
   241		VZERO H_2
   242		VZERO H_3
   243		VZERO H_4
   244	
   245	loop:
   246		CMPBLE R3, $32, b2 // 32 bytes or fewer remain: handle them on the tail paths
   247		VLM    (R2), T_0, T_1 // load two full 16-byte blocks
   248		SUB    $32, R3
   249		MOVD   $32(R2), R2
   250		EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
   251		VLEIB  $4, $1, F_4 // set the 2**128 bit of the block in d[0] (limb 4 starts at bit 104)
   252		VLEIB  $12, $1, F_4 // set the 2**128 bit of the block in d[1]
   253	
   254	multiply:
   255		VAG    H_0, F_0, F_0 // f = h + message limbs
   256		VAG    H_1, F_1, F_1
   257		VAG    H_2, F_2, F_2
   258		VAG    H_3, F_3, F_3
   259		VAG    H_4, F_4, F_4
   260		MULTIPLY(F_0, F_1, F_2, F_3, F_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4)
   261		REDUCE(H_0, H_1, H_2, H_3, H_4)
   262		CMPBNE R3, $0, loop // the tail paths (b2, b1) set R3 to 0 before branching to multiply
   263	
   264	finish:
   265		// sum vectors (combine the two doublewords of each limb into one value)
   266		VZERO  T_0
   267		VSUMQG H_0, T_0, H_0
   268		VSUMQG H_1, T_0, H_1
   269		VSUMQG H_2, T_0, H_2
   270		VSUMQG H_3, T_0, H_3
   271		VSUMQG H_4, T_0, H_4
   272	
   273		// h may be >= 2*(2**130-5) so we need to reduce it again
   274		REDUCE(H_0, H_1, H_2, H_3, H_4)
   275	
   276		// carry h1->h4 (REDUCE leaves h1 unmasked after its last carry)
   277		VESRLG $26, H_1, T_1
   278		VN     MOD26, H_1, H_1
   279		VAQ    T_1, H_2, H_2
   280		VESRLG $26, H_2, T_2
   281		VN     MOD26, H_2, H_2
   282		VAQ    T_2, H_3, H_3
   283		VESRLG $26, H_3, T_3
   284		VN     MOD26, H_3, H_3
   285		VAQ    T_3, H_4, H_4
   286	
   287		// h is now < 2*(2**130-5)
   288		// pack h into h1 (hi) and h0 (lo)
   289		PACK(H_0, H_1, H_2, H_3, H_4)
   290	
   291		// if h >= 2**130-5 then h -= 2**130-5
   292		MOD(H_0, H_1, T_0, T_1, T_2)
   293	
   294		// h += s (s = key[16:32]; the add is done modulo 2**128)
   295		MOVD  $·bswapMask<>(SB), R5 // RSAVE_0 (R5) is no longer needed at this point
   296		VL    (R5), T_1
   297		VL    16(R4), T_0
   298		VPERM T_0, T_0, T_1, T_0    // reverse bytes (to big)
   299		VAQ   T_0, H_0, H_0
   300		VPERM H_0, H_0, T_1, H_0    // reverse bytes (to little)
   301		VST   H_0, (R1) // write the 16-byte tag to out
   302	
   303		RET
   304	
   305	b2:
   306		CMPBLE R3, $16, b1
   307	
   308		// 2 blocks remaining (17 to 32 bytes): one full block plus a full or partial block
   309		SUB    $17, R3 // R3 = index of the last byte of the second block
   310		VL     (R2), T_0
   311		VLL    R3, 16(R2), T_1 // load only bytes 0..R3 of the second block
   312		ADD    $1, R3 // R3 = length of the second block
   313		MOVBZ  $1, R0
   314		CMPBEQ R3, $16, 2(PC) // full block: skip the padding byte
   315		VLVGB  R3, R0, T_1 // partial block: place a 1 byte right after the message bytes
   316		EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
   317		CMPBNE R3, $16, 2(PC) // partial block: skip the 2**128 bit (the padding byte was inserted above)
   318		VLEIB  $12, $1, F_4 // full second block: set its 2**128 bit
   319		VLEIB  $4, $1, F_4 // the first block is always full here: set its 2**128 bit
   320	
   321		// setup [r²,r]: d[0] keeps r² and d[1] is replaced with the saved r (and 5*r)
   322		VLVGG $1, RSAVE_0, R_0
   323		VLVGG $1, RSAVE_1, R_1
   324		VLVGG $1, RSAVE_2, R_2
   325		VLVGG $1, RSAVE_3, R_3
   326		VLVGG $1, RSAVE_4, R_4
   327		VPDI  $0, R5_1, R5SAVE_1, R5_1
   328		VPDI  $0, R5_2, R5SAVE_2, R5_2
   329		VPDI  $0, R5_3, R5SAVE_3, R5_3
   330		VPDI  $0, R5_4, R5SAVE_4, R5_4
   331	
   332		MOVD $0, R3 // nothing left: multiply falls through to finish
   333		BR   multiply
   334	
   335	skip:
   336		VZERO H_0
   337		VZERO H_1
   338		VZERO H_2
   339		VZERO H_3
   340		VZERO H_4
   341	
   342		CMPBEQ R3, $0, finish // empty message: finish with h = 0
   343	
   344	b1:
   345		// 1 block remaining (1 to 16 bytes)
   346		SUB    $1, R3 // R3 = index of the last message byte
   347		VLL    R3, (R2), T_0 // load only bytes 0..R3
   348		ADD    $1, R3 // R3 = length of the block
   349		MOVBZ  $1, R0
   350		CMPBEQ R3, $16, 2(PC) // full block: skip the padding byte
   351		VLVGB  R3, R0, T_0 // partial block: place a 1 byte right after the message bytes
   352		VZERO  T_1 // there is no second block: d[1] of the message is 0
   353		EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
   354		CMPBNE R3, $16, 2(PC) // partial block: skip the 2**128 bit
   355		VLEIB  $4, $1, F_4 // full block: set its 2**128 bit
   356		VLEIG  $1, $1, R_0 // d[1] of the multiplier becomes 1 (limb 0 = 1, all other limbs zeroed below)
   357		VZERO  R_1
   358		VZERO  R_2
   359		VZERO  R_3
   360		VZERO  R_4
   361		VZERO  R5_1
   362		VZERO  R5_2
   363		VZERO  R5_3
   364		VZERO  R5_4
   365	
   366		// setup [r, 1]: d[0] multiplies by the saved r, d[1] leaves its accumulator unchanged
   367		VLVGG $0, RSAVE_0, R_0
   368		VLVGG $0, RSAVE_1, R_1
   369		VLVGG $0, RSAVE_2, R_2
   370		VLVGG $0, RSAVE_3, R_3
   371		VLVGG $0, RSAVE_4, R_4
   372		VPDI  $0, R5SAVE_1, R5_1, R5_1
   373		VPDI  $0, R5SAVE_2, R5_2, R5_2
   374		VPDI  $0, R5SAVE_3, R5_3, R5_3
   375		VPDI  $0, R5SAVE_4, R5_4, R5_4
   376	
   377		MOVD $0, R3 // nothing left: multiply falls through to finish
   378		BR   multiply

View as plain text