1 module crypto.blake2.blake2s_round;
2 
3 import std.conv: to;
4 import inteli.emmintrin;
5 
6 
7 package:
8 pure nothrow @nogc:
9 
10 
// Short local names for intrinsics imported from inteli.emmintrin.
// (Semantics below follow the Intel intrinsic names; nothing in this
// section of the module uses these aliases directly.)
alias LOADU = _mm_loadu_si128;   // 128-bit integer load (the "u" form, i.e. unaligned)
alias STOREU = _mm_storeu_si128; // 128-bit integer store (the "u" form, i.e. unaligned)
alias TOF = _mm_castsi128_ps;    // reinterpret an integer vector as a float vector
alias TOI = _mm_castps_si128;    // reinterpret a float vector as an integer vector
15 
16 
/**
 * Rotates every 32-bit lane of `r`.
 *
 * The callers in this module always pass a negative `c`; the lanes are then
 * rotated right by `-c` bits. The result is built from a logical right shift
 * by `-c` combined (xor) with a left shift by `32 - (-c)`.
 *
 * Params:
 *     r = vector whose 32-bit lanes are rotated
 *     c = negated rotation distance (e.g. -16 rotates right by 16)
 *
 * Returns: the rotated vector.
 */
__m128i _mm_roti_epi32( in __m128i r, in int c )
@safe
{
    const int rightBy = -c;
    const int leftBy = 32 - rightBy;

    const __m128i lowPart = _mm_srli_epi32(r, rightBy);
    const __m128i highPart = _mm_slli_epi32(r, leftBy);

    return _mm_xor_si128(lowPart, highPart);
}
22 
23 version(LDC)
24 {
25     template tmplG1 (int buf)
26     {
27         const tmplG1 = `
28             rows[0] = _mm_add_epi32( _mm_add_epi32( rows[0], bufs[` ~to!string(buf)~ `]), rows[1] );
29             rows[3] = _mm_xor_si128( rows[3], rows[0] );
30             rows[3] = _mm_roti_epi32(rows[3], -16);
31             rows[2] = _mm_add_epi32( rows[2], rows[3] );
32             rows[1] = _mm_xor_si128( rows[1], rows[2] );
33             rows[1] = _mm_roti_epi32(rows[1], -12);
34         `;
35     }
36 
37     template tmplG2 (int buf)
38     {
39         const tmplG2 = `
40             rows[0] = _mm_add_epi32( _mm_add_epi32( rows[0], bufs[` ~to!string(buf)~ `]), rows[1] );
41             rows[3] = _mm_xor_si128( rows[3], rows[0] );
42             rows[3] = _mm_roti_epi32(rows[3], -8);
43             rows[2] = _mm_add_epi32( rows[2], rows[3] );
44             rows[1] = _mm_xor_si128( rows[1], rows[2] );
45             rows[1] = _mm_roti_epi32(rows[1], -7);
46         `;
47     }
48 }
49 else
50 {
51     void fG1 (ref __m128i[4] rows, in __m128i buf)
52     {
53         rows[0] = _mm_add_epi32(_mm_add_epi32(rows[0], buf), rows[1] );
54         rows[3] = _mm_xor_si128(rows[3], rows[0] );
55         rows[3] = _mm_roti_epi32(rows[3], -16);
56         rows[2] = _mm_add_epi32(rows[2], rows[3] );
57         rows[1] = _mm_xor_si128(rows[1], rows[2] );
58         rows[1] = _mm_roti_epi32(rows[1], -12);
59     }
60 
61     void fG2 (ref __m128i[4] rows, in __m128i buf)
62     {
63         rows[0] = _mm_add_epi32(_mm_add_epi32(rows[0], buf), rows[1] );
64         rows[3] = _mm_xor_si128(rows[3], rows[0] );
65         rows[3] = _mm_roti_epi32(rows[3], -8 );
66         rows[2] = _mm_add_epi32(rows[2], rows[3] );
67         rows[1] = _mm_xor_si128(rows[1], rows[2] );
68         rows[1] = _mm_roti_epi32(rows[1], -7 );
69     }
70 }
71 
72 
// Mixin text: permutes the 32-bit lanes of rows[0], rows[3] and rows[2]
// (rows[1] is left untouched). Expects `rows` to be visible at the mixin
// site. UNDIAGONALIZE applies the opposite lane permutations.
immutable DIAGONALIZE = q{
    rows[0] = _mm_shuffle_epi32!(_MM_SHUFFLE(2,1,0,3))( rows[0] );
    rows[3] = _mm_shuffle_epi32!(_MM_SHUFFLE(1,0,3,2))( rows[3] );
    rows[2] = _mm_shuffle_epi32!(_MM_SHUFFLE(0,3,2,1))( rows[2] );
};
78 
// Mixin text: permutes the 32-bit lanes of rows[0], rows[3] and rows[2]
// with the shuffle masks of rows[0] and rows[2] swapped relative to
// DIAGONALIZE, which restores the lane order DIAGONALIZE changed.
// Expects `rows` to be visible at the mixin site.
immutable UNDIAGONALIZE = q{
    rows[0] = _mm_shuffle_epi32!(_MM_SHUFFLE(0,3,2,1))( rows[0] );
    rows[3] = _mm_shuffle_epi32!(_MM_SHUFFLE(1,0,3,2))( rows[3] );
    rows[2] = _mm_shuffle_epi32!(_MM_SHUFFLE(2,1,0,3))( rows[2] );
};
84 
// Message-word selection table, indexed as matrix[r][c][k]:
//   r = round number (0 .. 9),
//   c = which of the four message loads inside that round (0 .. 3),
//   k = position of the argument handed to _mm_set_epi32 (0 .. 3).
// Each entry is an index into the 16-word message block `m`.
// NOTE(review): presumably derived from the BLAKE2s message schedule,
// arranged for the DIAGONALIZE / UNDIAGONALIZE lane order used in this
// module; verify against the reference before editing any value.
immutable int[][][] matrix = [
    // round 0
    [ [6,  4,  2,  0 ], [7,  5,  3,  1 ], [12, 10, 8,  14], [13, 11, 9,  15] ],
    // round 1
    [ [13, 9,  4,  14], [6,  15, 8,  10], [11, 0,  1,  5 ], [7,  2,  12, 3 ] ],
    // round 2
    [ [15, 5,  12, 11], [13, 2,  0,  8 ], [7,  3,  10, 9 ], [1,  6,  14, 4 ] ],
    // round 3
    [ [11, 13, 3,  7 ], [14, 12, 1,  9 ], [4,  5,  2,  15], [0,  10, 6,  8 ] ],
    // round 4
    [ [10, 2,  5,  9 ], [15, 4,  7,  0 ], [6,  11, 14, 3 ], [8,  12, 1,  13] ],
    // round 5
    [ [8,  0,  6,  2 ], [3,  11, 10, 12], [15, 7,  4,  1 ], [14, 5,  13, 9 ] ],
    // round 6
    [ [4,  14, 1,  12], [10, 13, 15, 5 ], [9,  6,  0,  8 ], [2,  3,  7,  11] ],
    // round 7
    [ [3,  12, 7,  13], [9,  1,  14, 11], [8,  15, 5,  2 ], [6,  4,  0,  10] ],
    // round 8
    [ [0,  11, 14, 6 ], [8,  3,  9,  15], [1,  13, 12, 10], [4,  7,  2,  5 ] ],
    // round 9
    [ [1,  7,  8,  10], [5,  6,  4,  2 ], [3,  9,  15, 13], [12, 14, 11, 0 ] ],
];
147 
148 
149 version(LDC)
150 {
151     template tmplLoadMsg (int r, int c, int buf)
152     {
153         const cell = matrix[r][c];
154         const tmplLoadMsg = "
155             bufs["~to!string(buf)~"] = _mm_set_epi32(
156                 m["~to!string(cell[0])~"],
157                 m["~to!string(cell[1])~"],
158                 m["~to!string(cell[2])~"],
159                 m["~to!string(cell[3])~"]
160             );
161         ";
162     }
163     template tmplRound (int r)
164     {
165         const tmplRound =
166         tmplLoadMsg!(r, 0, 0) ~
167         tmplG1!0 ~
168         tmplLoadMsg!(r, 1, 1) ~
169         tmplG2!1 ~
170         DIAGONALIZE ~
171         tmplLoadMsg!(r, 2, 2) ~
172         tmplG1!2 ~
173         tmplLoadMsg!(r, 3, 3) ~
174         tmplG2!3 ~
175         UNDIAGONALIZE
176         ;
177     }
178 
179 }
180 else
181 {
182     void loadMsg (in const(uint)[16] m, in int r, in int c, out __m128i buf)
183     {
184         const cell = matrix[r][c];
185         buf = _mm_set_epi32(m[cell[0]], m[cell[1]], m[cell[2]], m[cell[3]]);
186     }
187 
188     void round (in const(uint)[16] m, in int r, ref __m128i[4] rows, ref __m128i[4] bufs)
189     {
190         loadMsg(m, r, 0, bufs[0]);
191         fG1(rows, bufs[0]);
192         loadMsg(m, r, 1, bufs[1]);
193         fG2(rows, bufs[1]);
194         mixin(DIAGONALIZE);
195         loadMsg(m, r, 2, bufs[2]);
196         fG1(rows, bufs[2]);
197         loadMsg(m, r, 3, bufs[3]);
198         fG2(rows, bufs[3]);
199         mixin(UNDIAGONALIZE);
200     }
201 }