1 module crypto.blake2.blake2b_round;
2 
3 import inteli.emmintrin;
4 
5 
6 package:
7 pure nothrow @nogc:
8 
9 
// Short local names for intrinsics provided by inteli.emmintrin.
alias LOADU = _mm_loadu_si128;   // unaligned 128-bit integer load
alias STOREU = _mm_storeu_si128; // unaligned 128-bit integer store
alias TOF = _mm_castsi128_ps;    // reinterpret an integer vector as a float vector
alias TOI = _mm_castps_si128;    // reinterpret a float vector as an integer vector
14 
15 
/**
 * A pair of 128-bit integer vectors, `l` and `h`.
 *
 * The mixins in this module (G1, G2, DIAGONALIZE, UNDIAGONALIZE) operate on an
 * array of four such rows and address the two vectors as `rows[i].l` / `rows[i].h`.
 * NOTE(review): `l`/`h` presumably stand for the low and high halves of a
 * four-word state row — confirm against the caller that fills them.
 */
struct Row
{
    /// First vector of the pair.
    __m128i l;

    /// Second vector of the pair.
    __m128i h;

    /// Builds a row from its two vectors.
    this (__m128i l, __m128i h)
    {
        this.h = h;
        this.l = l;
    }
}
25 
/**
 * Combines a logical right shift by `-c` with a logical left shift by `64 - (-c)`
 * of the same vector, i.e. a rotation of each 64-bit lane to the right by `-c` bits.
 *
 * The sign convention (negative `c` means rotate right) is why every use in this
 * module passes a negative count: -32, -24, -16, -63.
 *
 * Params:
 *   r = vector whose 64-bit lanes are rotated
 *   c = negated rotation amount in bits
 * Returns: the rotated vector
 */
__m128i _mm_roti_epi64( in __m128i r, in int c )
@safe
{
    const int rightBits = -c;
    const int leftBits = 64 - rightBits;

    const __m128i shiftedDown = _mm_srli_epi64(r, rightBits);
    const __m128i shiftedUp = _mm_slli_epi64(r, leftBits);

    // The two shifted parts occupy disjoint bit ranges, so XOR merges them.
    return _mm_xor_si128(shiftedDown, shiftedUp);
}
31 
/**
 * String mixin: first half of the mixing step, applied identically to the `.l`
 * and `.h` vectors of each row.
 *
 * The mixin site must have in scope:
 *   rows = four elements with `l` / `h` vector fields (read and written)
 *   b    = two vectors of message words (`b[0]` feeds `.l`, `b[1]` feeds `.h`)
 *
 * Sequence:
 *   rows[0] = rows[0] + b + rows[1]
 *   rows[3] = _mm_roti_epi64(rows[3] ^ rows[0], -32)
 *   rows[2] = rows[2] + rows[3]
 *   rows[1] = _mm_roti_epi64(rows[1] ^ rows[2], -24)
 *
 * It differs from G2 only in the two rotation counts.
 */
immutable G1 = `
    rows[0].l = _mm_add_epi64(_mm_add_epi64(rows[0].l, b[0]), rows[1].l);
    rows[0].h = _mm_add_epi64(_mm_add_epi64(rows[0].h, b[1]), rows[1].h);

    rows[3].l = _mm_xor_si128(rows[3].l, rows[0].l);
    rows[3].h = _mm_xor_si128(rows[3].h, rows[0].h);

    rows[3].l = _mm_roti_epi64(rows[3].l, -32);
    rows[3].h = _mm_roti_epi64(rows[3].h, -32);

    rows[2].l = _mm_add_epi64(rows[2].l, rows[3].l);
    rows[2].h = _mm_add_epi64(rows[2].h, rows[3].h);

    rows[1].l = _mm_xor_si128(rows[1].l, rows[2].l);
    rows[1].h = _mm_xor_si128(rows[1].h, rows[2].h);

    rows[1].l = _mm_roti_epi64(rows[1].l, -24);
    rows[1].h = _mm_roti_epi64(rows[1].h, -24);
`;
51 
/**
 * String mixin: second half of the mixing step, applied identically to the `.l`
 * and `.h` vectors of each row.
 *
 * The mixin site must have in scope:
 *   rows = four elements with `l` / `h` vector fields (read and written)
 *   b    = two vectors of message words (`b[0]` feeds `.l`, `b[1]` feeds `.h`)
 *
 * Sequence:
 *   rows[0] = rows[0] + b + rows[1]
 *   rows[3] = _mm_roti_epi64(rows[3] ^ rows[0], -16)
 *   rows[2] = rows[2] + rows[3]
 *   rows[1] = _mm_roti_epi64(rows[1] ^ rows[2], -63)
 *
 * It differs from G1 only in the two rotation counts.
 */
immutable G2 = `
    rows[0].l = _mm_add_epi64(_mm_add_epi64(rows[0].l, b[0]), rows[1].l);
    rows[0].h = _mm_add_epi64(_mm_add_epi64(rows[0].h, b[1]), rows[1].h);

    rows[3].l = _mm_xor_si128(rows[3].l, rows[0].l);
    rows[3].h = _mm_xor_si128(rows[3].h, rows[0].h);

    rows[3].l = _mm_roti_epi64(rows[3].l, -16);
    rows[3].h = _mm_roti_epi64(rows[3].h, -16);

    rows[2].l = _mm_add_epi64(rows[2].l, rows[3].l);
    rows[2].h = _mm_add_epi64(rows[2].h, rows[3].h);

    rows[1].l = _mm_xor_si128(rows[1].l, rows[2].l);
    rows[1].h = _mm_xor_si128(rows[1].h, rows[2].h);

    rows[1].l = _mm_roti_epi64(rows[1].l, -63);
    rows[1].h = _mm_roti_epi64(rows[1].h, -63);
`;
71 
/**
 * String mixin: re-arranges the 64-bit words of rows[1], rows[2] and rows[3];
 * rows[0] is not touched.
 *
 * The mixin site must have in scope `rows` (four elements with `l` / `h`) and
 * `t`, a two-element vector scratch array that is overwritten.
 *
 * What the statements do:
 *   - t[0] and t[1] save the original rows[3].l and rows[1].l.
 *   - rows[2].l and rows[2].h are swapped, using rows[3].l as the temporary
 *     (its original value is already saved in t[0]).
 *   - rows[3] and rows[1] are rebuilt with unpackhi/unpacklo pairs from their
 *     original halves.
 *
 * Per the documented lane semantics of _mm_unpackhi_epi64 / _mm_unpacklo_epi64,
 * and writing a row as the words [a, b, c, d] (l = a,b; h = c,d), the result is:
 *   rows[1] -> [b, c, d, a]
 *   rows[2] -> [c, d, a, b]
 *   rows[3] -> [d, a, b, c]
 * UNDIAGONALIZE reverses this.
 */
immutable DIAGONALIZE = `
    t[0] = rows[3].l;
    t[1] = rows[1].l;
    rows[3].l = rows[2].l;
    rows[2].l = rows[2].h;
    rows[2].h = rows[3].l;
    rows[3].l = _mm_unpackhi_epi64(rows[3].h, _mm_unpacklo_epi64(t[0], t[0]));
    rows[3].h = _mm_unpackhi_epi64(t[0], _mm_unpacklo_epi64(rows[3].h, rows[3].h));
    rows[1].l = _mm_unpackhi_epi64(rows[1].l, _mm_unpacklo_epi64(rows[1].h, rows[1].h));
    rows[1].h = _mm_unpackhi_epi64(rows[1].h, _mm_unpacklo_epi64(t[1], t[1]));
`;
83 
/**
 * String mixin: re-arranges the 64-bit words of rows[1], rows[2] and rows[3]
 * in the opposite direction to DIAGONALIZE; rows[0] is not touched.
 *
 * The mixin site must have in scope `rows` (four elements with `l` / `h`) and
 * `t`, a two-element vector scratch array that is overwritten.
 *
 * What the statements do:
 *   - rows[2].l and rows[2].h are swapped through t[0].
 *   - t[0] and t[1] then save the original rows[1].l and rows[3].l.
 *   - rows[1] and rows[3] are rebuilt with unpackhi/unpacklo pairs from their
 *     original halves.
 *
 * Per the documented lane semantics of _mm_unpackhi_epi64 / _mm_unpacklo_epi64,
 * and writing a row as the words [a, b, c, d] (l = a,b; h = c,d), the result is:
 *   rows[1] -> [d, a, b, c]
 *   rows[2] -> [c, d, a, b]
 *   rows[3] -> [b, c, d, a]
 */
immutable UNDIAGONALIZE = `
    t[0] = rows[2].l;
    rows[2].l = rows[2].h;
    rows[2].h = t[0];
    t[0] = rows[1].l;
    t[1] = rows[3].l;
    rows[1].l = _mm_unpackhi_epi64(rows[1].h, _mm_unpacklo_epi64(rows[1].l, rows[1].l));
    rows[1].h = _mm_unpackhi_epi64(t[0], _mm_unpacklo_epi64(rows[1].h, rows[1].h));
    rows[3].l = _mm_unpackhi_epi64(rows[3].l, _mm_unpacklo_epi64(rows[3].h, rows[3].h));
    rows[3].h = _mm_unpackhi_epi64(rows[3].h, _mm_unpacklo_epi64(t[1], t[1]));
`;
95 
/**
 * Message-word index table, addressed as matrix[r][c][i].
 *
 *   r = round entry (12 entries; entries 10 and 11 repeat entries 0 and 1)
 *   c = load step within the round (4 per round, one before each G1 / G2)
 *   i = position within the load (4 indices into the 16-word message block)
 *
 * For a cell `matrix[r][c]`, the loaders in this module build
 *   b[0] = _mm_set_epi64x(m[cell[0]], m[cell[1]])
 *   b[1] = _mm_set_epi64x(m[cell[2]], m[cell[3]])
 *
 * NOTE(review): the values appear to be the BLAKE2b sigma message schedule,
 * pre-arranged in the argument order the loads need — confirm against the
 * specification before editing any entry.
 */
immutable matrix = [
        [
            [2,  0,  6,  4 ],
            [3,  1,  7,  5 ],
            [10, 8,  14, 12],
            [11, 9,  15, 13]
        ],
        [
            [4,  14, 13, 9 ],
            [8,  10, 6,  15],
            [0,  1,  5,  11],
            [2,  12, 3,  7 ]
        ],
        [
            [12, 11, 15, 5 ],
            [0,  8,  13, 2 ],
            [3,  10, 9,  7 ],
            [6,  14, 4,  1 ]
        ],
        [
            [3,  7,  11, 13],
            [1,  9,  14, 12],
            [5,  2,  15, 4 ],
            [10, 6,  8,  0 ]
        ],
        [
            [5,  9,  10, 2 ],
            [7,  0,  15, 4 ],
            [11, 14, 3,  6 ],
            [12, 1,  13, 8 ]
        ],
        [
            [6,  2,  8,  0 ],
            [10, 12, 3,  11],
            [7,  4,  1,  15],
            [5,  13, 9,  14]
        ],
        [
            [1,  12, 4,  14],
            [15, 5,  10, 13],
            [6,  0,  8,  9 ],
            [3,  7,  11, 2 ]
        ],
        [
            [7,  13, 3,  12],
            [14, 11, 9,  1 ],
            [15, 5,  2,  8 ],
            [4,  0,  10, 6 ]
        ],
        [
            [14, 6,  0,  11],
            [9,  15, 8,  3 ],
            [13, 12, 10, 1 ],
            [7,  2,  5,  4 ]
        ],
        [
            [8,  10, 1,  7 ],
            [4,  2,  5,  6 ],
            [9,  15, 13, 3 ],
            [14, 11, 0,  12]
        ],
        [
            [2,  0,  6,  4 ],
            [3,  1,  7,  5 ],
            [10, 8,  14, 12],
            [11, 9,  15, 13]
        ],
        [
            [4,  14, 13, 9 ],
            [8,  10, 6,  15],
            [0,  1,  5,  11],
            [2,  12, 3,  7 ]
        ]
    ];
170 
171 
// Two implementations of one round. Under LDC the round is produced as a
// source string with the message-word indices baked in as literals and is
// meant to be mixed in by the caller; on other compilers it is an ordinary
// function that looks the indices up in `matrix` at run time.
version (LDC)
{
    /**
     * Generates, at compile time, the two statements that load message words
     * into `b[0]` and `b[1]` for round entry `r`, load step `c`.
     *
     * The resulting string is meant for `mixin` and expects `m` (the message
     * words) and `b` (two vectors) to be in scope at the mixin site.
     *
     * Params:
     *   r = first index into `matrix`
     *   c = second index into `matrix`
     */
    template tmplLoadMsg (int r, int c)
    {
        import std.conv: to;

        // `enum` rather than `const`: these values exist only to be consumed
        // at compile time, so they should be manifest constants instead of
        // addressable variables defined once per instantiation.
        enum cell = matrix[r][c];
        enum tmplLoadMsg = "
            b[0] = _mm_set_epi64x(m["~to!string(cell[0])~"], m["~to!string(cell[1])~"]);
            b[1] = _mm_set_epi64x(m["~to!string(cell[2])~"], m["~to!string(cell[3])~"]);
        ";
    }

    /**
     * Generates, at compile time, the source text of one full round for round
     * entry `r`: load 0 + G1, load 1 + G2, DIAGONALIZE, load 2 + G1,
     * load 3 + G2, UNDIAGONALIZE.
     *
     * The resulting string is meant for `mixin` and expects `m`, `rows`, `b`
     * and `t` to be in scope at the mixin site (see the individual pieces).
     *
     * Params:
     *   r = first index into `matrix`
     */
    template tmplRound (int r)
    {
        // Manifest constant for the same reason as in tmplLoadMsg: the text is
        // only ever fed to `mixin`.
        enum tmplRound =
        tmplLoadMsg!(r, 0) ~
        G1 ~
        tmplLoadMsg!(r, 1) ~
        G2 ~
        DIAGONALIZE ~
        tmplLoadMsg!(r, 2) ~
        G1 ~
        tmplLoadMsg!(r, 3) ~
        G2 ~
        UNDIAGONALIZE
        ;
    }
}
else
{
    /**
     * Loads the four message words selected by `matrix[r][c]` into two vectors.
     *
     * Params:
     *   m  = the 16 message words
     *   r  = first index into `matrix`
     *   c  = second index into `matrix`
     *   b0 = receives `_mm_set_epi64x(m[cell[0]], m[cell[1]])`
     *   b1 = receives `_mm_set_epi64x(m[cell[2]], m[cell[3]])`
     */
    void loadMsg (in const(ulong)[16] m, in int r, in int c, out __m128i b0, out __m128i b1)
    {
        const cell = matrix[r][c];
        b0 = _mm_set_epi64x(m[cell[0]], m[cell[1]]);
        b1 = _mm_set_epi64x(m[cell[2]], m[cell[3]]);
    }

    /**
     * Applies one full round to `rows` in place: load 0 + G1, load 1 + G2,
     * DIAGONALIZE, load 2 + G1, load 3 + G2, UNDIAGONALIZE.
     *
     * Params:
     *   m    = the 16 message words
     *   r    = first index into `matrix`
     *   rows = the four rows being mixed; updated in place
     *   b    = scratch for the loaded message words; overwritten by each loadMsg call
     *   t    = scratch used by the DIAGONALIZE / UNDIAGONALIZE mixins; overwritten
     */
    void round (in const(ulong)[16] m, in int r, ref Row[4] rows, ref __m128i[2] b, ref __m128i[2] t)
    {
        loadMsg(m, r, 0, b[0], b[1]);
        mixin(G1);
        loadMsg(m, r, 1, b[0], b[1]);
        mixin(G2);
        mixin(DIAGONALIZE);
        loadMsg(m, r, 2, b[0], b[1]);
        mixin(G1);
        loadMsg(m, r, 3, b[0], b[1]);
        mixin(G2);
        mixin(UNDIAGONALIZE);
    }
}