/**
 * SSE2 building blocks for one round of the BLAKE2b compression function.
 *
 * The working state is kept as four `Row`s, each holding four 64-bit words
 * split over two 128-bit vectors. The statement sequences that make up a
 * round are stored as strings and spliced in with `mixin`; they expect the
 * names `rows` (Row[4]), `b` (__m128i[2]), `t` (__m128i[2]) and, for the LDC
 * string templates, `m` (the 16 message words) to be in scope at the mixin
 * site.
 */
module crypto.blake2.blake2b_round;

import inteli.emmintrin;


package:
pure nothrow @nogc:


/// Short names for the unaligned load/store and int<->float cast intrinsics.
alias LOADU = _mm_loadu_si128;
alias STOREU = _mm_storeu_si128;
alias TOF = _mm_castsi128_ps;
alias TOI = _mm_castps_si128;


/// One row of the 4x4 state: `l` holds the two lower words, `h` the two upper.
struct Row {
    __m128i l, h;

    /// Builds a row from its two halves.
    this (__m128i l, __m128i h)
    {
        this.l = l;
        this.h = h;
    }
}

/**
 * Rotates each 64-bit lane of `r`.
 *
 * The amount is passed negated: each lane is shifted right by `-c` bits and
 * left by `64 - (-c)` bits, and the two results are XOR-ed. For
 * `0 < -c < 64` that is a right rotation by `-c` bits.
 *
 * Params:
 *   r = vector whose two 64-bit lanes are rotated
 *   c = negated rotation distance (this file passes -32, -24, -16 and -63)
 *
 * Returns: the vector with both lanes rotated.
 */
__m128i _mm_roti_epi64( in __m128i r, in int c )
    @safe
{
    const int towardsLow = -c;
    const int towardsHigh = 64 - towardsLow;

    __m128i shiftedDown = _mm_srli_epi64(r, towardsLow);
    __m128i shiftedUp = _mm_slli_epi64(r, towardsHigh);

    return _mm_xor_si128(shiftedDown, shiftedUp);
}

/// First half of the mixing step, applied to both halves of every row:
/// rows[0] += b + rows[1]; rows[3] = rotate(rows[3] ^ rows[0], 32);
/// rows[2] += rows[3];     rows[1] = rotate(rows[1] ^ rows[2], 24).
immutable G1 = `
    rows[0].l = _mm_add_epi64(_mm_add_epi64(rows[0].l, b[0]), rows[1].l);
    rows[0].h = _mm_add_epi64(_mm_add_epi64(rows[0].h, b[1]), rows[1].h);

    rows[3].l = _mm_xor_si128(rows[3].l, rows[0].l);
    rows[3].h = _mm_xor_si128(rows[3].h, rows[0].h);

    rows[3].l = _mm_roti_epi64(rows[3].l, -32);
    rows[3].h = _mm_roti_epi64(rows[3].h, -32);

    rows[2].l = _mm_add_epi64(rows[2].l, rows[3].l);
    rows[2].h = _mm_add_epi64(rows[2].h, rows[3].h);

    rows[1].l = _mm_xor_si128(rows[1].l, rows[2].l);
    rows[1].h = _mm_xor_si128(rows[1].h, rows[2].h);

    rows[1].l = _mm_roti_epi64(rows[1].l, -24);
    rows[1].h = _mm_roti_epi64(rows[1].h, -24);
`;

/// Second half of the mixing step: same statement sequence as `G1`, but with
/// rotation distances 16 and 63 instead of 32 and 24.
immutable G2 = `
    rows[0].l = _mm_add_epi64(_mm_add_epi64(rows[0].l, b[0]), rows[1].l);
    rows[0].h = _mm_add_epi64(_mm_add_epi64(rows[0].h, b[1]), rows[1].h);

    rows[3].l = _mm_xor_si128(rows[3].l, rows[0].l);
    rows[3].h = _mm_xor_si128(rows[3].h, rows[0].h);

    rows[3].l = _mm_roti_epi64(rows[3].l, -16);
    rows[3].h = _mm_roti_epi64(rows[3].h, -16);

    rows[2].l = _mm_add_epi64(rows[2].l, rows[3].l);
    rows[2].h = _mm_add_epi64(rows[2].h, rows[3].h);

    rows[1].l = _mm_xor_si128(rows[1].l, rows[2].l);
    rows[1].h = _mm_xor_si128(rows[1].h, rows[2].h);

    rows[1].l = _mm_roti_epi64(rows[1].l, -63);
    rows[1].h = _mm_roti_epi64(rows[1].h, -63);
`;

/// Lane permutation applied between the two halves of a round. Viewing each
/// row as four words (l-low, l-high, h-low, h-high), rows[1] is rotated by one
/// position, rows[2] by two (its halves are swapped) and rows[3] by three;
/// rows[0] is left alone. t[0] and t[1] are scratch, and rows[3].l doubles as
/// the temporary for the rows[2] swap after its value has been saved in t[0].
immutable DIAGONALIZE = `
    t[0] = rows[3].l;
    t[1] = rows[1].l;
    rows[3].l = rows[2].l;
    rows[2].l = rows[2].h;
    rows[2].h = rows[3].l;
    rows[3].l = _mm_unpackhi_epi64(rows[3].h, _mm_unpacklo_epi64(t[0], t[0]));
    rows[3].h = _mm_unpackhi_epi64(t[0], _mm_unpacklo_epi64(rows[3].h, rows[3].h));
    rows[1].l = _mm_unpackhi_epi64(rows[1].l, _mm_unpacklo_epi64(rows[1].h, rows[1].h));
    rows[1].h = _mm_unpackhi_epi64(rows[1].h, _mm_unpacklo_epi64(t[1], t[1]));
`;

/// Inverse of `DIAGONALIZE`: swaps the halves of rows[2] back and rotates
/// rows[1] and rows[3] in the opposite direction. Uses t[0] and t[1] as
/// scratch.
immutable UNDIAGONALIZE = `
    t[0] = rows[2].l;
    rows[2].l = rows[2].h;
    rows[2].h = t[0];
    t[0] = rows[1].l;
    t[1] = rows[3].l;
    rows[1].l = _mm_unpackhi_epi64(rows[1].h, _mm_unpacklo_epi64(rows[1].l, rows[1].l));
    rows[1].h = _mm_unpackhi_epi64(t[0], _mm_unpacklo_epi64(rows[1].h, rows[1].h));
    rows[3].l = _mm_unpackhi_epi64(rows[3].l, _mm_unpacklo_epi64(rows[3].h, rows[3].h));
    rows[3].h = _mm_unpackhi_epi64(rows[3].h, _mm_unpacklo_epi64(t[1], t[1]));
`;

/// Message-word selection table, indexed as matrix[round][step]. Each entry
/// lists the four indices into the 16-word message block that are packed into
/// b[0] (first two) and b[1] (last two) for that step. There are twelve
/// rounds; entries 10 and 11 repeat entries 0 and 1.
immutable matrix = [
    // round 0
    [
        [2,  0,  6,  4 ],
        [3,  1,  7,  5 ],
        [10, 8,  14, 12],
        [11, 9,  15, 13]
    ],
    // round 1
    [
        [4,  14, 13, 9 ],
        [8,  10, 6,  15],
        [0,  1,  5,  11],
        [2,  12, 3,  7 ]
    ],
    // round 2
    [
        [12, 11, 15, 5 ],
        [0,  8,  13, 2 ],
        [3,  10, 9,  7 ],
        [6,  14, 4,  1 ]
    ],
    // round 3
    [
        [3,  7,  11, 13],
        [1,  9,  14, 12],
        [5,  2,  15, 4 ],
        [10, 6,  8,  0 ]
    ],
    // round 4
    [
        [5,  9,  10, 2 ],
        [7,  0,  15, 4 ],
        [11, 14, 3,  6 ],
        [12, 1,  13, 8 ]
    ],
    // round 5
    [
        [6,  2,  8,  0 ],
        [10, 12, 3,  11],
        [7,  4,  1,  15],
        [5,  13, 9,  14]
    ],
    // round 6
    [
        [1,  12, 4,  14],
        [15, 5,  10, 13],
        [6,  0,  8,  9 ],
        [3,  7,  11, 2 ]
    ],
    // round 7
    [
        [7,  13, 3,  12],
        [14, 11, 9,  1 ],
        [15, 5,  2,  8 ],
        [4,  0,  10, 6 ]
    ],
    // round 8
    [
        [14, 6,  0,  11],
        [9,  15, 8,  3 ],
        [13, 12, 10, 1 ],
        [7,  2,  5,  4 ]
    ],
    // round 9
    [
        [8,  10, 1,  7 ],
        [4,  2,  5,  6 ],
        [9,  15, 13, 3 ],
        [14, 11, 0,  12]
    ],
    // round 10 (same as round 0)
    [
        [2,  0,  6,  4 ],
        [3,  1,  7,  5 ],
        [10, 8,  14, 12],
        [11, 9,  15, 13]
    ],
    // round 11 (same as round 1)
    [
        [4,  14, 13, 9 ],
        [8,  10, 6,  15],
        [0,  1,  5,  11],
        [2,  12, 3,  7 ]
    ]
];


version (LDC)
{
    /// Source text that loads the message words chosen by matrix[r][c] into
    /// b[0] and b[1]. The indices are baked into the text as literals, so the
    /// mixin site only needs `m` and `b` in scope.
    template tmplLoadMsg (int r, int c)
    {
        import std.conv: to;

        const picks = matrix[r][c];
        const tmplLoadMsg = "
        b[0] = _mm_set_epi64x(m["~picks[0].to!string~"], m["~picks[1].to!string~"]);
        b[1] = _mm_set_epi64x(m["~picks[2].to!string~"], m["~picks[3].to!string~"]);
        ";
    }

    /// Source text of the whole round `r`: load/G1, load/G2, DIAGONALIZE,
    /// load/G1, load/G2, UNDIAGONALIZE, using load steps 0 through 3 in order.
    template tmplRound (int r)
    {
        const tmplRound =
            tmplLoadMsg!(r, 0) ~ G1 ~
            tmplLoadMsg!(r, 1) ~ G2 ~
            DIAGONALIZE ~
            tmplLoadMsg!(r, 2) ~ G1 ~
            tmplLoadMsg!(r, 3) ~ G2 ~
            UNDIAGONALIZE
        ;
    }
}
else
{
    /**
     * Packs the four message words selected by matrix[r][c] into two vectors.
     *
     * Params:
     *   m  = the 16 message words
     *   r  = round index into `matrix`
     *   c  = load step within the round (0 .. 3)
     *   b0 = receives the words at the first and second listed indices
     *   b1 = receives the words at the third and fourth listed indices
     */
    void loadMsg (in const(ulong)[16] m, in int r, in int c, out __m128i b0, out __m128i b1)
    {
        const picks = matrix[r][c];

        // Each pair is read and stored before the next pair is touched.
        const ulong firstWord = m[picks[0]];
        const ulong secondWord = m[picks[1]];
        b0 = _mm_set_epi64x(firstWord, secondWord);

        const ulong thirdWord = m[picks[2]];
        const ulong fourthWord = m[picks[3]];
        b1 = _mm_set_epi64x(thirdWord, fourthWord);
    }

    /**
     * Runs round `r` on the state in place.
     *
     * Params:
     *   m    = the 16 message words
     *   r    = round index into `matrix`
     *   rows = the four state rows, updated in place
     *   b    = scratch pair that receives the loaded message words
     *   t    = scratch pair used by the lane permutations
     */
    void round (in const(ulong)[16] m, in int r, ref Row[4] rows, ref __m128i[2] b, ref __m128i[2] t)
    {
        // First half: mix with load steps 0 and 1.
        loadMsg(m, r, 0, b[0], b[1]);
        mixin(G1);
        loadMsg(m, r, 1, b[0], b[1]);
        mixin(G2);

        // Permute the lanes of rows[1..3] before the second half.
        mixin(DIAGONALIZE);

        // Second half: mix with load steps 2 and 3.
        loadMsg(m, r, 2, b[0], b[1]);
        mixin(G1);
        loadMsg(m, r, 3, b[0], b[1]);
        mixin(G2);

        // Undo the lane permutation.
        mixin(UNDIAGONALIZE);
    }
}