1 module dcrypt.aead.gcm.multiplier; 2 3 package: 4 5 import dcrypt.aead.gcm.galoisfield; 6 7 // TODO Dynamically make use of intel pclmulqdq instruction for fast multiplication. 8 9 /// test if T is a GCM multiplier 10 @safe 11 template isGCMMultiplier(T) 12 { 13 enum bool isGCMMultiplier = 14 is(T == struct) && 15 is(typeof( 16 { 17 ubyte[16] block; 18 T m = void; 19 m.init(block); 20 m.multiply(block); 21 })); 22 } 23 24 /// This struct provides schoolbook multiplication in GF(2^128). 25 @safe 26 struct GCMBasicMultiplier 27 { 28 29 private { 30 ubyte[16] H; 31 } 32 33 this(in ubyte[] H) nothrow @nogc 34 in { 35 assert(H.length == 16, "H: invalid length"); 36 } 37 body { 38 init(H); 39 } 40 41 nothrow @nogc { 42 /** 43 * initialize the multiplicator 44 */ 45 void init(in ubyte[] H) 46 in { 47 assert(H.length == 16, "H: invalid length"); 48 } 49 body { 50 this.H[] = H[]; 51 } 52 53 /// Multiply x by H and store result in x. 54 /// 55 /// Params: 56 /// x = 16 byte block 57 void multiply(ubyte[] x) 58 in { 59 assert(x.length == 16, "x: invalid length."); 60 } 61 body { 62 GF128.multiply(x, H); 63 } 64 } 65 66 /// test multiplication using schoolbook multiplication 67 unittest { 68 69 immutable ubyte[16] H = cast(immutable ubyte[16]) x"66e94bd4ef8a2c3b884cfa59ca342b2e"; 70 ubyte[16] X1 = cast(immutable ubyte[16]) x"0388dace60b6a392f328c2b971b2fe78"; 71 72 GCMBasicMultiplier mult = GCMBasicMultiplier(H); 73 74 mult.multiply(X1); 75 76 assert(X1 == x"5e2ec746917062882c85b0685353deb7", "GF128 multiplication with 8k table failed!"); 77 } 78 79 } 80 81 /// This struct provides table driven multiplication in GF(2^128). 82 @safe 83 struct GCMMultiplier8kTable 84 { 85 86 private { 87 ubyte[16][16][32] M; 88 } 89 90 this(in ubyte[] H) nothrow @nogc 91 in { 92 assert(H.length == 16, "H: invalid length"); 93 } 94 body { 95 init(H); 96 } 97 98 nothrow @nogc { 99 /** 100 * initialize the multiplicator 101 */ 102 void init(in ubyte[] H) { 103 tableSetup(H); 104 } 105 106 /// Multiply x by H and store result in x. 107 /// 108 /// Params: 109 /// x = 16 byte block 110 void multiply(ubyte[] x) 111 in { 112 assert(x.length == 16, "x: invalid length."); 113 } 114 body { 115 116 ubyte[16] z; 117 118 for(uint i = 0; i < 16; ++i) { 119 z[] ^= M[2*i][x[i]>>4][]; 120 z[] ^= M[2*i+1][x[i]&0xF][]; 121 } 122 123 x[] = z[]; 124 } 125 } 126 127 /// test multiplication using 8k table 128 unittest { 129 130 immutable ubyte[16] H = cast(immutable ubyte[16]) x"66e94bd4ef8a2c3b884cfa59ca342b2e"; 131 ubyte[16] X1 = cast(immutable ubyte[16]) x"0388dace60b6a392f328c2b971b2fe78"; 132 133 GCMMultiplier8kTable mult = GCMMultiplier8kTable(H); 134 135 mult.multiply(X1); 136 137 assert(X1 == x"5e2ec746917062882c85b0685353deb7", "GF128 multiplication with 8k table failed!"); 138 } 139 140 private void tableSetup(in ubyte[] H) nothrow @nogc 141 in { 142 assert(H.length == 16, "H: invalid length"); 143 } 144 body { 145 ubyte[16] Pi; 146 Pi[0] = 0x80; 147 ubyte[1] oneByte; 148 for(int i = 0; i < 32; ++i) { 149 for(uint j = 0; j < 16; ++j) { 150 M[i][j] = H; 151 oneByte[0] = cast(ubyte) (j<<4); 152 GF128.multiply(M[i][j], oneByte); 153 GF128.multiply(M[i][j], Pi); 154 } 155 multiplyP4(Pi); 156 } 157 } 158 159 private void multiplyP4(ubyte[] x) nothrow @nogc { 160 foreach(i;0..4){ 161 GF128.multiplyP(x); 162 } 163 } 164 165 } 166 167 /// This class provides table driven multiplication in GF(2^128). 168 /// The 64k table is rather large and probably won't fit into the cache. 169 /// Use the 8k table to avoid timing based leaks. 170 @safe 171 struct GCMMultiplier64kTable 172 { 173 174 private { 175 ubyte[16][256][16] M; 176 } 177 178 this(in ubyte[] H) nothrow @nogc 179 in { 180 assert(H.length == 16, "H: invalid length"); 181 } 182 body { 183 init(H); 184 } 185 186 nothrow @nogc { 187 188 /// initialize the multiplicator 189 void init(in ubyte[] H) { 190 tableSetup(H); 191 } 192 193 /// Multiply x by H and store result in x. 194 /// 195 /// Params: 196 /// x = 16 byte block 197 void multiply(ubyte[] x) 198 in { 199 assert(x.length == 16, "x: invalid length."); 200 } 201 body { 202 203 ubyte[16] z; 204 205 for(uint i = 0; i < 16; ++i) { 206 z[] ^= M[i][x[i]][]; 207 } 208 209 x[] = z[]; 210 } 211 } 212 213 /// test multiplication using 64k table 214 unittest { 215 immutable ubyte[16] H = cast(immutable ubyte[16]) x"66e94bd4ef8a2c3b884cfa59ca342b2e"; 216 ubyte[16] X1 = cast(immutable ubyte[16]) x"0388dace60b6a392f328c2b971b2fe78"; 217 218 GCMMultiplier64kTable mult = GCMMultiplier64kTable(H); 219 220 mult.multiply(X1); 221 222 assert(X1 == x"5e2ec746917062882c85b0685353deb7", "GF128 multiplication with 64k table failed!"); 223 } 224 225 private void tableSetup(in ubyte[] H) nothrow @nogc 226 in { 227 assert(H.length == 16, "H: invalid length"); 228 } 229 body { 230 ubyte[16] P; 231 P[0] = 0x80; 232 ubyte[1] oneByte; 233 for(int i = 0; i < 16; ++i) { 234 for(uint j = 0; j <= 255; ++j) { 235 M[i][j] = H; 236 oneByte[0] = cast(ubyte) j; 237 GF128.multiply(M[i][j], oneByte); 238 GF128.multiply(M[i][j], P); 239 } 240 GF128.multiplyP8(P); 241 } 242 } 243 244 } 245 246 247 /// This struct provides hardware accelerated multiplication in GF(2^128) 248 /// using the Intel PCLMULQDQ instruction. 249 /// 250 /// See: https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf 251 @safe 252 struct GCMPCLMULQDQMultiplier 253 { 254 255 private { 256 ubyte[16] H; 257 } 258 259 this(in ubyte[] H) nothrow @nogc 260 in { 261 assert(H.length == 16, "H: invalid length"); 262 } 263 body { 264 init(H); 265 } 266 267 nothrow @nogc { 268 /** 269 * initialize the multiplicator 270 */ 271 void init(in ubyte[] H) 272 in { 273 assert(H.length == 16, "H: invalid length"); 274 } 275 body { 276 this.H[] = H[]; 277 } 278 279 /// Multiply x by H and store result in x. 280 /// 281 /// Params: 282 /// x = 16 byte block 283 void multiply(ubyte[] x) 284 in { 285 assert(x.length == 16, "x: invalid length."); 286 } 287 body { 288 //GF128.multiply(x, H); 289 gfmul(x, H); 290 } 291 } 292 293 /// Multiplies a with b, result is stored in a. 294 @trusted 295 private void gfmul(ubyte[] a, in ubyte[] b) nothrow @nogc 296 in { 297 assert(a.length == 16, "Invalid length of input. Must be 16 bytes."); 298 assert(b.length == 16, "Invalid length of input. Must be 16 bytes."); 299 } 300 body { 301 302 import std.algorithm: reverse; 303 ubyte[16] revB; 304 305 reverse(a); 306 307 revB[] = b[]; 308 reverse(revB[]); 309 310 asm @nogc nothrow { 311 //xmm0 holds operand a (128 bits) 312 //xmm1 holds operand b (128 bits) 313 //rdi holds the pointer to output (128 bits) 314 315 // Note: since pclmulqdq instruction is not supported as of this writing binary opcodes are used. 316 // TODO Replace binary opcodes by pclmulqdq as soon as this instruction is supported by dmd's inline assembler. 317 318 // load pointer to a 319 mov RSI, a+8; 320 mov RDI, RSI; // store destination address 321 // load input into registers XMM0 and XMM1 322 movdqu XMM0, [RSI]; 323 //mov RSI, b+8; 324 //movdqu XMM1, [RSI]; 325 326 movdqu XMM1, revB[RBP]; 327 328 movdqa XMM3, XMM0; 329 db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x00; // pclmulqdq XMM3, XMM1, 0x00; // XMM3 holds a0*b0 330 movdqa XMM4, XMM0; 331 db 0x66, 0x0f, 0x3a, 0x44, 0xe1, 0x10; // pclmulqdq XMM4, XMM1, 0x10; //XMM4 holds a0*b1 332 movdqa XMM5, XMM0; 333 db 0x66, 0x0f, 0x3a, 0x44, 0xe9, 0x01; // pclmulqdq XMM5, XMM1, 0x01; // XMM5 holds a1*b0 334 movdqa XMM6, XMM0; 335 db 0x66, 0x0f, 0x3a, 0x44, 0xf1, 0x11; // pclmulqdq XMM6, XMM1, 0x11; // XMM6 holds a1*b1 336 pxor XMM4, XMM5; // XMM4 holds a0*b1 + a1*b0 337 movdqa XMM5, XMM4; 338 psrldq XMM4, 8; 339 pslldq XMM5, 8; 340 pxor XMM3, XMM5; 341 pxor XMM6, XMM4; // <XMM6:XMM3> holds the result of 342 // the carry-less multiplication of XMM0 by XMM1 343 // shift the result by one bit position to the left cope for the fact 344 // that bits are reversed 345 movdqa XMM7, XMM3; 346 movdqa XMM8, XMM6; 347 pslld XMM3, 1; 348 pslld XMM6, 1; 349 psrld XMM7, 31; 350 psrld XMM8, 31; 351 movdqa XMM9, XMM7; 352 pslldq XMM8, 4; 353 pslldq XMM7, 4; 354 psrldq XMM9, 12; 355 por XMM3, XMM7; 356 por XMM6, XMM8; 357 por XMM6, XMM9; 358 //first phase of the reduction 359 movdqa XMM7, XMM3; 360 movdqa XMM8, XMM3; 361 movdqa XMM9, XMM3; 362 pslld XMM7, 31; // packed right shifting << 31 363 pslld XMM8, 30; // packed right shifting shift << 30 364 pslld XMM9, 25; // packed right shifting shift << 25 365 pxor XMM7, XMM8; // xor the shifted versions 366 pxor XMM7, XMM9; 367 movdqa XMM8, XMM7; 368 369 370 pslldq XMM7, 12; 371 psrldq XMM8, 4; 372 pxor XMM3, XMM7; // first phase of the reduction complete 373 movdqa XMM2, XMM3; // second phase of the reduction 374 movdqa XMM4, XMM3; 375 movdqa XMM5, XMM3; 376 psrld XMM2, 1; // packed left shifting >> 1 377 psrld XMM4, 2; // packed left shifting >> 2 378 psrld XMM5, 7; // packed left shifting >> 7 379 380 pxor XMM2, XMM4; // xor the shifted versions 381 pxor XMM2, XMM5; 382 pxor XMM2, XMM8; 383 pxor XMM3, XMM2; 384 pxor XMM6, XMM3; // the result is in xmm6 385 movdqu [RDI], XMM6; // store the result 386 } 387 388 reverse(a); 389 390 } 391 392 // test pclmulqdq instruction with multiplication by 1 393 @trusted 394 unittest { 395 import core.cpuid; 396 version(D_InlineAsm_X86_64) { 397 if(aes) { 398 399 ubyte[16] a = cast(const ubyte[16]) x"12345678000000000000000000000000"; 400 ubyte[16] b = cast(const ubyte[16]) x"01000000000000000000000000000000"; 401 ubyte[16] c; 402 403 asm { 404 movdqu XMM1, a[RBP]; 405 movdqu XMM3, b[EBP]; 406 407 db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x00; // pclmulqdq XMM3, XMM1, 0x00; // XMM3 holds a0*b0 408 409 movdqu c[EBP], XMM3; 410 } 411 412 assert(c == x"12345678000000000000000000000000"); 413 } 414 } 415 } 416 417 /// test pclmulqdq instruction with test vectors from 418 /// https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf 419 @trusted 420 unittest { 421 import core.cpuid; 422 423 version(D_InlineAsm_X86_64) { 424 if(aes) { 425 426 /// Python code to convert test vectors into little endian format. 427 /// Reverses the string by bytes (not by hexits): 428 /// 429 /// import binascii 430 /// def conv(xmmstr): 431 /// bytearr=bytearray.fromhex(xmmstr)[::-1] 432 /// return binascii.hexlify(bytearr) 433 /// 434 /// conv('7b5b54657374566563746f725d53475d') 435 /// conv('48692853686179295b477565726f6e5d') 436 /// conv('1d4d84c85c3440c0929633d5d36f0451') 437 /// 438 439 ubyte[16] a = cast(const ubyte[16]) x"5d47535d726f74636556747365545b7b"; // xxm1 high: 7b5b546573745665 low: 63746f725d53475d 440 ubyte[16] b = cast(const ubyte[16]) x"5d6e6f726575475b2979616853286948"; // 4869285368617929 5b477565726f6e5d 441 ubyte[16] c; 442 443 asm { 444 movdqu XMM1, a[RBP]; 445 movdqu XMM3, b[EBP]; 446 447 db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x00; // pclmulqdq XMM3, XMM1, 0x00; // XMM3 holds a0*b0 448 449 movdqu c[EBP], XMM3; 450 } 451 assert(c == x"51046fd3d5339692c040345cc8844d1d"); 452 453 asm { 454 movdqu XMM1, a[RBP]; 455 movdqu XMM3, b[EBP]; 456 457 db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x01; 458 459 movdqu c[EBP], XMM3; 460 } 461 assert(c == x"1513282aac40a57fa1b56a558d7cd11b"); 462 463 asm { 464 movdqu XMM1, a[RBP]; 465 movdqu XMM3, b[EBP]; 466 467 db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x10; 468 469 movdqu c[EBP], XMM3; 470 } 471 assert(c == x"c9d5b7f42d26bfba2f86303adbf62b1a"); 472 473 asm { 474 movdqu XMM1, a[RBP]; 475 movdqu XMM3, b[EBP]; 476 477 db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x11; 478 479 movdqu c[EBP], XMM3; 480 } 481 assert(c == x"edd40f413ee06ed6457c2e592c1f1e1d"); 482 } 483 } 484 } 485 486 487 // /// test hardware accelerated multiplication (pclmulqdq) 488 // unittest { 489 // 490 // immutable ubyte[16] H = cast(immutable ubyte[16]) x"00000000000000000000000000000080"; // neutral element 491 // ubyte[16] X1 = cast(immutable ubyte[16]) x"0388dace60b6a392f328c2b971b2fe78"; 492 // 493 // GCMPCLMULQDQMultiplier mult = GCMPCLMULQDQMultiplier(H); 494 // 495 // mult.multiply(X1); 496 // 497 // assert(X1 == x"0388dace60b6a392f328c2b971b2fe78", "GF128 multiplication with pclmulqdq failed!"); 498 // } 499 500 /// test hardware accelerated multiplication (pclmulqdq) 501 unittest { 502 503 import std.algorithm: reverse; 504 505 ubyte[16] H = cast(immutable ubyte[16]) x"952b2a56a5604ac0b32b6656a05b40b6"; 506 ubyte[16] X1 = cast(immutable ubyte[16]) x"dfa6bf4ded81db03ffcaff95f830f061"; 507 508 ubyte[16] expected = cast(immutable ubyte[16]) x"da53eb0ad2c55bb64fc4802cc3feda60"; 509 510 // reverse(H[]); 511 // reverse(X1[]); 512 // reverse(expected[]); 513 514 //GCMMultiplier8kTable mult = GCMMultiplier8kTable(H); 515 GCMPCLMULQDQMultiplier mult = GCMPCLMULQDQMultiplier(H); 516 517 mult.multiply(X1); 518 519 assert(X1 == expected, "GF128 multiplication with pclmulqdq failed!"); 520 } 521 522 // /// test hardware accelerated multiplication (pclmulqdq) 523 // unittest { 524 // 525 // ulong[2] H = [0xb32b6656a05b40b6, 0x952b2a56a5604ac0]; 526 // ulong[2] X1 = [0xffcaff95f830f061, 0xdfa6bf4ded81db03]; 527 // 528 // ulong[2] expected = [0x4fc4802cc3feda60, 0xda53eb0ad2c55bb6]; 529 // 530 // //GCMMultiplier8kTable mult = GCMMultiplier8kTable(H); 531 // GCMPCLMULQDQMultiplier mult = GCMPCLMULQDQMultiplier(cast(ubyte[16])H); 532 // 533 // mult.multiply(cast(ubyte[16])X1); 534 // 535 // assert(X1 == expected, "GF128 multiplication with pclmulqdq failed!"); 536 // } 537 538 }