1 module dcrypt.aead.gcm.multiplier;
2 
3 package:
4 
5 import dcrypt.aead.gcm.galoisfield;
6 
7 // TODO Dynamically make use of intel pclmulqdq instruction for fast multiplication.
8 
9 /// test if T is a GCM multiplier
10 @safe
11 template isGCMMultiplier(T)
12 {
13 	enum bool isGCMMultiplier =
14 		is(T == struct) &&
15 			is(typeof(
16 					{
17 						ubyte[16] block;
18 						T m = void;
19 						m.init(block);
20 						m.multiply(block);
21 					}));
22 }
23 
24 /// This struct provides schoolbook multiplication in GF(2^128).
25 @safe
26 struct GCMBasicMultiplier
27 {
28 	
29 	private {
30 		ubyte[16] H;
31 	}
32 	
33 	this(in ubyte[] H) nothrow @nogc
34 	in {
35 		assert(H.length == 16, "H: invalid length");
36 	}
37 	body {
38 		init(H);
39 	}
40 	
41 	nothrow @nogc {
42 		/**
43 		 * initialize the multiplicator
44 		 */
45 		void init(in ubyte[] H) 
46 		in {
47 			assert(H.length == 16, "H: invalid length");
48 		}
49 		body {
50 			this.H[] = H[];
51 		}
52 		
53 		/// Multiply x by H and store result in x.
54 		/// 
55 		/// Params:
56 		/// x = 16 byte block
57 		void multiply(ubyte[] x)
58 		in {
59 			assert(x.length == 16, "x: invalid length.");
60 		}
61 		body {
62 			GF128.multiply(x, H);
63 		}
64 	}
65 	
66 	/// test multiplication using schoolbook multiplication
67 	unittest {
68 		
69 		immutable ubyte[16] H = cast(immutable ubyte[16]) x"66e94bd4ef8a2c3b884cfa59ca342b2e";
70 		ubyte[16] X1 = cast(immutable ubyte[16]) x"0388dace60b6a392f328c2b971b2fe78";
71 		
72 		GCMBasicMultiplier mult = GCMBasicMultiplier(H);
73 		
74 		mult.multiply(X1);
75 		
76 		assert(X1 == x"5e2ec746917062882c85b0685353deb7", "GF128 multiplication with 8k table failed!");
77 	}
78 	
79 }
80 
81 /// This struct provides table driven multiplication in GF(2^128).
82 @safe
83 struct GCMMultiplier8kTable
84 {
85 	
86 	private {
87 		ubyte[16][16][32] M;
88 	}
89 	
90 	this(in ubyte[] H) nothrow @nogc
91 	in {
92 		assert(H.length == 16, "H: invalid length");
93 	}
94 	body {
95 		init(H);
96 	}
97 	
98 	nothrow @nogc {
99 		/**
100 		 * initialize the multiplicator
101 		 */
102 		void init(in ubyte[] H) {
103 			tableSetup(H);
104 		}
105 		
106 		/// Multiply x by H and store result in x.
107 		/// 
108 		/// Params:
109 		/// x = 16 byte block
110 		void multiply(ubyte[] x)
111 		in {
112 			assert(x.length == 16, "x: invalid length.");
113 		}
114 		body {
115 			
116 			ubyte[16] z;
117 			
118 			for(uint i = 0; i < 16; ++i) {
119 				z[] ^= M[2*i][x[i]>>4][];
120 				z[] ^= M[2*i+1][x[i]&0xF][];
121 			}
122 			
123 			x[] = z[];
124 		}
125 	}
126 	
127 	/// test multiplication using 8k table
128 	unittest {
129 		
130 		immutable ubyte[16] H = cast(immutable ubyte[16]) x"66e94bd4ef8a2c3b884cfa59ca342b2e";
131 		ubyte[16] X1 = cast(immutable ubyte[16]) x"0388dace60b6a392f328c2b971b2fe78";
132 		
133 		GCMMultiplier8kTable mult = GCMMultiplier8kTable(H);
134 		
135 		mult.multiply(X1);
136 		
137 		assert(X1 == x"5e2ec746917062882c85b0685353deb7", "GF128 multiplication with 8k table failed!");
138 	}
139 	
140 	private void tableSetup(in ubyte[] H) nothrow @nogc
141 	in {
142 		assert(H.length == 16, "H: invalid length");
143 	}
144 	body {
145 		ubyte[16] Pi;
146 		Pi[0] = 0x80;
147 		ubyte[1] oneByte;
148 		for(int i = 0; i < 32; ++i) {
149 			for(uint j = 0; j < 16; ++j) {
150 				M[i][j] = H;
151 				oneByte[0] = cast(ubyte) (j<<4);
152 				GF128.multiply(M[i][j], oneByte);
153 				GF128.multiply(M[i][j], Pi);
154 			}
155 			multiplyP4(Pi);
156 		}
157 	}
158 	
159 	private void multiplyP4(ubyte[] x) nothrow @nogc {
160 		foreach(i;0..4){
161 			GF128.multiplyP(x);
162 		}
163 	}
164 	
165 }
166 
167 /// This class provides table driven multiplication in GF(2^128).
168 /// The 64k table is rather large and probably won't fit into the cache.
169 /// Use the 8k table to avoid timing based leaks.
170 @safe
171 struct GCMMultiplier64kTable
172 {
173 	
174 	private {
175 		ubyte[16][256][16] M;
176 	}
177 	
178 	this(in ubyte[] H) nothrow @nogc
179 	in {
180 		assert(H.length == 16, "H: invalid length");
181 	}
182 	body {
183 		init(H);
184 	}
185 	
186 	nothrow @nogc {
187 
188 		/// initialize the multiplicator
189 		void init(in ubyte[] H) {
190 			tableSetup(H);
191 		}
192 		
193 		/// Multiply x by H and store result in x.
194 		/// 
195 		/// Params:
196 		/// x = 16 byte block
197 		void multiply(ubyte[] x)
198 		in {
199 			assert(x.length == 16, "x: invalid length.");
200 		}
201 		body {
202 			
203 			ubyte[16] z;
204 			
205 			for(uint i = 0; i < 16; ++i) {
206 				z[] ^= M[i][x[i]][];
207 			}
208 			
209 			x[] = z[];
210 		}
211 	}
212 	
213 	/// test multiplication using 64k table
214 	unittest {
215 		immutable ubyte[16] H = cast(immutable ubyte[16]) x"66e94bd4ef8a2c3b884cfa59ca342b2e";
216 		ubyte[16] X1 = cast(immutable ubyte[16]) x"0388dace60b6a392f328c2b971b2fe78";
217 		
218 		GCMMultiplier64kTable mult = GCMMultiplier64kTable(H);
219 		
220 		mult.multiply(X1);
221 		
222 		assert(X1 == x"5e2ec746917062882c85b0685353deb7", "GF128 multiplication with 64k table failed!");
223 	}
224 	
225 	private void tableSetup(in ubyte[] H) nothrow @nogc
226 	in {
227 		assert(H.length == 16, "H: invalid length");
228 	}
229 	body {
230 		ubyte[16] P;
231 		P[0] = 0x80;
232 		ubyte[1] oneByte;
233 		for(int i = 0; i < 16; ++i) {
234 			for(uint j = 0; j <= 255; ++j) {
235 				M[i][j] = H;
236 				oneByte[0] = cast(ubyte) j;
237 				GF128.multiply(M[i][j], oneByte);
238 				GF128.multiply(M[i][j], P);
239 			}
240 			GF128.multiplyP8(P);
241 		}
242 	}
243 	
244 }
245 
246 
247 /// This struct provides hardware accelerated multiplication in GF(2^128)
248 /// using the Intel PCLMULQDQ instruction.
249 /// 
250 /// See: https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
251 @safe
252 struct GCMPCLMULQDQMultiplier
253 {
254 	
255 	private {
256 		ubyte[16] H;
257 	}
258 	
259 	this(in ubyte[] H) nothrow @nogc
260 	in {
261 		assert(H.length == 16, "H: invalid length");
262 	}
263 	body {
264 		init(H);
265 	}
266 	
267 	nothrow @nogc {
268 		/**
269 		 * initialize the multiplicator
270 		 */
271 		void init(in ubyte[] H) 
272 		in {
273 			assert(H.length == 16, "H: invalid length");
274 		}
275 		body {
276 			this.H[] = H[];
277 		}
278 		
279 		/// Multiply x by H and store result in x.
280 		/// 
281 		/// Params:
282 		/// x = 16 byte block
283 		void multiply(ubyte[] x)
284 		in {
285 			assert(x.length == 16, "x: invalid length.");
286 		}
287 		body {
288 			//GF128.multiply(x, H);
289 			gfmul(x, H);
290 		}
291 	}
292 	
293 	/// Multiplies a with b, result is stored in a.
294 	@trusted
295 	private void gfmul(ubyte[] a, in ubyte[] b) nothrow @nogc
296 	in {
297 		assert(a.length == 16, "Invalid length of input. Must be 16 bytes.");
298 		assert(b.length == 16, "Invalid length of input. Must be 16 bytes.");
299 	}
300 	body {
301 
302 		import std.algorithm: reverse;
303 		ubyte[16] revB;
304 
305 		reverse(a);
306 
307 		revB[] = b[];
308 		reverse(revB[]);
309 
310 		asm @nogc nothrow {
311 			//xmm0 holds operand a (128 bits)
312 			//xmm1 holds operand b (128 bits)
313 			//rdi holds the pointer to output (128 bits)
314 
315 			// Note: since pclmulqdq instruction is not supported as of this writing binary opcodes are used.
316 			// TODO Replace binary opcodes by pclmulqdq as soon as this instruction is supported by dmd's inline assembler.
317 
318 			// load pointer to a
319 			mov	RSI, a+8;
320 			mov RDI, RSI; // store destination address
321 			// load input into registers XMM0 and XMM1
322 			movdqu	XMM0, [RSI];
323 			//mov	RSI, b+8;
324 			//movdqu	XMM1, [RSI];
325 
326 			movdqu	XMM1, revB[RBP];
327 
328 			movdqa     XMM3, XMM0;
329 			db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x00;	// pclmulqdq  XMM3, XMM1, 0x00;    // XMM3 holds a0*b0
330 			movdqa     XMM4, XMM0;
331 			db 0x66, 0x0f, 0x3a, 0x44, 0xe1, 0x10;	// pclmulqdq  XMM4, XMM1, 0x10;    //XMM4 holds a0*b1
332 			movdqa     XMM5, XMM0;
333 			db 0x66, 0x0f, 0x3a, 0x44, 0xe9, 0x01; // pclmulqdq  XMM5, XMM1, 0x01;     // XMM5 holds a1*b0
334 			movdqa     XMM6, XMM0;
335 			db 0x66, 0x0f, 0x3a, 0x44, 0xf1, 0x11;	// pclmulqdq  XMM6, XMM1, 0x11;    // XMM6 holds a1*b1
336 			pxor       XMM4, XMM5;         // XMM4 holds a0*b1 + a1*b0
337 			movdqa     XMM5, XMM4;
338 			psrldq     XMM4, 8;
339 			pslldq     XMM5, 8;
340 			pxor       XMM3, XMM5;
341 			pxor       XMM6, XMM4;         // <XMM6:XMM3> holds the result of 
342 			// the carry-less multiplication of XMM0 by XMM1
343 			// shift the result by one bit position to the left cope for the fact
344 			// that bits are reversed
345 			movdqa   XMM7, XMM3;
346 			movdqa   XMM8, XMM6;
347 			pslld    XMM3, 1;
348 			pslld    XMM6, 1;
349 			psrld    XMM7, 31;
350 			psrld    XMM8, 31;
351 			movdqa   XMM9, XMM7;
352 			pslldq   XMM8, 4;
353 			pslldq   XMM7, 4;
354 			psrldq   XMM9, 12;
355 			por      XMM3, XMM7;
356 			por      XMM6, XMM8;
357 			por      XMM6, XMM9;
358 			//first phase of the reduction
359 			movdqa   XMM7, XMM3;
360 			movdqa   XMM8, XMM3;
361 			movdqa   XMM9, XMM3;     
362 			pslld    XMM7, 31;            // packed right shifting << 31  
363 			pslld    XMM8, 30;            // packed right shifting shift << 30
364 			pslld    XMM9, 25;            // packed right shifting shift << 25  
365 			pxor     XMM7, XMM8;          // xor the shifted versions
366 			pxor     XMM7, XMM9;    
367 			movdqa   XMM8, XMM7;
368 
369 			
370 			pslldq   XMM7, 12;
371 			psrldq   XMM8, 4;
372 			pxor     XMM3, XMM7;          // first phase of the reduction complete 
373 			movdqa   XMM2, XMM3;           // second phase of the reduction
374 			movdqa   XMM4, XMM3;
375 			movdqa   XMM5, XMM3;  
376 			psrld    XMM2, 1;             // packed left shifting >> 1
377 			psrld    XMM4, 2;             // packed left shifting >> 2
378 			psrld    XMM5, 7;             // packed left shifting >> 7   
379 
380 			pxor     XMM2, XMM4;          // xor the shifted versions
381 			pxor     XMM2, XMM5;
382 			pxor     XMM2, XMM8;
383 			pxor     XMM3, XMM2; 
384 			pxor     XMM6, XMM3;          // the result is in xmm6 
385 			movdqu   [RDI], XMM6;         // store the result
386 		}
387 
388 		reverse(a);
389 
390 	}
391 
392 	// test pclmulqdq instruction with multiplication by 1
393 	@trusted
394 	unittest {
395 		import core.cpuid;
396 		version(D_InlineAsm_X86_64) {
397 			if(aes) {
398 				
399 				ubyte[16] a = cast(const ubyte[16]) x"12345678000000000000000000000000"; 
400 				ubyte[16] b = cast(const ubyte[16]) x"01000000000000000000000000000000"; 
401 				ubyte[16] c;
402 				
403 				asm {
404 					movdqu XMM1, a[RBP];
405 					movdqu XMM3, b[EBP];
406 					
407 					db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x00;	// pclmulqdq  XMM3, XMM1, 0x00;    // XMM3 holds a0*b0
408 					
409 					movdqu c[EBP], XMM3;
410 				}
411 				
412 				assert(c == x"12345678000000000000000000000000");
413 			}
414 		}
415 	}
416 	
417 	/// test pclmulqdq instruction with test vectors from
418 	/// https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
419 	@trusted
420 	unittest {
421 		import core.cpuid;
422 
423 		version(D_InlineAsm_X86_64) {
424 			if(aes) {
425 
426 				/// Python code to convert test vectors into little endian format. 
427 				/// Reverses the string by bytes (not by hexits):
428 				/// 
429 				/// import binascii
430 				/// def conv(xmmstr):
431 				///		bytearr=bytearray.fromhex(xmmstr)[::-1]
432 				///		return binascii.hexlify(bytearr)
433 				///
434 				/// conv('7b5b54657374566563746f725d53475d')
435 				/// conv('48692853686179295b477565726f6e5d')
436 				/// conv('1d4d84c85c3440c0929633d5d36f0451')
437 				/// 
438 
439 				ubyte[16] a = cast(const ubyte[16]) x"5d47535d726f74636556747365545b7b"; // xxm1 high: 7b5b546573745665 low: 63746f725d53475d
440 				ubyte[16] b = cast(const ubyte[16]) x"5d6e6f726575475b2979616853286948"; // 4869285368617929 5b477565726f6e5d
441 				ubyte[16] c;
442 
443 				asm {
444 					movdqu XMM1, a[RBP];
445 					movdqu XMM3, b[EBP];
446 
447 					db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x00;	// pclmulqdq  XMM3, XMM1, 0x00;    // XMM3 holds a0*b0
448 
449 					movdqu c[EBP], XMM3;
450 				}
451 				assert(c == x"51046fd3d5339692c040345cc8844d1d");
452 
453 				asm {
454 					movdqu XMM1, a[RBP];
455 					movdqu XMM3, b[EBP];
456 					
457 					db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x01;
458 					
459 					movdqu c[EBP], XMM3;
460 				}
461 				assert(c == x"1513282aac40a57fa1b56a558d7cd11b");
462 
463 				asm {
464 					movdqu XMM1, a[RBP];
465 					movdqu XMM3, b[EBP];
466 					
467 					db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x10;
468 					
469 					movdqu c[EBP], XMM3;
470 				}
471 				assert(c == x"c9d5b7f42d26bfba2f86303adbf62b1a");
472 
473 				asm {
474 					movdqu XMM1, a[RBP];
475 					movdqu XMM3, b[EBP];
476 					
477 					db 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x11;
478 					
479 					movdqu c[EBP], XMM3;
480 				}
481 				assert(c == x"edd40f413ee06ed6457c2e592c1f1e1d");
482 			}
483 		}
484 	}
485 
486 	
487 //	/// test hardware accelerated multiplication (pclmulqdq)
488 //	unittest {
489 //		
490 //		immutable ubyte[16] H = cast(immutable ubyte[16]) x"00000000000000000000000000000080"; // neutral element
491 //		ubyte[16] X1 = cast(immutable ubyte[16]) x"0388dace60b6a392f328c2b971b2fe78";
492 //		
493 //		GCMPCLMULQDQMultiplier mult = GCMPCLMULQDQMultiplier(H);
494 //		
495 //		mult.multiply(X1);
496 //		
497 //		assert(X1 == x"0388dace60b6a392f328c2b971b2fe78", "GF128 multiplication with pclmulqdq failed!");
498 //	}
499 	
500 	/// test hardware accelerated multiplication (pclmulqdq)
501 	unittest {
502 
503 		import std.algorithm: reverse;
504 		
505 		ubyte[16] H = cast(immutable ubyte[16]) x"952b2a56a5604ac0b32b6656a05b40b6";
506 		ubyte[16] X1 = cast(immutable ubyte[16]) x"dfa6bf4ded81db03ffcaff95f830f061";
507 
508 		ubyte[16] expected = cast(immutable ubyte[16]) x"da53eb0ad2c55bb64fc4802cc3feda60";
509 
510 //		reverse(H[]);
511 //		reverse(X1[]);
512 //		reverse(expected[]);
513 
514 		//GCMMultiplier8kTable mult = GCMMultiplier8kTable(H);
515 		GCMPCLMULQDQMultiplier mult = GCMPCLMULQDQMultiplier(H);
516 		
517 		mult.multiply(X1);
518 		
519 		assert(X1 == expected, "GF128 multiplication with pclmulqdq failed!");
520 	}
521 
522 //	/// test hardware accelerated multiplication (pclmulqdq)
523 //	unittest {
524 //		
525 //		ulong[2] H = [0xb32b6656a05b40b6, 0x952b2a56a5604ac0];
526 //		ulong[2] X1 = [0xffcaff95f830f061, 0xdfa6bf4ded81db03];
527 //		
528 //		ulong[2] expected = [0x4fc4802cc3feda60, 0xda53eb0ad2c55bb6];
529 //
530 //		//GCMMultiplier8kTable mult = GCMMultiplier8kTable(H);
531 //		GCMPCLMULQDQMultiplier mult = GCMPCLMULQDQMultiplier(cast(ubyte[16])H);
532 //		
533 //		mult.multiply(cast(ubyte[16])X1);
534 //		
535 //		assert(X1 == expected, "GF128 multiplication with pclmulqdq failed!");
536 //	}
537 	
538 }