1 module dcrypt.ecc.curved25519.fieldElement;
2 
3 import dcrypt.bitmanip;
4 import dcrypt.util;
5 
6 @safe nothrow @nogc:
7 
8 /// fe means field element.
9 /// Here the field is \Z/(2^255-19).
10 /// An element t, entries t[0]...t[9], represents the integer
11 /// t[0]+2^26 t[1]+2^51 t[2]+2^77 t[3]+2^102 t[4]+...+2^230 t[9].
12 /// Bounds on each t[i] vary depending on context.
13 
14 @safe
15 struct fe {
16 
17 	enum fe zero = 0;
18 	enum fe one = [1,0,0,0,0,0,0,0,0,0];
19 
20 	private uint[10] value;
21 	private alias value this;
22 
23 	//	this(in ref uint[10] v) {
24 	//		value = v;
25 	//	}
26 	nothrow @nogc:
27 
28 	/// Create fe from 32 bytes.
29 	static fe fromBytes(in ubyte[] bytes)
30 	in {
31 		assert(bytes.length == 32, "bytes.length must be 32.");
32 	} body {
33 		return fe_frombytes(bytes);
34 	}
35 
36 	this(in uint[] v) 
37 	in {
38 		assert(v.length == 10, "v.length must be 10.");
39 	} body {
40 		value = v;
41 	}
42 
43 	// TODO remove
44 	this(in uint v) 
45 	{
46 		value[] = v;
47 	}
48 
49 	fe opAssign(in ref uint[10] v) {
50 		value = v;
51 		return this;
52 	}
53 
54 	/// Comparing in constant time.
55 	bool opEquals()(auto ref const fe rhs) const {
56 		return crypto_equals(this.value, rhs.value);
57 	}
58 
59 	fe opBinary(string op)(auto ref const fe rhs) const pure
60 		if (op == "+" || op == "-" || op == "*")
61 	{
62 		static if(op == "+") {
63 			fe tmp;
64 			tmp.value[] = this.value[] + rhs.value[];
65 			return tmp;
66 		} else static if(op == "-") {
67 			fe tmp;
68 			tmp.value[] = this.value[] - rhs.value[];
69 			return tmp;
70 		} else static if(op == "*") {
71 			return fe_mul(this, rhs);
72 		}
73 	}
74 
75 	fe opUnary(string op)() const
76 		if(op == "-")
77 	{
78 		static if(op == "-") {
79 			fe tmp;
80 			tmp.value[] = -this.value[];
81 			return tmp;
82 		}
83 	}
84 
85 	ref fe opOpAssign(string op)(auto ref const fe rhs) pure
86 		if (op == "+" || op == "-" || op == "*")
87 	{
88 		static if(op == "+") {
89 			value[] += rhs.value[];
90 		} else static if(op == "-") {
91 			value[] -= rhs.value[];
92 		} else static if(op == "*") {
93 			this = this * rhs;
94 		}
95 
96 		return this;
97 	}
98 
99 	/*
100 	 return 1 if f == 0
101 	 return 0 if f != 0
102 
103 	 Preconditions:
104 	 |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
105 	 */
106 	@property
107 	bool isNonzero() const
108 	{
109 		immutable ubyte[32] zero = 0;
110 		return !crypto_equals(fe_tobytes(this), zero);
111 	}
112 
113 	/**
114 	 return 1 if f is in {1,3,5,...,q-2}
115 	 return 0 if f is in {0,2,4,...,q-1}
116 
117 	 Preconditions:
118 	 |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
119 	 */ 
120 	@property
121 	bool isNegative() const
122 	{
123 		return (toBytes[0] & 1) == 1;
124 	}
125 
126 	@property
127 	fe inverse() const {
128 		return fe_invert(this);
129 	}
130 
131 	/// Changes the sign.
132 	void negate() {
133 		value[] = -value[];
134 	}
135 
136 	@property
137 	ubyte[32] toBytes() const {
138 		return fe_tobytes(this);
139 	}
140 
141 	/// Params:
142 	/// repeat = Repeat squaring n times.
143 	/// Returns: f^2 or f^(2^repeat).
144 	@property
145 	fe sq(uint repeat = 1)() const pure 
146 		if(repeat > 0) 
147 	{
148 		fe f = fe_sq(this);
149 
150 		static if(repeat > 1) {
151 			foreach(i; 1..repeat) {
152 				f = f.sq!1;
153 			}
154 		}
155 
156 		return f;
157 	}
158 
159 	/// Returns: 2*f*f
160 	@property
161 	fe sq2() const pure {
162 		return fe_sq2(this);
163 	}
164 
165 	/// Power by squaring in constant time.
166 	/// Returns f^power
167 	@property
168 	fe cpow(uint power)() const {
169 		fe r = fe.one;
170 		fe sq = this;
171 		for(uint p = power; p > 0; p >>= 1) {
172 			if((p & 1) == 1) {
173 				r *= sq;
174 			}
175 			sq = sq.sq;
176 		}
177 
178 		return r;
179 	}
180 }
181 
182 
183 
184 ///// h = f + g
185 ///// Can overlap h with f or g.
186 /////
187 ///// Preconditions:
188 /////   |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
189 /////   |g| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
190 /////
191 ///// Postconditions:
192 /////  |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
193 //void fe_add(ref fe h, in ref fe f, in ref fe g)
194 //{
195 //	//h[] = f[] + g[];
196 //	foreach(i; 0..10) {
197 //		h[i] = f[i] + g[i];
198 //	}
199 //}
200 
201 
202 /// Conditional move.
203 /// Replace (f,g) with (g,g) if b == 1;
204 /// replace (f,g) with (f,g) if b == 0.
205 /// 
206 /// Params:
207 /// dest = Destination.
208 /// src = Source.
209 /// condition = Condition.
210 void fe_cmov(ref fe dest, in ref fe src, in bool condition)
211 in {
212 	assert(condition == 0 || condition == 1);
213 } out {
214 	if(condition == 1) {
215 		assert(dest == src);
216 	}
217 } body {
218 	immutable uint mask = -(cast(int)condition);
219 
220 	assert((condition == 0 && mask == 0) || (condition == 1 && mask == 0xFFFFFFFF));
221 
222 	dest[] ^= mask & dest[];
223 	dest[] ^= mask & src[];
224 }
225 
226 // test conditional move
227 unittest {
228 	import std.algorithm: all;
229 
230 	fe a, b;
231 	a[] = 13;
232 	b[] = 42;
233 
234 	assert(all!"a == 13"(a[]));
235 	assert(all!"a == 42"(b[]));
236 
237 	fe_cmov(a, b, 0);
238 
239 	assert(all!"a == 13"(a[]));
240 	assert(all!"a == 42"(b[]));
241 
242 	fe_cmov(a, b, 1);
243 
244 	assert(all!"a == 42"(a[]));
245 }
246 
247 ulong load_3(in ubyte[] inp) pure
248 in {
249 	assert(inp.length == 3);
250 } body {
251 	ulong result;
252 	result = cast(ulong) inp[0];
253 	result |= (cast(ulong) inp[1]) << 8;
254 	result |= (cast(ulong) inp[2]) << 16;
255 	return result;
256 }
257 
258 ulong load_4(in ubyte[] inp) pure
259 in {
260 	assert(inp.length == 4);
261 }  body {
262 	ulong result;
263 	result = cast(ulong) inp[0];
264 	result |= (cast(ulong) inp[1]) << 8;
265 	result |= (cast(ulong) inp[2]) << 16;
266 	result |= (cast(ulong) inp[3]) << 24;
267 	return result;
268 }
269 
270 /*
271  Ignores top bit of h.
272  */
273 private fe fe_frombytes(in ubyte[] s) pure
274 in {
275 	assert(s.length == 32);
276 } body {
277 	long h0 = load_4(s[0..4]);
278 	long h1 = load_3(s[4..7]) << 6;
279 	long h2 = load_3(s[7..10]) << 5;
280 	long h3 = load_3(s[10..13]) << 3;
281 	long h4 = load_3(s[13..16]) << 2;
282 	long h5 = load_4(s[16..20]);
283 	long h6 = load_3(s[20..23]) << 7;
284 	long h7 = load_3(s[23..26]) << 5;
285 	long h8 = load_3(s[26..29]) << 4;
286 	long h9 = (load_3(s[29..32]) & 8388607) << 2;
287 
288 	long carry0;
289 	long carry1;
290 	long carry2;
291 	long carry3;
292 	long carry4;
293 	long carry5;
294 	long carry6;
295 	long carry7;
296 	long carry8;
297 	long carry9;
298 	
299 	carry9 = (h9 + cast(long) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= SHL64(carry9,25);
300 	carry1 = (h1 + cast(long) (1<<24)) >> 25; h2 += carry1; h1 -= SHL64(carry1,25);
301 	carry3 = (h3 + cast(long) (1<<24)) >> 25; h4 += carry3; h3 -= SHL64(carry3,25);
302 	carry5 = (h5 + cast(long) (1<<24)) >> 25; h6 += carry5; h5 -= SHL64(carry5,25);
303 	carry7 = (h7 + cast(long) (1<<24)) >> 25; h8 += carry7; h7 -= SHL64(carry7,25);
304 	
305 	carry0 = (h0 + cast(long) (1<<25)) >> 26; h1 += carry0; h0 -= SHL64(carry0,26);
306 	carry2 = (h2 + cast(long) (1<<25)) >> 26; h3 += carry2; h2 -= SHL64(carry2,26);
307 	carry4 = (h4 + cast(long) (1<<25)) >> 26; h5 += carry4; h4 -= SHL64(carry4,26);
308 	carry6 = (h6 + cast(long) (1<<25)) >> 26; h7 += carry6; h6 -= SHL64(carry6,26);
309 	carry8 = (h8 + cast(long) (1<<25)) >> 26; h9 += carry8; h8 -= SHL64(carry8,26);
310 
311 	fe h;
312 	h[0] = cast(int) h0;
313 	h[1] = cast(int) h1;
314 	h[2] = cast(int) h2;
315 	h[3] = cast(int) h3;
316 	h[4] = cast(int) h4;
317 	h[5] = cast(int) h5;
318 	h[6] = cast(int) h6;
319 	h[7] = cast(int) h7;
320 	h[8] = cast(int) h8;
321 	h[9] = cast(int) h9;
322 	return h;
323 }
324 
325 // TODO replace all SHL* with <<
326 long SHL64(in long val, in uint shift) pure nothrow @nogc {
327 	return cast(long)(cast(ulong) val << shift);
328 }
329 
330 int SHL32(in int val, in uint shift) pure nothrow @nogc  {
331 	return cast(int)(cast(uint) val << shift);
332 }
333 
334 int SHL8(in byte val, in uint shift) pure nothrow @nogc  {
335 	return cast(byte)(cast(ubyte) val << shift);
336 }
337 
338 unittest {
339 	assert(cast(int)(0xFFFFFFFF) << 7 == SHL32(0xFFFFFFFF, 7));
340 }
341 
342 private fe fe_invert(in ref fe z)
343 {
344 	fe t0;
345 	fe t1;
346 	fe t2;
347 	fe t3;
348 	// pow225521
349 	t0 = z.sq; 
350 	t1 = t0.sq.sq; 
351 
352 	t1 *= z;
353 	t0 *= t1;
354 	t2 = t0.sq;
355 	t1 *= t2;
356 
357 	t2 = t1.sq!5;
358 
359 	t1 *= t2;
360 	t2 = t1.sq!10; 
361 
362 	t2 *= t1;
363 
364 	t3 = t2.sq!20;
365 
366 	t2 *= t3;
367 	t2 = t2.sq!10;
368 
369 	t1 *= t2;
370 
371 	t2 = t1.sq!50;
372 
373 	t2 *= t1;
374 	t3 = t2.sq!100;
375 
376 	t2 *= t3;
377 	t2 = t2.sq!50;
378 
379 	t1 *= t2;
380 
381 	t1 = t1.sq!5;
382 
383 	return t1 * t0;
384 }
385 
386 
387 
388 
389 
390 /**
391  Returns: h = f * g
392  Can overlap h with f or g.
393 
394  Preconditions:
395  |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
396  |g| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
397 
398  Postconditions:
399  |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
400 
401  Note:
402  Notes on implementation strategy:
403 
404  Using schoolbook multiplication.
405  Karatsuba would save a little in some cost models.
406 
407  Most multiplications by 2 and 19 are 32-bit precomputations;
408  cheaper than 64-bit postcomputations.
409 
410  There is one remaining multiplication by 19 in the carry chain;
411  one *19 precomputation can be merged into this,
412  but the resulting data flow is considerably less clean.
413 
414  There are 12 carries below.
415  10 of them are 2-way parallelizable and vectorizable.
416  Can get away with 11 carries, but then data flow is much deeper.
417 
418  With tighter constraints on inputs can squeeze carries into int32.
419  */
420 private fe fe_mul(in ref fe f, in ref fe g) pure
421 {
422 	int f0 = f[0];
423 	int f1 = f[1];
424 	int f2 = f[2];
425 	int f3 = f[3];
426 	int f4 = f[4];
427 	int f5 = f[5];
428 	int f6 = f[6];
429 	int f7 = f[7];
430 	int f8 = f[8];
431 	int f9 = f[9];
432 	int g0 = g[0];
433 	int g1 = g[1];
434 	int g2 = g[2];
435 	int g3 = g[3];
436 	int g4 = g[4];
437 	int g5 = g[5];
438 	int g6 = g[6];
439 	int g7 = g[7];
440 	int g8 = g[8];
441 	int g9 = g[9];
442 	int g1_19 = 19 * g1; /* 1.959375*2^29 */
443 	int g2_19 = 19 * g2; /* 1.959375*2^30; still ok */
444 	int g3_19 = 19 * g3;
445 	int g4_19 = 19 * g4;
446 	int g5_19 = 19 * g5;
447 	int g6_19 = 19 * g6;
448 	int g7_19 = 19 * g7;
449 	int g8_19 = 19 * g8;
450 	int g9_19 = 19 * g9;
451 	int f1_2 = 2 * f1;
452 	int f3_2 = 2 * f3;
453 	int f5_2 = 2 * f5;
454 	int f7_2 = 2 * f7;
455 	int f9_2 = 2 * f9;
456 	long f0g0    = f0   * cast(long) g0;
457 	long f0g1    = f0   * cast(long) g1;
458 	long f0g2    = f0   * cast(long) g2;
459 	long f0g3    = f0   * cast(long) g3;
460 	long f0g4    = f0   * cast(long) g4;
461 	long f0g5    = f0   * cast(long) g5;
462 	long f0g6    = f0   * cast(long) g6;
463 	long f0g7    = f0   * cast(long) g7;
464 	long f0g8    = f0   * cast(long) g8;
465 	long f0g9    = f0   * cast(long) g9;
466 	long f1g0    = f1   * cast(long) g0;
467 	long f1g1_2  = f1_2 * cast(long) g1;
468 	long f1g2    = f1   * cast(long) g2;
469 	long f1g3_2  = f1_2 * cast(long) g3;
470 	long f1g4    = f1   * cast(long) g4;
471 	long f1g5_2  = f1_2 * cast(long) g5;
472 	long f1g6    = f1   * cast(long) g6;
473 	long f1g7_2  = f1_2 * cast(long) g7;
474 	long f1g8    = f1   * cast(long) g8;
475 	long f1g9_38 = f1_2 * cast(long) g9_19;
476 	long f2g0    = f2   * cast(long) g0;
477 	long f2g1    = f2   * cast(long) g1;
478 	long f2g2    = f2   * cast(long) g2;
479 	long f2g3    = f2   * cast(long) g3;
480 	long f2g4    = f2   * cast(long) g4;
481 	long f2g5    = f2   * cast(long) g5;
482 	long f2g6    = f2   * cast(long) g6;
483 	long f2g7    = f2   * cast(long) g7;
484 	long f2g8_19 = f2   * cast(long) g8_19;
485 	long f2g9_19 = f2   * cast(long) g9_19;
486 	long f3g0    = f3   * cast(long) g0;
487 	long f3g1_2  = f3_2 * cast(long) g1;
488 	long f3g2    = f3   * cast(long) g2;
489 	long f3g3_2  = f3_2 * cast(long) g3;
490 	long f3g4    = f3   * cast(long) g4;
491 	long f3g5_2  = f3_2 * cast(long) g5;
492 	long f3g6    = f3   * cast(long) g6;
493 	long f3g7_38 = f3_2 * cast(long) g7_19;
494 	long f3g8_19 = f3   * cast(long) g8_19;
495 	long f3g9_38 = f3_2 * cast(long) g9_19;
496 	long f4g0    = f4   * cast(long) g0;
497 	long f4g1    = f4   * cast(long) g1;
498 	long f4g2    = f4   * cast(long) g2;
499 	long f4g3    = f4   * cast(long) g3;
500 	long f4g4    = f4   * cast(long) g4;
501 	long f4g5    = f4   * cast(long) g5;
502 	long f4g6_19 = f4   * cast(long) g6_19;
503 	long f4g7_19 = f4   * cast(long) g7_19;
504 	long f4g8_19 = f4   * cast(long) g8_19;
505 	long f4g9_19 = f4   * cast(long) g9_19;
506 	long f5g0    = f5   * cast(long) g0;
507 	long f5g1_2  = f5_2 * cast(long) g1;
508 	long f5g2    = f5   * cast(long) g2;
509 	long f5g3_2  = f5_2 * cast(long) g3;
510 	long f5g4    = f5   * cast(long) g4;
511 	long f5g5_38 = f5_2 * cast(long) g5_19;
512 	long f5g6_19 = f5   * cast(long) g6_19;
513 	long f5g7_38 = f5_2 * cast(long) g7_19;
514 	long f5g8_19 = f5   * cast(long) g8_19;
515 	long f5g9_38 = f5_2 * cast(long) g9_19;
516 	long f6g0    = f6   * cast(long) g0;
517 	long f6g1    = f6   * cast(long) g1;
518 	long f6g2    = f6   * cast(long) g2;
519 	long f6g3    = f6   * cast(long) g3;
520 	long f6g4_19 = f6   * cast(long) g4_19;
521 	long f6g5_19 = f6   * cast(long) g5_19;
522 	long f6g6_19 = f6   * cast(long) g6_19;
523 	long f6g7_19 = f6   * cast(long) g7_19;
524 	long f6g8_19 = f6   * cast(long) g8_19;
525 	long f6g9_19 = f6   * cast(long) g9_19;
526 	long f7g0    = f7   * cast(long) g0;
527 	long f7g1_2  = f7_2 * cast(long) g1;
528 	long f7g2    = f7   * cast(long) g2;
529 	long f7g3_38 = f7_2 * cast(long) g3_19;
530 	long f7g4_19 = f7   * cast(long) g4_19;
531 	long f7g5_38 = f7_2 * cast(long) g5_19;
532 	long f7g6_19 = f7   * cast(long) g6_19;
533 	long f7g7_38 = f7_2 * cast(long) g7_19;
534 	long f7g8_19 = f7   * cast(long) g8_19;
535 	long f7g9_38 = f7_2 * cast(long) g9_19;
536 	long f8g0    = f8   * cast(long) g0;
537 	long f8g1    = f8   * cast(long) g1;
538 	long f8g2_19 = f8   * cast(long) g2_19;
539 	long f8g3_19 = f8   * cast(long) g3_19;
540 	long f8g4_19 = f8   * cast(long) g4_19;
541 	long f8g5_19 = f8   * cast(long) g5_19;
542 	long f8g6_19 = f8   * cast(long) g6_19;
543 	long f8g7_19 = f8   * cast(long) g7_19;
544 	long f8g8_19 = f8   * cast(long) g8_19;
545 	long f8g9_19 = f8   * cast(long) g9_19;
546 	long f9g0    = f9   * cast(long) g0;
547 	long f9g1_38 = f9_2 * cast(long) g1_19;
548 	long f9g2_19 = f9   * cast(long) g2_19;
549 	long f9g3_38 = f9_2 * cast(long) g3_19;
550 	long f9g4_19 = f9   * cast(long) g4_19;
551 	long f9g5_38 = f9_2 * cast(long) g5_19;
552 	long f9g6_19 = f9   * cast(long) g6_19;
553 	long f9g7_38 = f9_2 * cast(long) g7_19;
554 	long f9g8_19 = f9   * cast(long) g8_19;
555 	long f9g9_38 = f9_2 * cast(long) g9_19;
556 	long h0 = f0g0+f1g9_38+f2g8_19+f3g7_38+f4g6_19+f5g5_38+f6g4_19+f7g3_38+f8g2_19+f9g1_38;
557 	long h1 = f0g1+f1g0   +f2g9_19+f3g8_19+f4g7_19+f5g6_19+f6g5_19+f7g4_19+f8g3_19+f9g2_19;
558 	long h2 = f0g2+f1g1_2 +f2g0   +f3g9_38+f4g8_19+f5g7_38+f6g6_19+f7g5_38+f8g4_19+f9g3_38;
559 	long h3 = f0g3+f1g2   +f2g1   +f3g0   +f4g9_19+f5g8_19+f6g7_19+f7g6_19+f8g5_19+f9g4_19;
560 	long h4 = f0g4+f1g3_2 +f2g2   +f3g1_2 +f4g0   +f5g9_38+f6g8_19+f7g7_38+f8g6_19+f9g5_38;
561 	long h5 = f0g5+f1g4   +f2g3   +f3g2   +f4g1   +f5g0   +f6g9_19+f7g8_19+f8g7_19+f9g6_19;
562 	long h6 = f0g6+f1g5_2 +f2g4   +f3g3_2 +f4g2   +f5g1_2 +f6g0   +f7g9_38+f8g8_19+f9g7_38;
563 	long h7 = f0g7+f1g6   +f2g5   +f3g4   +f4g3   +f5g2   +f6g1   +f7g0   +f8g9_19+f9g8_19;
564 	long h8 = f0g8+f1g7_2 +f2g6   +f3g5_2 +f4g4   +f5g3_2 +f6g2   +f7g1_2 +f8g0   +f9g9_38;
565 	long h9 = f0g9+f1g8   +f2g7   +f3g6   +f4g5   +f5g4   +f6g3   +f7g2   +f8g1   +f9g0   ;
566 	long carry0;
567 	long carry1;
568 	long carry2;
569 	long carry3;
570 	long carry4;
571 	long carry5;
572 	long carry6;
573 	long carry7;
574 	long carry8;
575 	long carry9;
576 	
577 	/*
578 	 |h0| <= (1.65*1.65*2^52*(1+19+19+19+19)+1.65*1.65*2^50*(38+38+38+38+38))
579 	 i.e. |h0| <= 1.4*2^60; narrower ranges for h2, h4, h6, h8
580 	 |h1| <= (1.65*1.65*2^51*(1+1+19+19+19+19+19+19+19+19))
581 	 i.e. |h1| <= 1.7*2^59; narrower ranges for h3, h5, h7, h9
582 	 */
583 	
584 	carry0 = (h0 + cast(long) (1<<25)) >> 26; h1 += carry0; h0 -= SHL64(carry0,26);
585 	carry4 = (h4 + cast(long) (1<<25)) >> 26; h5 += carry4; h4 -= SHL64(carry4,26);
586 	/* |h0| <= 2^25 */
587 	/* |h4| <= 2^25 */
588 	/* |h1| <= 1.71*2^59 */
589 	/* |h5| <= 1.71*2^59 */
590 	
591 	carry1 = (h1 + cast(long) (1<<24)) >> 25; h2 += carry1; h1 -= SHL64(carry1,25);
592 	carry5 = (h5 + cast(long) (1<<24)) >> 25; h6 += carry5; h5 -= SHL64(carry5,25);
593 	/* |h1| <= 2^24; from now on fits into int32 */
594 	/* |h5| <= 2^24; from now on fits into int32 */
595 	/* |h2| <= 1.41*2^60 */
596 	/* |h6| <= 1.41*2^60 */
597 	
598 	carry2 = (h2 + cast(long) (1<<25)) >> 26; h3 += carry2; h2 -= SHL64(carry2,26);
599 	carry6 = (h6 + cast(long) (1<<25)) >> 26; h7 += carry6; h6 -= SHL64(carry6,26);
600 	/* |h2| <= 2^25; from now on fits into int32 unchanged */
601 	/* |h6| <= 2^25; from now on fits into int32 unchanged */
602 	/* |h3| <= 1.71*2^59 */
603 	/* |h7| <= 1.71*2^59 */
604 	
605 	carry3 = (h3 + cast(long) (1<<24)) >> 25; h4 += carry3; h3 -= SHL64(carry3,25);
606 	carry7 = (h7 + cast(long) (1<<24)) >> 25; h8 += carry7; h7 -= SHL64(carry7,25);
607 	/* |h3| <= 2^24; from now on fits into int32 unchanged */
608 	/* |h7| <= 2^24; from now on fits into int32 unchanged */
609 	/* |h4| <= 1.72*2^34 */
610 	/* |h8| <= 1.41*2^60 */
611 	
612 	carry4 = (h4 + cast(long) (1<<25)) >> 26; h5 += carry4; h4 -= SHL64(carry4,26);
613 	carry8 = (h8 + cast(long) (1<<25)) >> 26; h9 += carry8; h8 -= SHL64(carry8,26);
614 	/* |h4| <= 2^25; from now on fits into int32 unchanged */
615 	/* |h8| <= 2^25; from now on fits into int32 unchanged */
616 	/* |h5| <= 1.01*2^24 */
617 	/* |h9| <= 1.71*2^59 */
618 	
619 	carry9 = (h9 + cast(long) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= SHL64(carry9,25);
620 	/* |h9| <= 2^24; from now on fits into int32 unchanged */
621 	/* |h0| <= 1.1*2^39 */
622 	
623 	carry0 = (h0 + cast(long) (1<<25)) >> 26; h1 += carry0; h0 -= SHL64(carry0,26);
624 	/* |h0| <= 2^25; from now on fits into int32 unchanged */
625 	/* |h1| <= 1.01*2^24 */
626 	fe h;
627 	h[0] = cast(int) h0;
628 	h[1] = cast(int) h1;
629 	h[2] = cast(int) h2;
630 	h[3] = cast(int) h3;
631 	h[4] = cast(int) h4;
632 	h[5] = cast(int) h5;
633 	h[6] = cast(int) h6;
634 	h[7] = cast(int) h7;
635 	h[8] = cast(int) h8;
636 	h[9] = cast(int) h9;
637 
638 	return h;
639 }
640 
641 
642 /// Returns f * 121666
643 ///
644 /// Preconditions:
645 /// |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
646 ///
647 /// Postconditions:
648 /// |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
649 public fe fe_mul121666(in ref fe f) pure
650 {
651 	int f0 = f[0];
652 	int f1 = f[1];
653 	int f2 = f[2];
654 	int f3 = f[3];
655 	int f4 = f[4];
656 	int f5 = f[5];
657 	int f6 = f[6];
658 	int f7 = f[7];
659 	int f8 = f[8];
660 	int f9 = f[9];
661 	long h0 = f0 * cast(long) 121666;
662 	long h1 = f1 * cast(long) 121666;
663 	long h2 = f2 * cast(long) 121666;
664 	long h3 = f3 * cast(long) 121666;
665 	long h4 = f4 * cast(long) 121666;
666 	long h5 = f5 * cast(long) 121666;
667 	long h6 = f6 * cast(long) 121666;
668 	long h7 = f7 * cast(long) 121666;
669 	long h8 = f8 * cast(long) 121666;
670 	long h9 = f9 * cast(long) 121666;
671 	long carry0;
672 	long carry1;
673 	long carry2;
674 	long carry3;
675 	long carry4;
676 	long carry5;
677 	long carry6;
678 	long carry7;
679 	long carry8;
680 	long carry9;
681 	
682 	carry9 = (h9 + cast(long) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25;
683 	carry1 = (h1 + cast(long) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25;
684 	carry3 = (h3 + cast(long) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25;
685 	carry5 = (h5 + cast(long) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25;
686 	carry7 = (h7 + cast(long) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25;
687 	
688 	carry0 = (h0 + cast(long) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
689 	carry2 = (h2 + cast(long) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26;
690 	carry4 = (h4 + cast(long) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
691 	carry6 = (h6 + cast(long) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26;
692 	carry8 = (h8 + cast(long) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26;
693 	
694 	fe h;
695 	h[0] = cast(uint) h0;
696 	h[1] = cast(uint) h1;
697 	h[2] = cast(uint) h2;
698 	h[3] = cast(uint) h3;
699 	h[4] = cast(uint) h4;
700 	h[5] = cast(uint) h5;
701 	h[6] = cast(uint) h6;
702 	h[7] = cast(uint) h7;
703 	h[8] = cast(uint) h8;
704 	h[9] = cast(uint) h9;
705 
706 	return h;
707 }
708 
709 /// Returns: z^(2^(225-23))
710 package fe fe_pow22523(in ref fe z) pure
711 {
712 	fe t0;
713 	fe t1;
714 	fe t2;
715 	t0 = z.sq;
716 	t1 = t0.sq.sq;
717 
718 	t1 *= z;
719 	t0 *= t1;
720 	t0 = t0.sq;
721 	t0 *= t1;
722 
723 	//		t1 = z.cpow!9;
724 	//		t0 = z.cpow!31;
725 
726 	t1 = t0.sq!5;
727 
728 	t0 *= t1;
729 
730 	t1 = t0.sq!10;
731 
732 	t1 *= t0;
733 
734 	t2 = t1.sq!20;
735 
736 	t1 *= t2;
737 
738 	t1 = t1.sq!10;
739 
740 	t0 *= t1;
741 
742 	t1 = t0.sq!50;
743 
744 	t1 *= t0;
745 
746 	t2 = t1.sq!100;
747 
748 	t1 *= t2;
749 
750 	t1 = t1.sq!50;
751 
752 	t0 *= t1;
753 	t0 = t0.sq.sq;
754 
755 	return t0 * z;
756 }
757 
758 
759 /**
760  Returns: h = f * f
761  Can overlap h with f.
762 
763  Preconditions:
764  |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
765 
766  Postconditions:
767  |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
768 
769  Note:
770  See fe_mul.c for discussion of implementation strategy.
771  */
772 private fe fe_sq(in ref fe f) pure
773 {
774 	int f0 = f[0];
775 	int f1 = f[1];
776 	int f2 = f[2];
777 	int f3 = f[3];
778 	int f4 = f[4];
779 	int f5 = f[5];
780 	int f6 = f[6];
781 	int f7 = f[7];
782 	int f8 = f[8];
783 	int f9 = f[9];
784 	int f0_2 = 2 * f0;
785 	int f1_2 = 2 * f1;
786 	int f2_2 = 2 * f2;
787 	int f3_2 = 2 * f3;
788 	int f4_2 = 2 * f4;
789 	int f5_2 = 2 * f5;
790 	int f6_2 = 2 * f6;
791 	int f7_2 = 2 * f7;
792 	int f5_38 = 38 * f5; /* 1.959375*2^30 */
793 	int f6_19 = 19 * f6; /* 1.959375*2^30 */
794 	int f7_38 = 38 * f7; /* 1.959375*2^30 */
795 	int f8_19 = 19 * f8; /* 1.959375*2^30 */
796 	int f9_38 = 38 * f9; /* 1.959375*2^30 */
797 	long f0f0    = f0   * cast(long) f0;
798 	long f0f1_2  = f0_2 * cast(long) f1;
799 	long f0f2_2  = f0_2 * cast(long) f2;
800 	long f0f3_2  = f0_2 * cast(long) f3;
801 	long f0f4_2  = f0_2 * cast(long) f4;
802 	long f0f5_2  = f0_2 * cast(long) f5;
803 	long f0f6_2  = f0_2 * cast(long) f6;
804 	long f0f7_2  = f0_2 * cast(long) f7;
805 	long f0f8_2  = f0_2 * cast(long) f8;
806 	long f0f9_2  = f0_2 * cast(long) f9;
807 	long f1f1_2  = f1_2 * cast(long) f1;
808 	long f1f2_2  = f1_2 * cast(long) f2;
809 	long f1f3_4  = f1_2 * cast(long) f3_2;
810 	long f1f4_2  = f1_2 * cast(long) f4;
811 	long f1f5_4  = f1_2 * cast(long) f5_2;
812 	long f1f6_2  = f1_2 * cast(long) f6;
813 	long f1f7_4  = f1_2 * cast(long) f7_2;
814 	long f1f8_2  = f1_2 * cast(long) f8;
815 	long f1f9_76 = f1_2 * cast(long) f9_38;
816 	long f2f2    = f2   * cast(long) f2;
817 	long f2f3_2  = f2_2 * cast(long) f3;
818 	long f2f4_2  = f2_2 * cast(long) f4;
819 	long f2f5_2  = f2_2 * cast(long) f5;
820 	long f2f6_2  = f2_2 * cast(long) f6;
821 	long f2f7_2  = f2_2 * cast(long) f7;
822 	long f2f8_38 = f2_2 * cast(long) f8_19;
823 	long f2f9_38 = f2   * cast(long) f9_38;
824 	long f3f3_2  = f3_2 * cast(long) f3;
825 	long f3f4_2  = f3_2 * cast(long) f4;
826 	long f3f5_4  = f3_2 * cast(long) f5_2;
827 	long f3f6_2  = f3_2 * cast(long) f6;
828 	long f3f7_76 = f3_2 * cast(long) f7_38;
829 	long f3f8_38 = f3_2 * cast(long) f8_19;
830 	long f3f9_76 = f3_2 * cast(long) f9_38;
831 	long f4f4    = f4   * cast(long) f4;
832 	long f4f5_2  = f4_2 * cast(long) f5;
833 	long f4f6_38 = f4_2 * cast(long) f6_19;
834 	long f4f7_38 = f4   * cast(long) f7_38;
835 	long f4f8_38 = f4_2 * cast(long) f8_19;
836 	long f4f9_38 = f4   * cast(long) f9_38;
837 	long f5f5_38 = f5   * cast(long) f5_38;
838 	long f5f6_38 = f5_2 * cast(long) f6_19;
839 	long f5f7_76 = f5_2 * cast(long) f7_38;
840 	long f5f8_38 = f5_2 * cast(long) f8_19;
841 	long f5f9_76 = f5_2 * cast(long) f9_38;
842 	long f6f6_19 = f6   * cast(long) f6_19;
843 	long f6f7_38 = f6   * cast(long) f7_38;
844 	long f6f8_38 = f6_2 * cast(long) f8_19;
845 	long f6f9_38 = f6   * cast(long) f9_38;
846 	long f7f7_38 = f7   * cast(long) f7_38;
847 	long f7f8_38 = f7_2 * cast(long) f8_19;
848 	long f7f9_76 = f7_2 * cast(long) f9_38;
849 	long f8f8_19 = f8   * cast(long) f8_19;
850 	long f8f9_38 = f8   * cast(long) f9_38;
851 	long f9f9_38 = f9   * cast(long) f9_38;
852 	long h0 = f0f0  +f1f9_76+f2f8_38+f3f7_76+f4f6_38+f5f5_38;
853 	long h1 = f0f1_2+f2f9_38+f3f8_38+f4f7_38+f5f6_38;
854 	long h2 = f0f2_2+f1f1_2 +f3f9_76+f4f8_38+f5f7_76+f6f6_19;
855 	long h3 = f0f3_2+f1f2_2 +f4f9_38+f5f8_38+f6f7_38;
856 	long h4 = f0f4_2+f1f3_4 +f2f2   +f5f9_76+f6f8_38+f7f7_38;
857 	long h5 = f0f5_2+f1f4_2 +f2f3_2 +f6f9_38+f7f8_38;
858 	long h6 = f0f6_2+f1f5_4 +f2f4_2 +f3f3_2 +f7f9_76+f8f8_19;
859 	long h7 = f0f7_2+f1f6_2 +f2f5_2 +f3f4_2 +f8f9_38;
860 	long h8 = f0f8_2+f1f7_4 +f2f6_2 +f3f5_4 +f4f4   +f9f9_38;
861 	long h9 = f0f9_2+f1f8_2 +f2f7_2 +f3f6_2 +f4f5_2;
862 	long carry0;
863 	long carry1;
864 	long carry2;
865 	long carry3;
866 	long carry4;
867 	long carry5;
868 	long carry6;
869 	long carry7;
870 	long carry8;
871 	long carry9;
872 	
873 	carry0 = (h0 + cast(long) (1<<25)) >> 26; h1 += carry0; h0 -= SHL64(carry0,26);
874 	carry4 = (h4 + cast(long) (1<<25)) >> 26; h5 += carry4; h4 -= SHL64(carry4,26);
875 	
876 	carry1 = (h1 + cast(long) (1<<24)) >> 25; h2 += carry1; h1 -= SHL64(carry1,25);
877 	carry5 = (h5 + cast(long) (1<<24)) >> 25; h6 += carry5; h5 -= SHL64(carry5,25);
878 	
879 	carry2 = (h2 + cast(long) (1<<25)) >> 26; h3 += carry2; h2 -= SHL64(carry2,26);
880 	carry6 = (h6 + cast(long) (1<<25)) >> 26; h7 += carry6; h6 -= SHL64(carry6,26);
881 	
882 	carry3 = (h3 + cast(long) (1<<24)) >> 25; h4 += carry3; h3 -= SHL64(carry3,25);
883 	carry7 = (h7 + cast(long) (1<<24)) >> 25; h8 += carry7; h7 -= SHL64(carry7,25);
884 	
885 	carry4 = (h4 + cast(long) (1<<25)) >> 26; h5 += carry4; h4 -= SHL64(carry4,26);
886 	carry8 = (h8 + cast(long) (1<<25)) >> 26; h9 += carry8; h8 -= SHL64(carry8,26);
887 	
888 	carry9 = (h9 + cast(long) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= SHL64(carry9,25);
889 	
890 	carry0 = (h0 + cast(long) (1<<25)) >> 26; h1 += carry0; h0 -= SHL64(carry0,26);
891 
892 	fe h;
893 	h[0] = cast(int) h0;
894 	h[1] = cast(int) h1;
895 	h[2] = cast(int) h2;
896 	h[3] = cast(int) h3;
897 	h[4] = cast(int) h4;
898 	h[5] = cast(int) h5;
899 	h[6] = cast(int) h6;
900 	h[7] = cast(int) h7;
901 	h[8] = cast(int) h8;
902 	h[9] = cast(int) h9;
903 	return h;
904 }
905 
906 /**
907  Returns: h = 2 * f * f
908  Can overlap h with f.
909 
910  Preconditions:
911  |f| bounded by 1.65*2^26,1.65*2^25,1.65*2^26,1.65*2^25,etc.
912 
913  Postconditions:
914  |h| bounded by 1.01*2^25,1.01*2^24,1.01*2^25,1.01*2^24,etc.
915 
916  Note:
917  See fe_mul.c for discussion of implementation strategy.
918  */
919 private fe fe_sq2(in ref fe f) pure
920 {
921 	int f0 = f[0];
922 	int f1 = f[1];
923 	int f2 = f[2];
924 	int f3 = f[3];
925 	int f4 = f[4];
926 	int f5 = f[5];
927 	int f6 = f[6];
928 	int f7 = f[7];
929 	int f8 = f[8];
930 	int f9 = f[9];
931 	int f0_2 = 2 * f0;
932 	int f1_2 = 2 * f1;
933 	int f2_2 = 2 * f2;
934 	int f3_2 = 2 * f3;
935 	int f4_2 = 2 * f4;
936 	int f5_2 = 2 * f5;
937 	int f6_2 = 2 * f6;
938 	int f7_2 = 2 * f7;
939 	int f5_38 = 38 * f5; /* 1.959375*2^30 */
940 	int f6_19 = 19 * f6; /* 1.959375*2^30 */
941 	int f7_38 = 38 * f7; /* 1.959375*2^30 */
942 	int f8_19 = 19 * f8; /* 1.959375*2^30 */
943 	int f9_38 = 38 * f9; /* 1.959375*2^30 */
944 	long f0f0    = f0   * cast(long) f0;
945 	long f0f1_2  = f0_2 * cast(long) f1;
946 	long f0f2_2  = f0_2 * cast(long) f2;
947 	long f0f3_2  = f0_2 * cast(long) f3;
948 	long f0f4_2  = f0_2 * cast(long) f4;
949 	long f0f5_2  = f0_2 * cast(long) f5;
950 	long f0f6_2  = f0_2 * cast(long) f6;
951 	long f0f7_2  = f0_2 * cast(long) f7;
952 	long f0f8_2  = f0_2 * cast(long) f8;
953 	long f0f9_2  = f0_2 * cast(long) f9;
954 	long f1f1_2  = f1_2 * cast(long) f1;
955 	long f1f2_2  = f1_2 * cast(long) f2;
956 	long f1f3_4  = f1_2 * cast(long) f3_2;
957 	long f1f4_2  = f1_2 * cast(long) f4;
958 	long f1f5_4  = f1_2 * cast(long) f5_2;
959 	long f1f6_2  = f1_2 * cast(long) f6;
960 	long f1f7_4  = f1_2 * cast(long) f7_2;
961 	long f1f8_2  = f1_2 * cast(long) f8;
962 	long f1f9_76 = f1_2 * cast(long) f9_38;
963 	long f2f2    = f2   * cast(long) f2;
964 	long f2f3_2  = f2_2 * cast(long) f3;
965 	long f2f4_2  = f2_2 * cast(long) f4;
966 	long f2f5_2  = f2_2 * cast(long) f5;
967 	long f2f6_2  = f2_2 * cast(long) f6;
968 	long f2f7_2  = f2_2 * cast(long) f7;
969 	long f2f8_38 = f2_2 * cast(long) f8_19;
970 	long f2f9_38 = f2   * cast(long) f9_38;
971 	long f3f3_2  = f3_2 * cast(long) f3;
972 	long f3f4_2  = f3_2 * cast(long) f4;
973 	long f3f5_4  = f3_2 * cast(long) f5_2;
974 	long f3f6_2  = f3_2 * cast(long) f6;
975 	long f3f7_76 = f3_2 * cast(long) f7_38;
976 	long f3f8_38 = f3_2 * cast(long) f8_19;
977 	long f3f9_76 = f3_2 * cast(long) f9_38;
978 	long f4f4    = f4   * cast(long) f4;
979 	long f4f5_2  = f4_2 * cast(long) f5;
980 	long f4f6_38 = f4_2 * cast(long) f6_19;
981 	long f4f7_38 = f4   * cast(long) f7_38;
982 	long f4f8_38 = f4_2 * cast(long) f8_19;
983 	long f4f9_38 = f4   * cast(long) f9_38;
984 	long f5f5_38 = f5   * cast(long) f5_38;
985 	long f5f6_38 = f5_2 * cast(long) f6_19;
986 	long f5f7_76 = f5_2 * cast(long) f7_38;
987 	long f5f8_38 = f5_2 * cast(long) f8_19;
988 	long f5f9_76 = f5_2 * cast(long) f9_38;
989 	long f6f6_19 = f6   * cast(long) f6_19;
990 	long f6f7_38 = f6   * cast(long) f7_38;
991 	long f6f8_38 = f6_2 * cast(long) f8_19;
992 	long f6f9_38 = f6   * cast(long) f9_38;
993 	long f7f7_38 = f7   * cast(long) f7_38;
994 	long f7f8_38 = f7_2 * cast(long) f8_19;
995 	long f7f9_76 = f7_2 * cast(long) f9_38;
996 	long f8f8_19 = f8   * cast(long) f8_19;
997 	long f8f9_38 = f8   * cast(long) f9_38;
998 	long f9f9_38 = f9   * cast(long) f9_38;
999 	long h0 = f0f0  +f1f9_76+f2f8_38+f3f7_76+f4f6_38+f5f5_38;
1000 	long h1 = f0f1_2+f2f9_38+f3f8_38+f4f7_38+f5f6_38;
1001 	long h2 = f0f2_2+f1f1_2 +f3f9_76+f4f8_38+f5f7_76+f6f6_19;
1002 	long h3 = f0f3_2+f1f2_2 +f4f9_38+f5f8_38+f6f7_38;
1003 	long h4 = f0f4_2+f1f3_4 +f2f2   +f5f9_76+f6f8_38+f7f7_38;
1004 	long h5 = f0f5_2+f1f4_2 +f2f3_2 +f6f9_38+f7f8_38;
1005 	long h6 = f0f6_2+f1f5_4 +f2f4_2 +f3f3_2 +f7f9_76+f8f8_19;
1006 	long h7 = f0f7_2+f1f6_2 +f2f5_2 +f3f4_2 +f8f9_38;
1007 	long h8 = f0f8_2+f1f7_4 +f2f6_2 +f3f5_4 +f4f4   +f9f9_38;
1008 	long h9 = f0f9_2+f1f8_2 +f2f7_2 +f3f6_2 +f4f5_2;
1009 	long carry0;
1010 	long carry1;
1011 	long carry2;
1012 	long carry3;
1013 	long carry4;
1014 	long carry5;
1015 	long carry6;
1016 	long carry7;
1017 	long carry8;
1018 	long carry9;
1019 	
1020 	h0 += h0;
1021 	h1 += h1;
1022 	h2 += h2;
1023 	h3 += h3;
1024 	h4 += h4;
1025 	h5 += h5;
1026 	h6 += h6;
1027 	h7 += h7;
1028 	h8 += h8;
1029 	h9 += h9;
1030 	
1031 	carry0 = (h0 + cast(long) (1<<25)) >> 26; h1 += carry0; h0 -= SHL64(carry0,26);
1032 	carry4 = (h4 + cast(long) (1<<25)) >> 26; h5 += carry4; h4 -= SHL64(carry4,26);
1033 	
1034 	carry1 = (h1 + cast(long) (1<<24)) >> 25; h2 += carry1; h1 -= SHL64(carry1,25);
1035 	carry5 = (h5 + cast(long) (1<<24)) >> 25; h6 += carry5; h5 -= SHL64(carry5,25);
1036 	
1037 	carry2 = (h2 + cast(long) (1<<25)) >> 26; h3 += carry2; h2 -= SHL64(carry2,26);
1038 	carry6 = (h6 + cast(long) (1<<25)) >> 26; h7 += carry6; h6 -= SHL64(carry6,26);
1039 	
1040 	carry3 = (h3 + cast(long) (1<<24)) >> 25; h4 += carry3; h3 -= SHL64(carry3,25);
1041 	carry7 = (h7 + cast(long) (1<<24)) >> 25; h8 += carry7; h7 -= SHL64(carry7,25);
1042 	
1043 	carry4 = (h4 + cast(long) (1<<25)) >> 26; h5 += carry4; h4 -= SHL64(carry4,26);
1044 	carry8 = (h8 + cast(long) (1<<25)) >> 26; h9 += carry8; h8 -= SHL64(carry8,26);
1045 	
1046 	carry9 = (h9 + cast(long) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= SHL64(carry9,25);
1047 	
1048 	carry0 = (h0 + cast(long) (1<<25)) >> 26; h1 += carry0; h0 -= SHL64(carry0,26);
1049 
1050 	fe h;
1051 	h[0] = cast(int) h0;
1052 	h[1] = cast(int) h1;
1053 	h[2] = cast(int) h2;
1054 	h[3] = cast(int) h3;
1055 	h[4] = cast(int) h4;
1056 	h[5] = cast(int) h5;
1057 	h[6] = cast(int) h6;
1058 	h[7] = cast(int) h7;
1059 	h[8] = cast(int) h8;
1060 	h[9] = cast(int) h9;
1061 	return h;
1062 }
1063 
1064 ///**
1065 // Returns: h = f - g
1066 // Can overlap h with f or g.
1067 //
1068 // Preconditions:
1069 // |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
1070 // |g| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
1071 //
1072 // Postconditions:
1073 // |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
1074 // */
1075 //void fe_sub(ref fe h, in ref fe f, in ref fe g)
1076 //{
1077 //	h[] = f[] - g[];
1078 //}
1079 
1080 
1081 /**
1082  * 
1083  * Params:
1084  * s = 32 byte buffer
1085  * 
1086  Preconditions:
1087  |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
1088 
1089  Write p=2^255-19; q=floor(h/p).
1090  Basic claim: q = floor(2^(-255)(h + 19 2^(-25)h9 + 2^(-1))).
1091 
1092  Proof:
1093  Have |h|<=p so |q|<=1 so |19^2 2^(-255) q|<1/4.
1094  Also have |h-2^230 h9|<2^231 so |19 2^(-255)(h-2^230 h9)|<1/4.
1095 
1096  Write y=2^(-1)-19^2 2^(-255)q-19 2^(-255)(h-2^230 h9).
1097  Then 0<y<1.
1098 
1099  Write r=h-pq.
1100  Have 0<=r<=p-1=2^255-20.
1101  Thus 0<=r+19(2^-255)r<r+19(2^-255)2^255<=2^255-1.
1102 
1103  Write x=r+19(2^-255)r+y.
1104  Then 0<x<2^255 so floor(2^(-255)x) = 0 so floor(q+2^(-255)x) = q.
1105 
1106  Have q+2^(-255)x = 2^(-255)(h + 19 2^(-25) h9 + 2^(-1))
1107  so floor(2^(-255)(h + 19 2^(-25) h9 + 2^(-1))) = q.
1108  */
1109 private ubyte[32] fe_tobytes(in ref fe h) pure
1110 {
1111 	int h0 = h[0];
1112 	int h1 = h[1];
1113 	int h2 = h[2];
1114 	int h3 = h[3];
1115 	int h4 = h[4];
1116 	int h5 = h[5];
1117 	int h6 = h[6];
1118 	int h7 = h[7];
1119 	int h8 = h[8];
1120 	int h9 = h[9];
1121 	int q;
1122 	int carry0;
1123 	int carry1;
1124 	int carry2;
1125 	int carry3;
1126 	int carry4;
1127 	int carry5;
1128 	int carry6;
1129 	int carry7;
1130 	int carry8;
1131 	int carry9;
1132 
1133 	q = (19 * h9 + ((cast(int) 1) << 24)) >> 25;
1134 	q = (h0 + q) >> 26;
1135 	q = (h1 + q) >> 25;
1136 	q = (h2 + q) >> 26;
1137 	q = (h3 + q) >> 25;
1138 	q = (h4 + q) >> 26;
1139 	q = (h5 + q) >> 25;
1140 	q = (h6 + q) >> 26;
1141 	q = (h7 + q) >> 25;
1142 	q = (h8 + q) >> 26;
1143 	q = (h9 + q) >> 25;
1144 
1145 	/* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */
1146 	h0 += 19 * q;
1147 	/* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */
1148 	
1149 	carry0 = h0 >> 26; h1 += carry0; h0 -= carry0 << 26;
1150 	carry1 = h1 >> 25; h2 += carry1; h1 -= carry1 << 25;
1151 	carry2 = h2 >> 26; h3 += carry2; h2 -= carry2 << 26;
1152 	carry3 = h3 >> 25; h4 += carry3; h3 -= carry3 << 25;
1153 	carry4 = h4 >> 26; h5 += carry4; h4 -= carry4 << 26;
1154 	carry5 = h5 >> 25; h6 += carry5; h5 -= carry5 << 25;
1155 	carry6 = h6 >> 26; h7 += carry6; h6 -= carry6 << 26;
1156 	carry7 = h7 >> 25; h8 += carry7; h7 -= carry7 << 25;
1157 	carry8 = h8 >> 26; h9 += carry8; h8 -= carry8 << 26;
1158 	carry9 = h9 >> 25;               h9 -= carry9 << 25;
1159 	/* h10 = carry9 */
1160 	
1161 	/*
1162 	 Goal: Output h0+...+2^255 h10-2^255 q, which is between 0 and 2^255-20.
1163 	 Have h0+...+2^230 h9 between 0 and 2^255-1;
1164 	 evidently 2^255 h10-2^255 q = 0.
1165 	 Goal: Output h0+...+2^230 h9.
1166 	 */
1167 
1168 	ubyte[32] s;
1169 	s[0] = cast(ubyte) (h0 >> 0);
1170 	s[1] = cast(ubyte) (h0 >> 8);
1171 	s[2] = cast(ubyte) (h0 >> 16);
1172 	s[3] = cast(ubyte) (((h0 >> 24) | (h1 << 2)));
1173 	s[4] = cast(ubyte) (h1 >> 6);
1174 	s[5] = cast(ubyte) (h1 >> 14);
1175 	s[6] = cast(ubyte) (((h1 >> 22) | (h2 << 3)));
1176 	s[7] = cast(ubyte) (h2 >> 5);
1177 	s[8] = cast(ubyte) (h2 >> 13);
1178 	s[9] = cast(ubyte) (((h2 >> 21) | (h3 << 5)));
1179 	s[10] = cast(ubyte) (h3 >> 3);
1180 	s[11] = cast(ubyte) (h3 >> 11);
1181 	s[12] = cast(ubyte) (((h3 >> 19) | (h4 << 6)));
1182 	s[13] = cast(ubyte) (h4 >> 2);
1183 	s[14] = cast(ubyte) (h4 >> 10);
1184 	s[15] = cast(ubyte) (h4 >> 18);
1185 	s[16] = cast(ubyte) (h5 >> 0);
1186 	s[17] = cast(ubyte) (h5 >> 8);
1187 	s[18] = cast(ubyte) (h5 >> 16);
1188 	s[19] = cast(ubyte) (((h5 >> 24) | (h6 << 1)));
1189 	s[20] = cast(ubyte) (h6 >> 7);
1190 	s[21] = cast(ubyte) (h6 >> 15);
1191 	s[22] = cast(ubyte) (((h6 >> 23) | (h7 << 3)));
1192 	s[23] = cast(ubyte) (h7 >> 5);
1193 	s[24] = cast(ubyte) (h7 >> 13);
1194 	s[25] = cast(ubyte) (((h7 >> 21) | (h8 << 4)));
1195 	s[26] = cast(ubyte) (h8 >> 4);
1196 	s[27] = cast(ubyte) (h8 >> 12);
1197 	s[28] = cast(ubyte) (((h8 >> 20) | (h9 << 6)));
1198 	s[29] = cast(ubyte) (h9 >> 2);
1199 	s[30] = cast(ubyte) (h9 >> 10);
1200 	s[31] = cast(ubyte) (h9 >> 18);
1201 
1202 	return s;
1203 }
1204 
1205 /// Conditional swap.
1206 /// Replace (f,g) with (g,f) if b == 1;
1207 /// replace (f,g) with (f,g) if b == 0.
1208 /// Params:
1209 /// b = 0 or 1
1210 void fe_cswap(ref fe f, ref fe g, in uint b)
1211 in {
1212 	assert(b == 0 || b == 1);
1213 } body
1214 {
1215 	// TODO refactor
1216 	immutable uint mask = -b;
1217 
1218 	assert(mask == 0 || mask == 0xFFFFFFFF);
1219 
1220 	f[] ^= mask & g[];
1221 	g[] ^= mask & f[];
1222 	f[] ^= mask & g[];
1223 }
1224 
1225 unittest {
1226 	fe a;
1227 	fe b;
1228 	a[] = 1;
1229 	b[] = 2;
1230 	fe A = 1;
1231 	fe B = 2;
1232 
1233 	fe_cswap(A, B, 0);
1234 	assert(A == a);
1235 	assert(B == b);
1236 	fe_cswap(A, B, 1);
1237 	assert(A == b);
1238 	assert(B == a);
1239 }