1 module dcrypt.crypto.ecc.curve25519;
2 
3 
4 /** 
5  * This module is a port of Bernstein's curve25519 to D.
6  * Original code can be found here: http://code.google.com/p/curve25519-donna/
7  * 
8  * Copyright:
9  *
10  * Copyright 2008, Google Inc.
11  * All rights reserved.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions are
15  * met:
16  *
17  *     * Redistributions of source code must retain the above copyright
18  * notice, this list of conditions and the following disclaimer.
19  *     * Redistributions in binary form must reproduce the above
20  * copyright notice, this list of conditions and the following disclaimer
21  * in the documentation and/or other materials provided with the
22  * distribution.
23  *     * Neither the name of Google Inc. nor the names of its
24  * contributors may be used to endorse or promote products derived from
25  * this software without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38  *
39  * curve25519-donna: Curve25519 elliptic curve, public key function
40  *
41  * http://code.google.com/p/curve25519-donna/
42  *
43  * Adam Langley <agl@imperialviolet.org>
44  *
45  * Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
46  *
47  * More information about curve25519 can be found here
48  *   http://cr.yp.to/ecdh.html
49  *
50  * djb's sample implementation of curve25519 is written in a special assembly
51  * language called qhasm and uses the floating point registers.
52  *
53  * This is, almost, a clean room reimplementation from the curve25519 paper. It
54  * uses many of the tricks described therein. Only the crecip function is taken
55  * from the sample implementation.
56  */
57 
58 /// Generate a public key from a secret. 
59 /// Test vectors from http://cr.yp.to/highspeed/naclcrypto-20090310.pdf
60 unittest {
61 	alias ubyte[32] key_t;
62 	
63 	key_t secretKey = cast(key_t) x"77076d0a7318a57d3c16c17251b26645df4c2f87ebc0992ab177fba51db92c2a";
64 	
65 	key_t publicKey = curve25519(secretKey);
66 	
67 	auto expectedPublic = x"8520f0098930a754748b7ddcb43ef75a0dbf3a0d26381af4eba4a98eaa9b4e6a";
68 	
69 	assert(publicKey == expectedPublic, "curve25519 public key generation failed!");
70 }
71 
72 /// Generate a public key from a secret. 
73 /// Test vectors from http://cr.yp.to/highspeed/naclcrypto-20090310.pdf
74 unittest {
75 	alias ubyte[32] key_t;
76 	
77 	key_t secretKey = cast(key_t) x"5dab087e624a8a4b79e17f8b83800ee66f3bb1292618b6fd1c2f8b27ff88e0eb";
78 	
79 	key_t publicKey = curve25519(secretKey);
80 	
81 	auto expectedPublic = x"de9edb7d7b7dc1b4d35b61c2ece435373f8343c85b78674dadfc7e146f882b4f";
82 	
83 	assert(publicKey == expectedPublic, "curve25519 public key generation failed!");
84 }
85 
86 /// DH key exchange
87 /// Test vectors from http://cr.yp.to/highspeed/naclcrypto-20090310.pdf
88 unittest {
89 	alias ubyte[32] key_t;
90 	
91 	key_t priv1, priv2, pub1, pub2;
92 	priv1[] = cast(key_t) x"77076d0a7318a57d3c16c17251b26645df4c2f87ebc0992ab177fba51db92c2a";
93 	priv2[] = cast(key_t) x"5dab087e624a8a4b79e17f8b83800ee66f3bb1292618b6fd1c2f8b27ff88e0eb";
94 	
95 	pub1 = curve25519(priv1);
96 	pub2 = curve25519(priv2);
97 	
98 	key_t shared1, shared2;
99 	
100 	// Generate the shared keys. Both should be equal.
101 	shared1 = curve25519(priv1, pub2);
102 	shared2 = curve25519(priv2, pub1);
103 	
104 	auto expectedSharedSecret = x"4a5d9d5ba4ce2de1728e3bf480350f25e07e21c947d19e3376f09b3c1e161742";
105 	
106 	assert(shared1 == expectedSharedSecret, "curve25519 DH key agreement failed!");
107 	assert(shared1 == shared2, "curve25519 DH key agreement failed!");
108 }
109 
110 // Test key agreement with random secrets.
111 unittest {
112 	import dcrypt.crypto.random.prng;
113 	import std.algorithm: any;
114 	
115 	DRNG drng = DRNG(0);
116 	
117 	ubyte[32] secret1, secret2, pub1, pub2, shared1, shared2;
118 	
119 	foreach(i;0..32) {
120 		drng.nextBytes(secret1);
121 		drng.nextBytes(secret2);
122 		
123 		pub1 = curve25519(secret1);
124 		pub2 = curve25519(secret2);
125 		
126 		shared1 = curve25519(secret1, pub2);
127 		shared2 = curve25519(secret2, pub1);
128 		
129 		assert(shared1 == shared2, "DH key agreement with curve25519 failed!");
130 		assert(any!"a != 0"(shared1[]), "DH key agreement with curve25519 failed!");
131 	}
132 	
133 }
134 
135 @safe pure nothrow @nogc:
136 
137 public enum publicBasePoint = cast(immutable (ubyte[32]) ) x"0900000000000000000000000000000000000000000000000000000000000000";
138 
139 
140 /// 
141 /// 
142 /// Params:
143 /// secret = your secret key, the 'exponent'
144 /// basepoint = public key. Default: base point 9
145 /// 
146 /// Returns: basepoint^secret.
147 /// 
148 /// Examples:
149 /// 
150 /// ubyte[32] publicKey = curve25519(secretKey);
151 /// 
152 /// ubyte[32] sharedKey = curve25519(mySecretKey, herPublicKey);
153 /// 
154 public ubyte[32] curve25519(in ref ubyte[32] secret, in ref ubyte[32] basepoint = publicBasePoint) 
155 in {
156 	assert(secret.length == 32);
157 	assert(basepoint.length == 32);
158 } body {
159 	limb[10] bp, x, zmone;
160 	limb[10] z; // TODO: was 11 but I think it should be 10
161 	ubyte[32] e;
162 	
163 	e[] = secret[];
164 	
165 	e[0] &= 248;
166 	e[31] &= 127;
167 	e[31] |= 64;
168 	
169 	fexpand(bp, basepoint);
170 	cmult(x, z, e, bp);
171 	crecip(zmone, z);
172 	fmul(z, x, zmone);
173 	
174 	ubyte[32] myPublic;
175 	
176 	fcontract(myPublic, z);
177 	
178 	return myPublic;
179 }
180 
181 private:
182 
183 alias long limb;
184 
185 /* Field element representation:
186  *
187  * Field elements are written as an array of signed, 64-bit limbs, least
188  * significant first. The value of the field element is:
189  *   x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ...
190  *
191  * i.e. the limbs are 26, 25, 26, 25, ... bits wide. */
192 
193 /* Sum two numbers: output += in */
194 void fsum(limb[] output, in limb[] input)
195 {
196 	output[0..10] += input[0..10];
197 }
198 
199 /* Find the difference of two numbers: output = in - output
200  * (note the order of the arguments!). */
201 void fdifference(limb[] output, in limb[] input)
202 {
203 	output[0..10] = input[0..10] - output[0..10];
204 }
205 
206 /* Multiply a number by a scalar: output = in * scalar */
207 void fscalar_product(limb[] output, in limb[] input, in limb scalar)
208 {
209 	output[0..10] = input[0..10] * scalar;
210 }
211 
212 /* Multiply two numbers: output = in2 * in
213  *
214  * output must be distinct to both inputs. The inputs are reduced coefficient
215  * form, the output is not.
216  *
217  * output[x] <= 14 * the largest product of the input limbs. */
218 void fproduct(limb[] output, in limb[] in2, in limb[] in1) 
219 in {
220 	assert(output.length == 19);
221 	assert(in1.length >= 10);
222 	assert(in2.length >= 10);
223 } body {
224 	output[0] =       (cast(limb) (cast(int) in2[0])) * (cast(int) in1[0]);
225 	output[1] =       (cast(limb) (cast(int) in2[0])) * (cast(int) in1[1]) +
226 		(cast(limb) (cast(int) in2[1])) * (cast(int) in1[0]);
227 	output[2] =  2 *  (cast(limb) (cast(int) in2[1])) * (cast(int) in1[1]) +
228 		(cast(limb) (cast(int) in2[0])) * (cast(int) in1[2]) +
229 			(cast(limb) (cast(int) in2[2])) * (cast(int) in1[0]);
230 	output[3] =       (cast(limb) (cast(int) in2[1])) * (cast(int) in1[2]) +
231 		(cast(limb) (cast(int) in2[2])) * (cast(int) in1[1]) +
232 			(cast(limb) (cast(int) in2[0])) * (cast(int) in1[3]) +
233 			(cast(limb) (cast(int) in2[3])) * (cast(int) in1[0]);
234 	output[4] =       (cast(limb) (cast(int) in2[2])) * (cast(int) in1[2]) +
235 		2 * ((cast(limb) (cast(int) in2[1])) * (cast(int) in1[3]) +
236 			(cast(limb) (cast(int) in2[3])) * (cast(int) in1[1])) +
237 			(cast(limb) (cast(int) in2[0])) * (cast(int) in1[4]) +
238 			(cast(limb) (cast(int) in2[4])) * (cast(int) in1[0]);
239 	output[5] =       (cast(limb) (cast(int) in2[2])) * (cast(int) in1[3]) +
240 		(cast(limb) (cast(int) in2[3])) * (cast(int) in1[2]) +
241 			(cast(limb) (cast(int) in2[1])) * (cast(int) in1[4]) +
242 			(cast(limb) (cast(int) in2[4])) * (cast(int) in1[1]) +
243 			(cast(limb) (cast(int) in2[0])) * (cast(int) in1[5]) +
244 			(cast(limb) (cast(int) in2[5])) * (cast(int) in1[0]);
245 	output[6] =  2 * ((cast(limb) (cast(int) in2[3])) * (cast(int) in1[3]) +
246 		(cast(limb) (cast(int) in2[1])) * (cast(int) in1[5]) +
247 		(cast(limb) (cast(int) in2[5])) * (cast(int) in1[1])) +
248 		(cast(limb) (cast(int) in2[2])) * (cast(int) in1[4]) +
249 			(cast(limb) (cast(int) in2[4])) * (cast(int) in1[2]) +
250 			(cast(limb) (cast(int) in2[0])) * (cast(int) in1[6]) +
251 			(cast(limb) (cast(int) in2[6])) * (cast(int) in1[0]);
252 	output[7] =       (cast(limb) (cast(int) in2[3])) * (cast(int) in1[4]) +
253 		(cast(limb) (cast(int) in2[4])) * (cast(int) in1[3]) +
254 			(cast(limb) (cast(int) in2[2])) * (cast(int) in1[5]) +
255 			(cast(limb) (cast(int) in2[5])) * (cast(int) in1[2]) +
256 			(cast(limb) (cast(int) in2[1])) * (cast(int) in1[6]) +
257 			(cast(limb) (cast(int) in2[6])) * (cast(int) in1[1]) +
258 			(cast(limb) (cast(int) in2[0])) * (cast(int) in1[7]) +
259 			(cast(limb) (cast(int) in2[7])) * (cast(int) in1[0]);
260 	output[8] =       (cast(limb) (cast(int) in2[4])) * (cast(int) in1[4]) +
261 		2 * ((cast(limb) (cast(int) in2[3])) * (cast(int) in1[5]) +
262 			(cast(limb) (cast(int) in2[5])) * (cast(int) in1[3]) +
263 			(cast(limb) (cast(int) in2[1])) * (cast(int) in1[7]) +
264 			(cast(limb) (cast(int) in2[7])) * (cast(int) in1[1])) +
265 			(cast(limb) (cast(int) in2[2])) * (cast(int) in1[6]) +
266 			(cast(limb) (cast(int) in2[6])) * (cast(int) in1[2]) +
267 			(cast(limb) (cast(int) in2[0])) * (cast(int) in1[8]) +
268 			(cast(limb) (cast(int) in2[8])) * (cast(int) in1[0]);
269 	output[9] =       (cast(limb) (cast(int) in2[4])) * (cast(int) in1[5]) +
270 		(cast(limb) (cast(int) in2[5])) * (cast(int) in1[4]) +
271 			(cast(limb) (cast(int) in2[3])) * (cast(int) in1[6]) +
272 			(cast(limb) (cast(int) in2[6])) * (cast(int) in1[3]) +
273 			(cast(limb) (cast(int) in2[2])) * (cast(int) in1[7]) +
274 			(cast(limb) (cast(int) in2[7])) * (cast(int) in1[2]) +
275 			(cast(limb) (cast(int) in2[1])) * (cast(int) in1[8]) +
276 			(cast(limb) (cast(int) in2[8])) * (cast(int) in1[1]) +
277 			(cast(limb) (cast(int) in2[0])) * (cast(int) in1[9]) +
278 			(cast(limb) (cast(int) in2[9])) * (cast(int) in1[0]);
279 	output[10] = 2 * ((cast(limb) (cast(int) in2[5])) * (cast(int) in1[5]) +
280 		(cast(limb) (cast(int) in2[3])) * (cast(int) in1[7]) +
281 		(cast(limb) (cast(int) in2[7])) * (cast(int) in1[3]) +
282 		(cast(limb) (cast(int) in2[1])) * (cast(int) in1[9]) +
283 		(cast(limb) (cast(int) in2[9])) * (cast(int) in1[1])) +
284 		(cast(limb) (cast(int) in2[4])) * (cast(int) in1[6]) +
285 			(cast(limb) (cast(int) in2[6])) * (cast(int) in1[4]) +
286 			(cast(limb) (cast(int) in2[2])) * (cast(int) in1[8]) +
287 			(cast(limb) (cast(int) in2[8])) * (cast(int) in1[2]);
288 	output[11] =      (cast(limb) (cast(int) in2[5])) * (cast(int) in1[6]) +
289 		(cast(limb) (cast(int) in2[6])) * (cast(int) in1[5]) +
290 			(cast(limb) (cast(int) in2[4])) * (cast(int) in1[7]) +
291 			(cast(limb) (cast(int) in2[7])) * (cast(int) in1[4]) +
292 			(cast(limb) (cast(int) in2[3])) * (cast(int) in1[8]) +
293 			(cast(limb) (cast(int) in2[8])) * (cast(int) in1[3]) +
294 			(cast(limb) (cast(int) in2[2])) * (cast(int) in1[9]) +
295 			(cast(limb) (cast(int) in2[9])) * (cast(int) in1[2]);
296 	output[12] =      (cast(limb) (cast(int) in2[6])) * (cast(int) in1[6]) +
297 		2 * ((cast(limb) (cast(int) in2[5])) * (cast(int) in1[7]) +
298 			(cast(limb) (cast(int) in2[7])) * (cast(int) in1[5]) +
299 			(cast(limb) (cast(int) in2[3])) * (cast(int) in1[9]) +
300 			(cast(limb) (cast(int) in2[9])) * (cast(int) in1[3])) +
301 			(cast(limb) (cast(int) in2[4])) * (cast(int) in1[8]) +
302 			(cast(limb) (cast(int) in2[8])) * (cast(int) in1[4]);
303 	output[13] =      (cast(limb) (cast(int) in2[6])) * (cast(int) in1[7]) +
304 		(cast(limb) (cast(int) in2[7])) * (cast(int) in1[6]) +
305 			(cast(limb) (cast(int) in2[5])) * (cast(int) in1[8]) +
306 			(cast(limb) (cast(int) in2[8])) * (cast(int) in1[5]) +
307 			(cast(limb) (cast(int) in2[4])) * (cast(int) in1[9]) +
308 			(cast(limb) (cast(int) in2[9])) * (cast(int) in1[4]);
309 	output[14] = 2 * ((cast(limb) (cast(int) in2[7])) * (cast(int) in1[7]) +
310 		(cast(limb) (cast(int) in2[5])) * (cast(int) in1[9]) +
311 		(cast(limb) (cast(int) in2[9])) * (cast(int) in1[5])) +
312 		(cast(limb) (cast(int) in2[6])) * (cast(int) in1[8]) +
313 			(cast(limb) (cast(int) in2[8])) * (cast(int) in1[6]);
314 	output[15] =      (cast(limb) (cast(int) in2[7])) * (cast(int) in1[8]) +
315 		(cast(limb) (cast(int) in2[8])) * (cast(int) in1[7]) +
316 			(cast(limb) (cast(int) in2[6])) * (cast(int) in1[9]) +
317 			(cast(limb) (cast(int) in2[9])) * (cast(int) in1[6]);
318 	output[16] =      (cast(limb) (cast(int) in2[8])) * (cast(int) in1[8]) +
319 		2 * ((cast(limb) (cast(int) in2[7])) * (cast(int) in1[9]) +
320 			(cast(limb) (cast(int) in2[9])) * (cast(int) in1[7]));
321 	output[17] =      (cast(limb) (cast(int) in2[8])) * (cast(int) in1[9]) +
322 		(cast(limb) (cast(int) in2[9])) * (cast(int) in1[8]);
323 	output[18] = 2 *  (cast(limb) (cast(int) in2[9])) * (cast(int) in1[9]);
324 }
325 
326 /* Reduce a long form to a short form by taking the input mod 2^255 - 19.
327  *
328  * On entry: |output[i]| < 14*2^54
329  * On exit: |output[0..8]| < 280*2^54 */
330 void freduce_degree(ref limb[19] output) {
331 	/* Each of these shifts and adds ends up multiplying the value by 19.
332 	 *
333 	 * For output[0..8], the absolute entry value is < 14*2^54 and we add, at
334 	 * most, 19*14*2^54 thus, on exit, |output[0..8]| < 280*2^54. */
335 	output[8] += output[18] << 4;
336 	output[8] += output[18] << 1;
337 	output[8] += output[18];
338 	output[7] += output[17] << 4;
339 	output[7] += output[17] << 1;
340 	output[7] += output[17];
341 	output[6] += output[16] << 4;
342 	output[6] += output[16] << 1;
343 	output[6] += output[16];
344 	output[5] += output[15] << 4;
345 	output[5] += output[15] << 1;
346 	output[5] += output[15];
347 	output[4] += output[14] << 4;
348 	output[4] += output[14] << 1;
349 	output[4] += output[14];
350 	output[3] += output[13] << 4;
351 	output[3] += output[13] << 1;
352 	output[3] += output[13];
353 	output[2] += output[12] << 4;
354 	output[2] += output[12] << 1;
355 	output[2] += output[12];
356 	output[1] += output[11] << 4;
357 	output[1] += output[11] << 1;
358 	output[1] += output[11];
359 	output[0] += output[10] << 4;
360 	output[0] += output[10] << 1;
361 	output[0] += output[10];
362 }
363 
364 static assert((-1 & 3) == 3, "This code only works on a two's complement system");
365 
366 /* return v / 2^26, using only shifts and adds.
367  *
368  * On entry: v can take any value. */
369 limb div_by_2_26(in limb v)
370 {
371 	/* High word of v; no shift needed. */
372 	uint highword = cast(uint) (cast(ulong) v >> 32);
373 	/* Set to all 1s if v was negative; else set to 0s. */
374 	int sign = (cast (int) highword) >> 31;
375 	/* Set to 0x3ffffff if v was negative; else set to 0. */
376 	int roundoff = (cast(uint) sign) >> 6;
377 	/* Should return v / (1<<26) */
378 	return (v + roundoff) >> 26;
379 }
380 
381 /* return v / (2^25), using only shifts and adds.
382  *
383  * On entry: v can take any value. */
384 limb div_by_2_25(in limb v)
385 {
386 	/* High word of v; no shift needed*/
387 	uint highword = cast(uint) ((cast(ulong) v) >> 32);
388 	/* Set to all 1s if v was negative; else set to 0s. */
389 	int sign = (cast(int) highword) >> 31;
390 	/* Set to 0x1ffffff if v was negative; else set to 0. */
391 	int roundoff = (cast(uint) sign) >> 7;
392 	/* Should return v / (1<<25) */
393 	return (v + roundoff) >> 25;
394 }
395 
396 /* Reduce all coefficients of the short form input so that |x| < 2^26.
397  *
398  * On entry: |output[i]| < 280*2^54 */
399 void freduce_coefficients(ref limb[19] output)
400 {
401 
402 	output[10] = 0;
403 	
404 	for (uint i = 0; i < 10; i += 2) {
405 		limb over = div_by_2_26(output[i]);
406 		/* The entry condition (that |output[i]| < 280*2^54) means that over is, at
407 		 * most, 280*2^28 in the first iteration of this loop. This is added to the
408 		 * next limb and we can approximate the resulting bound of that limb by
409 		 * 281*2^54. */
410 		output[i] -= over << 26;
411 		output[i+1] += over;
412 		
413 		/* For the first iteration, |output[i+1]| < 281*2^54, thus |over| <
414 		 * 281*2^29. When this is added to the next limb, the resulting bound can
415 		 * be approximated as 281*2^54.
416 		 *
417 		 * For subsequent iterations of the loop, 281*2^54 remains a conservative
418 		 * bound and no overflow occurs. */
419 		over = div_by_2_25(output[i+1]);
420 		output[i+1] -= over << 25;
421 		output[i+2] += over;
422 	}
423 	/* Now |output[10]| < 281*2^29 and all other coefficients are reduced. */
424 	output[0] += output[10] << 4;
425 	output[0] += output[10] << 1;
426 	output[0] += output[10];
427 	
428 	output[10] = 0;
429 	
430 	/* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19*281*2^29
431 	 * So |over| will be no more than 2^16. */
432 	{
433 		limb over = div_by_2_26(output[0]);
434 		output[0] -= over << 26;
435 		output[1] += over;
436 	}
437 	
438 	/* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 2^16 < 2^26. The
439 	 * bound on |output[1]| is sufficient to meet our needs. */
440 }
441 
442 /* A helpful wrapper around fproduct: output = in * in2.
443  *
444  * On entry: |in[i]| < 2^27 and |in2[i]| < 2^27.
445  *
446  * output must be distinct to both inputs. The output is reduced degree
447  * (indeed, one need only provide storage for 10 limbs) and |output[i]| < 2^26. */
448 
449 void fmul(ref limb[10] output, in ref limb[10] in1, in ref limb[10] in2)
450 {
451 	limb[19] t;
452 	fproduct(t, in1, in2);
453 	/* |t[i]| < 14*2^54 */
454 	freduce_degree(t);
455 	freduce_coefficients(t);
456 	/* |t[i]| < 2^26 */
457 	output[0..10] = t[0..10];
458 }
459 
460 /* Square a number: output = in**2
461  *
462  * output must be distinct from the input. The inputs are reduced coefficient
463  * form, the output is not.
464  *
465  * output[x] <= 14 * the largest product of the input limbs. */
466 void fsquare_inner(limb[] output, in limb[] input) {
467 	output[0] =       (cast(limb) (cast(int) input[0])) * (cast(int) input[0]);
468 	output[1] =  2 *  (cast(limb) (cast(int) input[0])) * (cast(int) input[1]);
469 	output[2] =  2 * ((cast(limb) (cast(int) input[1])) * (cast(int) input[1]) +
470 		(cast(limb) (cast(int) input[0])) * (cast(int) input[2]));
471 	output[3] =  2 * ((cast(limb) (cast(int) input[1])) * (cast(int) input[2]) +
472 		(cast(limb) (cast(int) input[0])) * (cast(int) input[3]));
473 	output[4] =       (cast(limb) (cast(int) input[2])) * (cast(int) input[2]) +
474 		4 *  (cast(limb) (cast(int) input[1])) * (cast(int) input[3]) +
475 			2 *  (cast(limb) (cast(int) input[0])) * (cast(int) input[4]);
476 	output[5] =  2 * ((cast(limb) (cast(int) input[2])) * (cast(int) input[3]) +
477 		(cast(limb) (cast(int) input[1])) * (cast(int) input[4]) +
478 		(cast(limb) (cast(int) input[0])) * (cast(int) input[5]));
479 	output[6] =  2 * ((cast(limb) (cast(int) input[3])) * (cast(int) input[3]) +
480 		(cast(limb) (cast(int) input[2])) * (cast(int) input[4]) +
481 		(cast(limb) (cast(int) input[0])) * (cast(int) input[6]) +
482 		2 *  (cast(limb) (cast(int) input[1])) * (cast(int) input[5]));
483 	output[7] =  2 * ((cast(limb) (cast(int) input[3])) * (cast(int) input[4]) +
484 		(cast(limb) (cast(int) input[2])) * (cast(int) input[5]) +
485 		(cast(limb) (cast(int) input[1])) * (cast(int) input[6]) +
486 		(cast(limb) (cast(int) input[0])) * (cast(int) input[7]));
487 	output[8] =       (cast(limb) (cast(int) input[4])) * (cast(int) input[4]) +
488 		2 * ((cast(limb) (cast(int) input[2])) * (cast(int) input[6]) +
489 			(cast(limb) (cast(int) input[0])) * (cast(int) input[8]) +
490 			2 * ((cast(limb) (cast(int) input[1])) * (cast(int) input[7]) +
491 				(cast(limb) (cast(int) input[3])) * (cast(int) input[5])));
492 	output[9] =  2 * ((cast(limb) (cast(int) input[4])) * (cast(int) input[5]) +
493 		(cast(limb) (cast(int) input[3])) * (cast(int) input[6]) +
494 		(cast(limb) (cast(int) input[2])) * (cast(int) input[7]) +
495 		(cast(limb) (cast(int) input[1])) * (cast(int) input[8]) +
496 		(cast(limb) (cast(int) input[0])) * (cast(int) input[9]));
497 	output[10] = 2 * ((cast(limb) (cast(int) input[5])) * (cast(int) input[5]) +
498 		(cast(limb) (cast(int) input[4])) * (cast(int) input[6]) +
499 		(cast(limb) (cast(int) input[2])) * (cast(int) input[8]) +
500 		2 * ((cast(limb) (cast(int) input[3])) * (cast(int) input[7]) +
501 			(cast(limb) (cast(int) input[1])) * (cast(int) input[9])));
502 	output[11] = 2 * ((cast(limb) (cast(int) input[5])) * (cast(int) input[6]) +
503 		(cast(limb) (cast(int) input[4])) * (cast(int) input[7]) +
504 		(cast(limb) (cast(int) input[3])) * (cast(int) input[8]) +
505 		(cast(limb) (cast(int) input[2])) * (cast(int) input[9]));
506 	output[12] =      (cast(limb) (cast(int) input[6])) * (cast(int) input[6]) +
507 		2 * ((cast(limb) (cast(int) input[4])) * (cast(int) input[8]) +
508 			2 * ((cast(limb) (cast(int) input[5])) * (cast(int) input[7]) +
509 				(cast(limb) (cast(int) input[3])) * (cast(int) input[9])));
510 	output[13] = 2 * ((cast(limb) (cast(int) input[6])) * (cast(int) input[7]) +
511 		(cast(limb) (cast(int) input[5])) * (cast(int) input[8]) +
512 		(cast(limb) (cast(int) input[4])) * (cast(int) input[9]));
513 	output[14] = 2 * ((cast(limb) (cast(int) input[7])) * (cast(int) input[7]) +
514 		(cast(limb) (cast(int) input[6])) * (cast(int) input[8]) +
515 		2 *  (cast(limb) (cast(int) input[5])) * (cast(int) input[9]));
516 	output[15] = 2 * ((cast(limb) (cast(int) input[7])) * (cast(int) input[8]) +
517 		(cast(limb) (cast(int) input[6])) * (cast(int) input[9]));
518 	output[16] =      (cast(limb) (cast(int) input[8])) * (cast(int) input[8]) +
519 		4 *  (cast(limb) (cast(int) input[7])) * (cast(int) input[9]);
520 	output[17] = 2 *  (cast(limb) (cast(int) input[8])) * (cast(int) input[9]);
521 	output[18] = 2 *  (cast(limb) (cast(int) input[9])) * (cast(int) input[9]);
522 }
523 
524 /* fsquare sets output = in^2.
525  *
526  * On entry: The |in| argument is in reduced coefficients form and |in[i]| <
527  * 2^27.
528  *
529  * On exit: The |output| argument is in reduced coefficients form (indeed, one
530  * need only provide storage for 10 limbs) and |out[i]| < 2^26. */
531 void fsquare(limb[] output, in limb[] input) {
532 	limb[19] t;
533 	fsquare_inner(t, input);
534 	/* |t[i]| < 14*2^54 because the largest product of two limbs will be <
535 	 * 2^(27+27) and fsquare_inner adds together, at most, 14 of those
536 	 * products. */
537 	freduce_degree(t);
538 	freduce_coefficients(t);
539 	/* |t[i]| < 2^26 */
540 	output[0..10] = t[0..10];
541 }
542 
543 /* Take a little-endian, 32-byte number and expand it into polynomial form */
544 void fexpand(ref limb[10] output, in ref ubyte[32] input) 
545 {
546 
547 	void F(uint n, uint start, uint shift, uint mask)(limb[] output, in ubyte[] input) {
548 		output[n] = ((input[start + 0] | 
549 				(input[start + 1]) << 8 | 
550 				(input[start + 2]) << 16 | 
551 				(input[start + 3]) << 24) >> shift) & mask;
552 	}
553 
554 	F!(0, 0, 0, 0x3ffffff)(output, input);
555 	F!(1, 3, 2, 0x1ffffff)(output, input);
556 	F!(2, 6, 3, 0x3ffffff)(output, input);
557 	F!(3, 9, 5, 0x1ffffff)(output, input);
558 	F!(4, 12, 6, 0x3ffffff)(output, input);
559 	F!(5, 16, 0, 0x1ffffff)(output, input);
560 	F!(6, 19, 1, 0x3ffffff)(output, input);
561 	F!(7, 22, 3, 0x1ffffff)(output, input);
562 	F!(8, 25, 4, 0x3ffffff)(output, input);
563 	F!(9, 28, 6, 0x1ffffff)(output, input);
564 
565 }
566 
567 static assert((-32 >> 1) == -16, "This code only works when >> does sign-extension on negative numbers");
568 
569 /* int_eq returns 0xffffffff iff a == b and zero otherwise. */
570 int int_eq(int a, int b) {
571 	a = ~(a ^ b);
572 	a &= a << 16;
573 	a &= a << 8;
574 	a &= a << 4;
575 	a &= a << 2;
576 	a &= a << 1;
577 	return a >> 31;
578 }
579 
580 /* int_gte returns 0xffffffff if a >= b and zero otherwise, where a and b are
581  * both non-negative. */
582 int int_gte(int a, int b) {
583 	a -= b;
584 	/* a >= 0 iff a >= b. */
585 	return ~(a >> 31);
586 }
587 
588 /* Take a fully reduced polynomial form number and contract it into a
589  * little-endian, 32-byte array.
590  *
591  * On entry: |input_limbs[i]| < 2^26 */
592 void fcontract(ref ubyte[32] output, in ref limb[10] input_limbs) 
593 in {
594 	assert(output.length == 32);
595 	assert(input_limbs.length == 10);
596 } body {
597 	int[10] input;
598 	int _mask;
599 	
600 	/* |input_limbs[i]| < 2^26, so it's valid to convert to an int. */
601 	for (uint i = 0; i < 10; i++) {
602 		input[i] = cast(uint) input_limbs[i];
603 	}
604 
605 	for (uint j = 0; j < 2; ++j) {
606 		for (uint i = 0; i < 9; ++i) {
607 			if ((i & 1) == 1) {
608 				/* This calculation is a time-invariant way to make input[i]
609 				 * non-negative by borrowing from the next-larger limb. */
610 				immutable int mask = input[i] >> 31;
611 				immutable int carry = -((input[i] & mask) >> 25);
612 				input[i] = input[i] + (carry << 25);
613 				input[i+1] = input[i+1] - carry;
614 			} else {
615 				immutable int mask = input[i] >> 31;
616 				immutable int carry = -((input[i] & mask) >> 26);
617 				input[i] = input[i] + (carry << 26);
618 				input[i+1] = input[i+1] - carry;
619 			}
620 		}
621 		
622 		/* There's no greater limb for input[9] to borrow from, but we can multiply
623 		 * by 19 and borrow from input[0], which is valid mod 2^255-19. */
624 		{
625 			immutable int mask = input[9] >> 31;
626 			immutable int carry = -((input[9] & mask) >> 25);
627 			input[9] = input[9] + (carry << 25);
628 			input[0] = input[0] - (carry * 19);
629 		}
630 		
631 		/* After the first iteration, input[1..9] are non-negative and fit within
632 		 * 25 or 26 bits, depending on position. However, input[0] may be
633 		 * negative. */
634 	}
635 	
636 	/* The first borrow-propagation pass above ended with every limb
637 	 except (possibly) input[0] non-negative.
638 
639 	 If input[0] was negative after the first pass, then it was because of a
640 	 carry from input[9]. On entry, input[9] < 2^26 so the carry was, at most,
641 	 one, since (2**26-1) >> 25 = 1. Thus input[0] >= -19.
642 
643 	 In the second pass, each limb is decreased by at most one. Thus the second
644 	 borrow-propagation pass could only have wrapped around to decrease
645 	 input[0] again if the first pass left input[0] negative *and* input[1]
646 	 through input[9] were all zero.  In that case, input[1] is now 2^25 - 1,
647 	 and this last borrow-propagation step will leave input[1] non-negative. */
648 	{
649 		immutable int mask = input[0] >> 31;
650 		immutable int carry = -((input[0] & mask) >> 26);
651 		input[0] = input[0] + (carry << 26);
652 		input[1] = input[1] - carry;
653 	}
654 	
655 	/* All input[i] are now non-negative. However, there might be values between
656 	 * 2^25 and 2^26 in a limb which is, nominally, 25 bits wide. */
657 	for (uint j = 0; j < 2; j++) {
658 		for (uint i = 0; i < 9; i++) {
659 			if ((i & 1) == 1) {
660 				immutable int carry = input[i] >> 25;
661 				input[i] &= 0x1ffffff;
662 				input[i+1] += carry;
663 			} else {
664 				immutable int carry = input[i] >> 26;
665 				input[i] &= 0x3ffffff;
666 				input[i+1] += carry;
667 			}
668 		}
669 		
670 		{
671 			immutable int carry = input[9] >> 25;
672 			input[9] &= 0x1ffffff;
673 			input[0] += 19*carry;
674 		}
675 	}
676 	
677 	/* If the first carry-chain pass, just above, ended up with a carry from
678 	 * input[9], and that caused input[0] to be out-of-bounds, then input[0] was
679 	 * < 2^26 + 2*19, because the carry was, at most, two.
680 	 *
681 	 * If the second pass carried from input[9] again then input[0] is < 2*19 and
682 	 * the input[9] -> input[0] carry didn't push input[0] out of bounds. */
683 	
684 	/* It still remains the case that input might be between 2^255-19 and 2^255.
685 	 * In this case, input[1..9] must take their maximum value and input[0] must
686 	 * be >= (2^255-19) & 0x3ffffff, which is 0x3ffffed. */
687 	_mask = int_gte(input[0], 0x3ffffed);
688 	for (uint i = 1; i < 10; i++) {
689 		if ((i & 1) == 1) {
690 			_mask &= int_eq(input[i], 0x1ffffff);
691 		} else {
692 			_mask &= int_eq(input[i], 0x3ffffff);
693 		}
694 	}
695 	
696 	/* mask is either 0xffffffff (if input >= 2^255-19) and zero otherwise. Thus
697 	 * this conditionally subtracts 2^255-19. */
698 	input[0] -= _mask & 0x3ffffed;
699 	
700 	for (uint i = 1; i < 10; i++) {
701 		if ((i & 1) == 1) {
702 			input[i] -= _mask & 0x1ffffff;
703 		} else {
704 			input[i] -= _mask & 0x3ffffff;
705 		}
706 	}
707 	
708 	input[1] <<= 2;
709 	input[2] <<= 3;
710 	input[3] <<= 5;
711 	input[4] <<= 6;
712 	input[6] <<= 1;
713 	input[7] <<= 3;
714 	input[8] <<= 4;
715 	input[9] <<= 6;
716 
717 	void F(uint i, uint s)() {
718 		output[s+0] |=  input[i] & 0xff; 
719 		output[s+1]  = (input[i] >> 8) & 0xff; 
720 		output[s+2]  = (input[i] >> 16) & 0xff; 
721 		output[s+3]  = (input[i] >> 24) & 0xff;
722 	}
723 
724 	output[0] = 0;
725 	output[16] = 0;
726 	F!(0,0)();
727 	F!(1,3)();
728 	F!(2,6)();
729 	F!(3,9)();
730 	F!(4,12)();
731 	F!(5,16)();
732 	F!(6,19)();
733 	F!(7,22)();
734 	F!(8,25)();
735 	F!(9,28)();
736 }
737 
738 /* Input: Q, Q', Q-Q'
739  * Output: 2Q, Q+Q'
740  *
741  *   x2 z3: long form
742  *   x3 z3: long form
743  *   x z: short form, destroyed
744  *   xprime zprime: short form, destroyed
745  *   qmqp: short form, preserved
746  *
747  * On entry and exit, the absolute value of the limbs of all inputs and outputs
748  * are < 2^26. */
749 void fmonty(ref limb[19] x2, ref limb[19] z2,  /* output 2Q */
750 	ref limb[19] x3, ref limb[19] z3,  /* output Q + Q' */
751 	ref limb[19] x, ref limb[19] z,    /* input Q */
752 	ref limb[19] xprime, ref limb[19] zprime,  /* input Q' */
753 	in ref limb[10] qmqp /* input Q - Q' */) 
754 {
755 
756 	limb[10] origx, origxprime;
757 	limb[19] zzz, xx, zz, xxprime,
758 		zzprime, zzzprime, xxxprime;
759 	
760 	//memcpy(origx, x, 10 * sizeofcast(limb));
761 	origx[0..10] = x[0..10];
762 
763 	fsum(x, z);
764 	/* |x[i]| < 2^27 */
765 	fdifference(z, origx);  /* does x - z */
766 	/* |z[i]| < 2^27 */
767 	
768 	//memcpy(origxprime, xprime, sizeofcast(limb) * 10);
769 	origxprime[0..10] = xprime[0..10];
770 
771 	fsum(xprime, zprime);
772 	/* |xprime[i]| < 2^27 */
773 	fdifference(zprime, origxprime);
774 	/* |zprime[i]| < 2^27 */
775 	fproduct(xxprime, xprime, z);
776 	/* |xxprime[i]| < 14*2^54: the largest product of two limbs will be <
777 	 * 2^(27+27) and fproduct adds together, at most, 14 of those products.
778 	 * (Approximating that to 2^58 doesn't work out.) */
779 	fproduct(zzprime, x, zprime);
780 	/* |zzprime[i]| < 14*2^54 */
781 	freduce_degree(xxprime);
782 	freduce_coefficients(xxprime);
783 	/* |xxprime[i]| < 2^26 */
784 	freduce_degree(zzprime);
785 	freduce_coefficients(zzprime);
786 	/* |zzprime[i]| < 2^26 */
787 	//memcpy(origxprime, xxprime, sizeofcast(limb) * 10);
788 	origxprime[0..10] = xxprime[0..10];
789 	fsum(xxprime, zzprime);
790 	/* |xxprime[i]| < 2^27 */
791 	fdifference(zzprime, origxprime);
792 	/* |zzprime[i]| < 2^27 */
793 	fsquare(xxxprime, xxprime);
794 	/* |xxxprime[i]| < 2^26 */
795 	fsquare(zzzprime, zzprime);
796 	/* |zzzprime[i]| < 2^26 */
797 	fproduct(zzprime, zzzprime, qmqp);
798 	/* |zzprime[i]| < 14*2^52 */
799 	freduce_degree(zzprime);
800 	freduce_coefficients(zzprime);
801 	/* |zzprime[i]| < 2^26 */
802 	//memcpy(x3, xxxprime, sizeofcast(limb) * 10);
803 	x3[0..10] = xxxprime[0..10];
804 	//memcpy(z3, zzprime, sizeofcast(limb) * 10);
805 	z3[0..10] = zzprime[0..10];
806 	
807 	fsquare(xx, x);
808 	/* |xx[i]| < 2^26 */
809 	fsquare(zz, z);
810 	/* |zz[i]| < 2^26 */
811 	fproduct(x2, xx, zz);
812 	/* |x2[i]| < 14*2^52 */
813 	freduce_degree(x2);
814 	freduce_coefficients(x2);
815 	/* |x2[i]| < 2^26 */
816 	fdifference(zz, xx);  // does zz = xx - zz
817 	/* |zz[i]| < 2^27 */
818 	//memset(zzz + 10, 0, sizeofcast(limb) * 9);
819 	zzz[10..19] = 0;
820 
821 	fscalar_product(zzz, zz, 121665);
822 	/* |zzz[i]| < 2^(27+17) */
823 	/* No need to call freduce_degree here:
824 	 fscalar_product doesn't increase the degree of its input. */
825 	freduce_coefficients(zzz);
826 	/* |zzz[i]| < 2^26 */
827 	fsum(zzz, xx);
828 	/* |zzz[i]| < 2^27 */
829 	fproduct(z2, zz, zzz);
830 	/* |z2[i]| < 14*2^(26+27) */
831 	freduce_degree(z2);
832 	freduce_coefficients(z2);
833 	/* |z2|i| < 2^26 */
834 }
835 
836 /* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave
837  * them unchanged if 'iswap' is 0.  Runs in data-invariant time to avoid
838  * side-channel attacks.
839  *
840  * NOTE that this function requires that 'iswap' be 1 or 0; other values give
841  * wrong results.  Also, the two limb arrays must be in reduced-coefficient,
842  * reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped,
843  * and all all values in a[0..9],b[0..9] must have magnitude less than
844  * INT32_MAX. */
845 void swap_conditional(ref limb[19] a, ref limb[19] b, in limb iswap) 
846 in {
847 	assert(iswap == 0 || iswap == 1, "requires 'iswap' to be 1 or 0");
848 }
849 body {
850 	immutable int swap = cast(int) -iswap;
851 	
852 	for (uint i = 0; i < 10; ++i) {
853 		immutable int x = swap & ( (cast(int)a[i]) ^ (cast(int)b[i]) );
854 		a[i] = (cast(int)a[i]) ^ x;
855 		b[i] = (cast(int)b[i]) ^ x;
856 	}
857 }
858 
859 /* Calculates nQ where Q is the x-coordinate of a point on the curve
860  *
861  *   resultx/resultz: the x coordinate of the resulting curve point (short form)
862  *   n: a little endian, 32-byte number
863  *   q: a point of the curve (short form) */
864 void cmult(ref limb[10] resultx, ref limb[10] resultz, in ref ubyte[32] n, in ref limb[10] q) 
865 {
866 	limb[19] a, b, c, d;
867 	b[0] = 1;
868 	c[0] = 1;
869 
870 	alias nqpqx = a, nqpqz = b, nqx = c, nqz = d;
871 	limb[] t;
872 	limb[19] e, f, g, h;
873 	f[0] = 1;
874 	h[0] = 1;
875 	alias nqpqx2 = e, nqpqz2 = f, nqx2 = g, nqz2 = h;
876 
877 	nqpqx[0..10] = q[0..10];
878 	
879 	for (uint i = 0; i < 32; ++i) {
880 		ubyte byt = n[31 - i];
881 		for (uint j = 0; j < 8; ++j) {
882 			immutable limb bit = byt >> 7;
883 			
884 			swap_conditional(nqx, nqpqx, bit);
885 			swap_conditional(nqz, nqpqz, bit);
886 			fmonty(nqx2, nqz2,
887 				nqpqx2, nqpqz2,
888 				nqx, nqz,
889 				nqpqx, nqpqz,
890 				q);
891 			swap_conditional(nqx2, nqpqx2, bit);
892 			swap_conditional(nqz2, nqpqz2, bit);
893 			
894 			t = nqx;
895 			nqx = nqx2;
896 			nqx2 = t;
897 			t = nqz;
898 			nqz = nqz2;
899 			nqz2 = t;
900 			t = nqpqx;
901 			nqpqx = nqpqx2;
902 			nqpqx2 = t;
903 			t = nqpqz;
904 			nqpqz = nqpqz2;
905 			nqpqz2 = t;
906 
907 			byt <<= 1;
908 		}
909 	}
910 
911 	resultx[0..10] = nqx[0..10];
912 	resultz[0..10] = nqz[0..10];
913 }
914 
915 // -----------------------------------------------------------------------------
916 // Shamelessly copied from djb's code :-)
917 // -----------------------------------------------------------------------------
918 void crecip(ref limb[10] output, in ref limb[10] z) {
919 	limb[10] z2, z9, z11, z2_5_0, z2_10_0, z2_20_0, z2_50_0, z2_100_0, t0, t1;
920 
921 	/* 2 */ fsquare(z2,z);
922 	/* 4 */ fsquare(t1,z2);
923 	/* 8 */ fsquare(t0,t1);
924 	/* 9 */ fmul(z9,t0,z);
925 	/* 11 */ fmul(z11,z9,z2);
926 	/* 22 */ fsquare(t0,z11);
927 	/* 2^5 - 2^0 = 31 */ fmul(z2_5_0,t0,z9);
928 	
929 	/* 2^6 - 2^1 */ fsquare(t0,z2_5_0);
930 	/* 2^7 - 2^2 */ fsquare(t1,t0);
931 	/* 2^8 - 2^3 */ fsquare(t0,t1);
932 	/* 2^9 - 2^4 */ fsquare(t1,t0);
933 	/* 2^10 - 2^5 */ fsquare(t0,t1);
934 	/* 2^10 - 2^0 */ fmul(z2_10_0,t0,z2_5_0);
935 	
936 	/* 2^11 - 2^1 */ fsquare(t0,z2_10_0);
937 	/* 2^12 - 2^2 */ fsquare(t1,t0);
938 	/* 2^20 - 2^10 */ for (uint i = 2;i < 10;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
939 	/* 2^20 - 2^0 */ fmul(z2_20_0,t1,z2_10_0);
940 	
941 	/* 2^21 - 2^1 */ fsquare(t0,z2_20_0);
942 	/* 2^22 - 2^2 */ fsquare(t1,t0);
943 	/* 2^40 - 2^20 */ for (uint i = 2;i < 20;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
944 	/* 2^40 - 2^0 */ fmul(t0,t1,z2_20_0);
945 	
946 	/* 2^41 - 2^1 */ fsquare(t1,t0);
947 	/* 2^42 - 2^2 */ fsquare(t0,t1);
948 	/* 2^50 - 2^10 */ for (uint i = 2;i < 10;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
949 	/* 2^50 - 2^0 */ fmul(z2_50_0,t0,z2_10_0);
950 
951 	/* 2^51 - 2^1 */ fsquare(t0,z2_50_0);
952 	/* 2^52 - 2^2 */ fsquare(t1,t0);
953 	/* 2^100 - 2^50 */ for (uint i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
954 	/* 2^100 - 2^0 */ fmul(z2_100_0,t1,z2_50_0);
955 	
956 	/* 2^101 - 2^1 */ fsquare(t1,z2_100_0);
957 	/* 2^102 - 2^2 */ fsquare(t0,t1);
958 	/* 2^200 - 2^100 */ for (uint i = 2;i < 100;i += 2) { fsquare(t1,t0); fsquare(t0,t1); }
959 	/* 2^200 - 2^0 */ fmul(t1,t0,z2_100_0);
960 	
961 	/* 2^201 - 2^1 */ fsquare(t0,t1);
962 	/* 2^202 - 2^2 */ fsquare(t1,t0);
963 	/* 2^250 - 2^50 */ for (uint i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); }
964 	/* 2^250 - 2^0 */ fmul(t0,t1,z2_50_0);
965 
966 	/* 2^251 - 2^1 */ fsquare(t1,t0);
967 	/* 2^252 - 2^2 */ fsquare(t0,t1);
968 	/* 2^253 - 2^3 */ fsquare(t1,t0);
969 	/* 2^254 - 2^4 */ fsquare(t0,t1);
970 	/* 2^255 - 2^5 */ fsquare(t1,t0);
971 	/* 2^255 - 21 */ fmul(output,t1,z11);
972 }