1 module dcrypt.crypto.ecc.curve25519; 2 3 4 /** 5 * This module is a port of Bernstein's curve25519 to D. 6 * Original code can be found here: http://code.google.com/p/curve25519-donna/ 7 * 8 * Copyright: 9 * 10 * Copyright 2008, Google Inc. 11 * All rights reserved. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions are 15 * met: 16 * 17 * * Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * * Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following disclaimer 21 * in the documentation and/or other materials provided with the 22 * distribution. 23 * * Neither the name of Google Inc. nor the names of its 24 * contributors may be used to endorse or promote products derived from 25 * this software without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38 * 39 * curve25519-donna: Curve25519 elliptic curve, public key function 40 * 41 * http://code.google.com/p/curve25519-donna/ 42 * 43 * Adam Langley <agl@imperialviolet.org> 44 * 45 * Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to> 46 * 47 * More information about curve25519 can be found here 48 * http://cr.yp.to/ecdh.html 49 * 50 * djb's sample implementation of curve25519 is written in a special assembly 51 * language called qhasm and uses the floating point registers. 52 * 53 * This is, almost, a clean room reimplementation from the curve25519 paper. It 54 * uses many of the tricks described therein. Only the crecip function is taken 55 * from the sample implementation. 56 */ 57 58 /// Generate a public key from a secret. 59 /// Test vectors from http://cr.yp.to/highspeed/naclcrypto-20090310.pdf 60 unittest { 61 alias ubyte[32] key_t; 62 63 key_t secretKey = cast(key_t) x"77076d0a7318a57d3c16c17251b26645df4c2f87ebc0992ab177fba51db92c2a"; 64 65 key_t publicKey = curve25519(secretKey); 66 67 auto expectedPublic = x"8520f0098930a754748b7ddcb43ef75a0dbf3a0d26381af4eba4a98eaa9b4e6a"; 68 69 assert(publicKey == expectedPublic, "curve25519 public key generation failed!"); 70 } 71 72 /// Generate a public key from a secret. 73 /// Test vectors from http://cr.yp.to/highspeed/naclcrypto-20090310.pdf 74 unittest { 75 alias ubyte[32] key_t; 76 77 key_t secretKey = cast(key_t) x"5dab087e624a8a4b79e17f8b83800ee66f3bb1292618b6fd1c2f8b27ff88e0eb"; 78 79 key_t publicKey = curve25519(secretKey); 80 81 auto expectedPublic = x"de9edb7d7b7dc1b4d35b61c2ece435373f8343c85b78674dadfc7e146f882b4f"; 82 83 assert(publicKey == expectedPublic, "curve25519 public key generation failed!"); 84 } 85 86 /// DH key exchange 87 /// Test vectors from http://cr.yp.to/highspeed/naclcrypto-20090310.pdf 88 unittest { 89 alias ubyte[32] key_t; 90 91 key_t priv1, priv2, pub1, pub2; 92 priv1[] = cast(key_t) x"77076d0a7318a57d3c16c17251b26645df4c2f87ebc0992ab177fba51db92c2a"; 93 priv2[] = cast(key_t) x"5dab087e624a8a4b79e17f8b83800ee66f3bb1292618b6fd1c2f8b27ff88e0eb"; 94 95 pub1 = curve25519(priv1); 96 pub2 = curve25519(priv2); 97 98 key_t shared1, shared2; 99 100 // Generate the shared keys. Both should be equal. 101 shared1 = curve25519(priv1, pub2); 102 shared2 = curve25519(priv2, pub1); 103 104 auto expectedSharedSecret = x"4a5d9d5ba4ce2de1728e3bf480350f25e07e21c947d19e3376f09b3c1e161742"; 105 106 assert(shared1 == expectedSharedSecret, "curve25519 DH key agreement failed!"); 107 assert(shared1 == shared2, "curve25519 DH key agreement failed!"); 108 } 109 110 // Test key agreement with random secrets. 111 unittest { 112 import dcrypt.crypto.random.prng; 113 import std.algorithm: any; 114 115 DRNG drng = DRNG(0); 116 117 ubyte[32] secret1, secret2, pub1, pub2, shared1, shared2; 118 119 foreach(i;0..32) { 120 drng.nextBytes(secret1); 121 drng.nextBytes(secret2); 122 123 pub1 = curve25519(secret1); 124 pub2 = curve25519(secret2); 125 126 shared1 = curve25519(secret1, pub2); 127 shared2 = curve25519(secret2, pub1); 128 129 assert(shared1 == shared2, "DH key agreement with curve25519 failed!"); 130 assert(any!"a != 0"(shared1[]), "DH key agreement with curve25519 failed!"); 131 } 132 133 } 134 135 @safe pure nothrow @nogc: 136 137 public enum publicBasePoint = cast(immutable (ubyte[32]) ) x"0900000000000000000000000000000000000000000000000000000000000000"; 138 139 140 /// 141 /// 142 /// Params: 143 /// secret = your secret key, the 'exponent' 144 /// basepoint = public key. Default: base point 9 145 /// 146 /// Returns: basepoint^secret. 147 /// 148 /// Examples: 149 /// 150 /// ubyte[32] publicKey = curve25519(secretKey); 151 /// 152 /// ubyte[32] sharedKey = curve25519(mySecretKey, herPublicKey); 153 /// 154 public ubyte[32] curve25519(in ref ubyte[32] secret, in ref ubyte[32] basepoint = publicBasePoint) 155 in { 156 assert(secret.length == 32); 157 assert(basepoint.length == 32); 158 } body { 159 limb[10] bp, x, zmone; 160 limb[10] z; // TODO: was 11 but I think it should be 10 161 ubyte[32] e; 162 163 e[] = secret[]; 164 165 e[0] &= 248; 166 e[31] &= 127; 167 e[31] |= 64; 168 169 fexpand(bp, basepoint); 170 cmult(x, z, e, bp); 171 crecip(zmone, z); 172 fmul(z, x, zmone); 173 174 ubyte[32] myPublic; 175 176 fcontract(myPublic, z); 177 178 return myPublic; 179 } 180 181 private: 182 183 alias long limb; 184 185 /* Field element representation: 186 * 187 * Field elements are written as an array of signed, 64-bit limbs, least 188 * significant first. The value of the field element is: 189 * x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ... 190 * 191 * i.e. the limbs are 26, 25, 26, 25, ... bits wide. */ 192 193 /* Sum two numbers: output += in */ 194 void fsum(limb[] output, in limb[] input) 195 { 196 output[0..10] += input[0..10]; 197 } 198 199 /* Find the difference of two numbers: output = in - output 200 * (note the order of the arguments!). */ 201 void fdifference(limb[] output, in limb[] input) 202 { 203 output[0..10] = input[0..10] - output[0..10]; 204 } 205 206 /* Multiply a number by a scalar: output = in * scalar */ 207 void fscalar_product(limb[] output, in limb[] input, in limb scalar) 208 { 209 output[0..10] = input[0..10] * scalar; 210 } 211 212 /* Multiply two numbers: output = in2 * in 213 * 214 * output must be distinct to both inputs. The inputs are reduced coefficient 215 * form, the output is not. 216 * 217 * output[x] <= 14 * the largest product of the input limbs. */ 218 void fproduct(limb[] output, in limb[] in2, in limb[] in1) 219 in { 220 assert(output.length == 19); 221 assert(in1.length >= 10); 222 assert(in2.length >= 10); 223 } body { 224 output[0] = (cast(limb) (cast(int) in2[0])) * (cast(int) in1[0]); 225 output[1] = (cast(limb) (cast(int) in2[0])) * (cast(int) in1[1]) + 226 (cast(limb) (cast(int) in2[1])) * (cast(int) in1[0]); 227 output[2] = 2 * (cast(limb) (cast(int) in2[1])) * (cast(int) in1[1]) + 228 (cast(limb) (cast(int) in2[0])) * (cast(int) in1[2]) + 229 (cast(limb) (cast(int) in2[2])) * (cast(int) in1[0]); 230 output[3] = (cast(limb) (cast(int) in2[1])) * (cast(int) in1[2]) + 231 (cast(limb) (cast(int) in2[2])) * (cast(int) in1[1]) + 232 (cast(limb) (cast(int) in2[0])) * (cast(int) in1[3]) + 233 (cast(limb) (cast(int) in2[3])) * (cast(int) in1[0]); 234 output[4] = (cast(limb) (cast(int) in2[2])) * (cast(int) in1[2]) + 235 2 * ((cast(limb) (cast(int) in2[1])) * (cast(int) in1[3]) + 236 (cast(limb) (cast(int) in2[3])) * (cast(int) in1[1])) + 237 (cast(limb) (cast(int) in2[0])) * (cast(int) in1[4]) + 238 (cast(limb) (cast(int) in2[4])) * (cast(int) in1[0]); 239 output[5] = (cast(limb) (cast(int) in2[2])) * (cast(int) in1[3]) + 240 (cast(limb) (cast(int) in2[3])) * (cast(int) in1[2]) + 241 (cast(limb) (cast(int) in2[1])) * (cast(int) in1[4]) + 242 (cast(limb) (cast(int) in2[4])) * (cast(int) in1[1]) + 243 (cast(limb) (cast(int) in2[0])) * (cast(int) in1[5]) + 244 (cast(limb) (cast(int) in2[5])) * (cast(int) in1[0]); 245 output[6] = 2 * ((cast(limb) (cast(int) in2[3])) * (cast(int) in1[3]) + 246 (cast(limb) (cast(int) in2[1])) * (cast(int) in1[5]) + 247 (cast(limb) (cast(int) in2[5])) * (cast(int) in1[1])) + 248 (cast(limb) (cast(int) in2[2])) * (cast(int) in1[4]) + 249 (cast(limb) (cast(int) in2[4])) * (cast(int) in1[2]) + 250 (cast(limb) (cast(int) in2[0])) * (cast(int) in1[6]) + 251 (cast(limb) (cast(int) in2[6])) * (cast(int) in1[0]); 252 output[7] = (cast(limb) (cast(int) in2[3])) * (cast(int) in1[4]) + 253 (cast(limb) (cast(int) in2[4])) * (cast(int) in1[3]) + 254 (cast(limb) (cast(int) in2[2])) * (cast(int) in1[5]) + 255 (cast(limb) (cast(int) in2[5])) * (cast(int) in1[2]) + 256 (cast(limb) (cast(int) in2[1])) * (cast(int) in1[6]) + 257 (cast(limb) (cast(int) in2[6])) * (cast(int) in1[1]) + 258 (cast(limb) (cast(int) in2[0])) * (cast(int) in1[7]) + 259 (cast(limb) (cast(int) in2[7])) * (cast(int) in1[0]); 260 output[8] = (cast(limb) (cast(int) in2[4])) * (cast(int) in1[4]) + 261 2 * ((cast(limb) (cast(int) in2[3])) * (cast(int) in1[5]) + 262 (cast(limb) (cast(int) in2[5])) * (cast(int) in1[3]) + 263 (cast(limb) (cast(int) in2[1])) * (cast(int) in1[7]) + 264 (cast(limb) (cast(int) in2[7])) * (cast(int) in1[1])) + 265 (cast(limb) (cast(int) in2[2])) * (cast(int) in1[6]) + 266 (cast(limb) (cast(int) in2[6])) * (cast(int) in1[2]) + 267 (cast(limb) (cast(int) in2[0])) * (cast(int) in1[8]) + 268 (cast(limb) (cast(int) in2[8])) * (cast(int) in1[0]); 269 output[9] = (cast(limb) (cast(int) in2[4])) * (cast(int) in1[5]) + 270 (cast(limb) (cast(int) in2[5])) * (cast(int) in1[4]) + 271 (cast(limb) (cast(int) in2[3])) * (cast(int) in1[6]) + 272 (cast(limb) (cast(int) in2[6])) * (cast(int) in1[3]) + 273 (cast(limb) (cast(int) in2[2])) * (cast(int) in1[7]) + 274 (cast(limb) (cast(int) in2[7])) * (cast(int) in1[2]) + 275 (cast(limb) (cast(int) in2[1])) * (cast(int) in1[8]) + 276 (cast(limb) (cast(int) in2[8])) * (cast(int) in1[1]) + 277 (cast(limb) (cast(int) in2[0])) * (cast(int) in1[9]) + 278 (cast(limb) (cast(int) in2[9])) * (cast(int) in1[0]); 279 output[10] = 2 * ((cast(limb) (cast(int) in2[5])) * (cast(int) in1[5]) + 280 (cast(limb) (cast(int) in2[3])) * (cast(int) in1[7]) + 281 (cast(limb) (cast(int) in2[7])) * (cast(int) in1[3]) + 282 (cast(limb) (cast(int) in2[1])) * (cast(int) in1[9]) + 283 (cast(limb) (cast(int) in2[9])) * (cast(int) in1[1])) + 284 (cast(limb) (cast(int) in2[4])) * (cast(int) in1[6]) + 285 (cast(limb) (cast(int) in2[6])) * (cast(int) in1[4]) + 286 (cast(limb) (cast(int) in2[2])) * (cast(int) in1[8]) + 287 (cast(limb) (cast(int) in2[8])) * (cast(int) in1[2]); 288 output[11] = (cast(limb) (cast(int) in2[5])) * (cast(int) in1[6]) + 289 (cast(limb) (cast(int) in2[6])) * (cast(int) in1[5]) + 290 (cast(limb) (cast(int) in2[4])) * (cast(int) in1[7]) + 291 (cast(limb) (cast(int) in2[7])) * (cast(int) in1[4]) + 292 (cast(limb) (cast(int) in2[3])) * (cast(int) in1[8]) + 293 (cast(limb) (cast(int) in2[8])) * (cast(int) in1[3]) + 294 (cast(limb) (cast(int) in2[2])) * (cast(int) in1[9]) + 295 (cast(limb) (cast(int) in2[9])) * (cast(int) in1[2]); 296 output[12] = (cast(limb) (cast(int) in2[6])) * (cast(int) in1[6]) + 297 2 * ((cast(limb) (cast(int) in2[5])) * (cast(int) in1[7]) + 298 (cast(limb) (cast(int) in2[7])) * (cast(int) in1[5]) + 299 (cast(limb) (cast(int) in2[3])) * (cast(int) in1[9]) + 300 (cast(limb) (cast(int) in2[9])) * (cast(int) in1[3])) + 301 (cast(limb) (cast(int) in2[4])) * (cast(int) in1[8]) + 302 (cast(limb) (cast(int) in2[8])) * (cast(int) in1[4]); 303 output[13] = (cast(limb) (cast(int) in2[6])) * (cast(int) in1[7]) + 304 (cast(limb) (cast(int) in2[7])) * (cast(int) in1[6]) + 305 (cast(limb) (cast(int) in2[5])) * (cast(int) in1[8]) + 306 (cast(limb) (cast(int) in2[8])) * (cast(int) in1[5]) + 307 (cast(limb) (cast(int) in2[4])) * (cast(int) in1[9]) + 308 (cast(limb) (cast(int) in2[9])) * (cast(int) in1[4]); 309 output[14] = 2 * ((cast(limb) (cast(int) in2[7])) * (cast(int) in1[7]) + 310 (cast(limb) (cast(int) in2[5])) * (cast(int) in1[9]) + 311 (cast(limb) (cast(int) in2[9])) * (cast(int) in1[5])) + 312 (cast(limb) (cast(int) in2[6])) * (cast(int) in1[8]) + 313 (cast(limb) (cast(int) in2[8])) * (cast(int) in1[6]); 314 output[15] = (cast(limb) (cast(int) in2[7])) * (cast(int) in1[8]) + 315 (cast(limb) (cast(int) in2[8])) * (cast(int) in1[7]) + 316 (cast(limb) (cast(int) in2[6])) * (cast(int) in1[9]) + 317 (cast(limb) (cast(int) in2[9])) * (cast(int) in1[6]); 318 output[16] = (cast(limb) (cast(int) in2[8])) * (cast(int) in1[8]) + 319 2 * ((cast(limb) (cast(int) in2[7])) * (cast(int) in1[9]) + 320 (cast(limb) (cast(int) in2[9])) * (cast(int) in1[7])); 321 output[17] = (cast(limb) (cast(int) in2[8])) * (cast(int) in1[9]) + 322 (cast(limb) (cast(int) in2[9])) * (cast(int) in1[8]); 323 output[18] = 2 * (cast(limb) (cast(int) in2[9])) * (cast(int) in1[9]); 324 } 325 326 /* Reduce a long form to a short form by taking the input mod 2^255 - 19. 327 * 328 * On entry: |output[i]| < 14*2^54 329 * On exit: |output[0..8]| < 280*2^54 */ 330 void freduce_degree(ref limb[19] output) { 331 /* Each of these shifts and adds ends up multiplying the value by 19. 332 * 333 * For output[0..8], the absolute entry value is < 14*2^54 and we add, at 334 * most, 19*14*2^54 thus, on exit, |output[0..8]| < 280*2^54. */ 335 output[8] += output[18] << 4; 336 output[8] += output[18] << 1; 337 output[8] += output[18]; 338 output[7] += output[17] << 4; 339 output[7] += output[17] << 1; 340 output[7] += output[17]; 341 output[6] += output[16] << 4; 342 output[6] += output[16] << 1; 343 output[6] += output[16]; 344 output[5] += output[15] << 4; 345 output[5] += output[15] << 1; 346 output[5] += output[15]; 347 output[4] += output[14] << 4; 348 output[4] += output[14] << 1; 349 output[4] += output[14]; 350 output[3] += output[13] << 4; 351 output[3] += output[13] << 1; 352 output[3] += output[13]; 353 output[2] += output[12] << 4; 354 output[2] += output[12] << 1; 355 output[2] += output[12]; 356 output[1] += output[11] << 4; 357 output[1] += output[11] << 1; 358 output[1] += output[11]; 359 output[0] += output[10] << 4; 360 output[0] += output[10] << 1; 361 output[0] += output[10]; 362 } 363 364 static assert((-1 & 3) == 3, "This code only works on a two's complement system"); 365 366 /* return v / 2^26, using only shifts and adds. 367 * 368 * On entry: v can take any value. */ 369 limb div_by_2_26(in limb v) 370 { 371 /* High word of v; no shift needed. */ 372 uint highword = cast(uint) (cast(ulong) v >> 32); 373 /* Set to all 1s if v was negative; else set to 0s. */ 374 int sign = (cast (int) highword) >> 31; 375 /* Set to 0x3ffffff if v was negative; else set to 0. */ 376 int roundoff = (cast(uint) sign) >> 6; 377 /* Should return v / (1<<26) */ 378 return (v + roundoff) >> 26; 379 } 380 381 /* return v / (2^25), using only shifts and adds. 382 * 383 * On entry: v can take any value. */ 384 limb div_by_2_25(in limb v) 385 { 386 /* High word of v; no shift needed*/ 387 uint highword = cast(uint) ((cast(ulong) v) >> 32); 388 /* Set to all 1s if v was negative; else set to 0s. */ 389 int sign = (cast(int) highword) >> 31; 390 /* Set to 0x1ffffff if v was negative; else set to 0. */ 391 int roundoff = (cast(uint) sign) >> 7; 392 /* Should return v / (1<<25) */ 393 return (v + roundoff) >> 25; 394 } 395 396 /* Reduce all coefficients of the short form input so that |x| < 2^26. 397 * 398 * On entry: |output[i]| < 280*2^54 */ 399 void freduce_coefficients(ref limb[19] output) 400 { 401 402 output[10] = 0; 403 404 for (uint i = 0; i < 10; i += 2) { 405 limb over = div_by_2_26(output[i]); 406 /* The entry condition (that |output[i]| < 280*2^54) means that over is, at 407 * most, 280*2^28 in the first iteration of this loop. This is added to the 408 * next limb and we can approximate the resulting bound of that limb by 409 * 281*2^54. */ 410 output[i] -= over << 26; 411 output[i+1] += over; 412 413 /* For the first iteration, |output[i+1]| < 281*2^54, thus |over| < 414 * 281*2^29. When this is added to the next limb, the resulting bound can 415 * be approximated as 281*2^54. 416 * 417 * For subsequent iterations of the loop, 281*2^54 remains a conservative 418 * bound and no overflow occurs. */ 419 over = div_by_2_25(output[i+1]); 420 output[i+1] -= over << 25; 421 output[i+2] += over; 422 } 423 /* Now |output[10]| < 281*2^29 and all other coefficients are reduced. */ 424 output[0] += output[10] << 4; 425 output[0] += output[10] << 1; 426 output[0] += output[10]; 427 428 output[10] = 0; 429 430 /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19*281*2^29 431 * So |over| will be no more than 2^16. */ 432 { 433 limb over = div_by_2_26(output[0]); 434 output[0] -= over << 26; 435 output[1] += over; 436 } 437 438 /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 2^16 < 2^26. The 439 * bound on |output[1]| is sufficient to meet our needs. */ 440 } 441 442 /* A helpful wrapper around fproduct: output = in * in2. 443 * 444 * On entry: |in[i]| < 2^27 and |in2[i]| < 2^27. 445 * 446 * output must be distinct to both inputs. The output is reduced degree 447 * (indeed, one need only provide storage for 10 limbs) and |output[i]| < 2^26. */ 448 449 void fmul(ref limb[10] output, in ref limb[10] in1, in ref limb[10] in2) 450 { 451 limb[19] t; 452 fproduct(t, in1, in2); 453 /* |t[i]| < 14*2^54 */ 454 freduce_degree(t); 455 freduce_coefficients(t); 456 /* |t[i]| < 2^26 */ 457 output[0..10] = t[0..10]; 458 } 459 460 /* Square a number: output = in**2 461 * 462 * output must be distinct from the input. The inputs are reduced coefficient 463 * form, the output is not. 464 * 465 * output[x] <= 14 * the largest product of the input limbs. */ 466 void fsquare_inner(limb[] output, in limb[] input) { 467 output[0] = (cast(limb) (cast(int) input[0])) * (cast(int) input[0]); 468 output[1] = 2 * (cast(limb) (cast(int) input[0])) * (cast(int) input[1]); 469 output[2] = 2 * ((cast(limb) (cast(int) input[1])) * (cast(int) input[1]) + 470 (cast(limb) (cast(int) input[0])) * (cast(int) input[2])); 471 output[3] = 2 * ((cast(limb) (cast(int) input[1])) * (cast(int) input[2]) + 472 (cast(limb) (cast(int) input[0])) * (cast(int) input[3])); 473 output[4] = (cast(limb) (cast(int) input[2])) * (cast(int) input[2]) + 474 4 * (cast(limb) (cast(int) input[1])) * (cast(int) input[3]) + 475 2 * (cast(limb) (cast(int) input[0])) * (cast(int) input[4]); 476 output[5] = 2 * ((cast(limb) (cast(int) input[2])) * (cast(int) input[3]) + 477 (cast(limb) (cast(int) input[1])) * (cast(int) input[4]) + 478 (cast(limb) (cast(int) input[0])) * (cast(int) input[5])); 479 output[6] = 2 * ((cast(limb) (cast(int) input[3])) * (cast(int) input[3]) + 480 (cast(limb) (cast(int) input[2])) * (cast(int) input[4]) + 481 (cast(limb) (cast(int) input[0])) * (cast(int) input[6]) + 482 2 * (cast(limb) (cast(int) input[1])) * (cast(int) input[5])); 483 output[7] = 2 * ((cast(limb) (cast(int) input[3])) * (cast(int) input[4]) + 484 (cast(limb) (cast(int) input[2])) * (cast(int) input[5]) + 485 (cast(limb) (cast(int) input[1])) * (cast(int) input[6]) + 486 (cast(limb) (cast(int) input[0])) * (cast(int) input[7])); 487 output[8] = (cast(limb) (cast(int) input[4])) * (cast(int) input[4]) + 488 2 * ((cast(limb) (cast(int) input[2])) * (cast(int) input[6]) + 489 (cast(limb) (cast(int) input[0])) * (cast(int) input[8]) + 490 2 * ((cast(limb) (cast(int) input[1])) * (cast(int) input[7]) + 491 (cast(limb) (cast(int) input[3])) * (cast(int) input[5]))); 492 output[9] = 2 * ((cast(limb) (cast(int) input[4])) * (cast(int) input[5]) + 493 (cast(limb) (cast(int) input[3])) * (cast(int) input[6]) + 494 (cast(limb) (cast(int) input[2])) * (cast(int) input[7]) + 495 (cast(limb) (cast(int) input[1])) * (cast(int) input[8]) + 496 (cast(limb) (cast(int) input[0])) * (cast(int) input[9])); 497 output[10] = 2 * ((cast(limb) (cast(int) input[5])) * (cast(int) input[5]) + 498 (cast(limb) (cast(int) input[4])) * (cast(int) input[6]) + 499 (cast(limb) (cast(int) input[2])) * (cast(int) input[8]) + 500 2 * ((cast(limb) (cast(int) input[3])) * (cast(int) input[7]) + 501 (cast(limb) (cast(int) input[1])) * (cast(int) input[9]))); 502 output[11] = 2 * ((cast(limb) (cast(int) input[5])) * (cast(int) input[6]) + 503 (cast(limb) (cast(int) input[4])) * (cast(int) input[7]) + 504 (cast(limb) (cast(int) input[3])) * (cast(int) input[8]) + 505 (cast(limb) (cast(int) input[2])) * (cast(int) input[9])); 506 output[12] = (cast(limb) (cast(int) input[6])) * (cast(int) input[6]) + 507 2 * ((cast(limb) (cast(int) input[4])) * (cast(int) input[8]) + 508 2 * ((cast(limb) (cast(int) input[5])) * (cast(int) input[7]) + 509 (cast(limb) (cast(int) input[3])) * (cast(int) input[9]))); 510 output[13] = 2 * ((cast(limb) (cast(int) input[6])) * (cast(int) input[7]) + 511 (cast(limb) (cast(int) input[5])) * (cast(int) input[8]) + 512 (cast(limb) (cast(int) input[4])) * (cast(int) input[9])); 513 output[14] = 2 * ((cast(limb) (cast(int) input[7])) * (cast(int) input[7]) + 514 (cast(limb) (cast(int) input[6])) * (cast(int) input[8]) + 515 2 * (cast(limb) (cast(int) input[5])) * (cast(int) input[9])); 516 output[15] = 2 * ((cast(limb) (cast(int) input[7])) * (cast(int) input[8]) + 517 (cast(limb) (cast(int) input[6])) * (cast(int) input[9])); 518 output[16] = (cast(limb) (cast(int) input[8])) * (cast(int) input[8]) + 519 4 * (cast(limb) (cast(int) input[7])) * (cast(int) input[9]); 520 output[17] = 2 * (cast(limb) (cast(int) input[8])) * (cast(int) input[9]); 521 output[18] = 2 * (cast(limb) (cast(int) input[9])) * (cast(int) input[9]); 522 } 523 524 /* fsquare sets output = in^2. 525 * 526 * On entry: The |in| argument is in reduced coefficients form and |in[i]| < 527 * 2^27. 528 * 529 * On exit: The |output| argument is in reduced coefficients form (indeed, one 530 * need only provide storage for 10 limbs) and |out[i]| < 2^26. */ 531 void fsquare(limb[] output, in limb[] input) { 532 limb[19] t; 533 fsquare_inner(t, input); 534 /* |t[i]| < 14*2^54 because the largest product of two limbs will be < 535 * 2^(27+27) and fsquare_inner adds together, at most, 14 of those 536 * products. */ 537 freduce_degree(t); 538 freduce_coefficients(t); 539 /* |t[i]| < 2^26 */ 540 output[0..10] = t[0..10]; 541 } 542 543 /* Take a little-endian, 32-byte number and expand it into polynomial form */ 544 void fexpand(ref limb[10] output, in ref ubyte[32] input) 545 { 546 547 void F(uint n, uint start, uint shift, uint mask)(limb[] output, in ubyte[] input) { 548 output[n] = ((input[start + 0] | 549 (input[start + 1]) << 8 | 550 (input[start + 2]) << 16 | 551 (input[start + 3]) << 24) >> shift) & mask; 552 } 553 554 F!(0, 0, 0, 0x3ffffff)(output, input); 555 F!(1, 3, 2, 0x1ffffff)(output, input); 556 F!(2, 6, 3, 0x3ffffff)(output, input); 557 F!(3, 9, 5, 0x1ffffff)(output, input); 558 F!(4, 12, 6, 0x3ffffff)(output, input); 559 F!(5, 16, 0, 0x1ffffff)(output, input); 560 F!(6, 19, 1, 0x3ffffff)(output, input); 561 F!(7, 22, 3, 0x1ffffff)(output, input); 562 F!(8, 25, 4, 0x3ffffff)(output, input); 563 F!(9, 28, 6, 0x1ffffff)(output, input); 564 565 } 566 567 static assert((-32 >> 1) == -16, "This code only works when >> does sign-extension on negative numbers"); 568 569 /* int_eq returns 0xffffffff iff a == b and zero otherwise. */ 570 int int_eq(int a, int b) { 571 a = ~(a ^ b); 572 a &= a << 16; 573 a &= a << 8; 574 a &= a << 4; 575 a &= a << 2; 576 a &= a << 1; 577 return a >> 31; 578 } 579 580 /* int_gte returns 0xffffffff if a >= b and zero otherwise, where a and b are 581 * both non-negative. */ 582 int int_gte(int a, int b) { 583 a -= b; 584 /* a >= 0 iff a >= b. */ 585 return ~(a >> 31); 586 } 587 588 /* Take a fully reduced polynomial form number and contract it into a 589 * little-endian, 32-byte array. 590 * 591 * On entry: |input_limbs[i]| < 2^26 */ 592 void fcontract(ref ubyte[32] output, in ref limb[10] input_limbs) 593 in { 594 assert(output.length == 32); 595 assert(input_limbs.length == 10); 596 } body { 597 int[10] input; 598 int _mask; 599 600 /* |input_limbs[i]| < 2^26, so it's valid to convert to an int. */ 601 for (uint i = 0; i < 10; i++) { 602 input[i] = cast(uint) input_limbs[i]; 603 } 604 605 for (uint j = 0; j < 2; ++j) { 606 for (uint i = 0; i < 9; ++i) { 607 if ((i & 1) == 1) { 608 /* This calculation is a time-invariant way to make input[i] 609 * non-negative by borrowing from the next-larger limb. */ 610 immutable int mask = input[i] >> 31; 611 immutable int carry = -((input[i] & mask) >> 25); 612 input[i] = input[i] + (carry << 25); 613 input[i+1] = input[i+1] - carry; 614 } else { 615 immutable int mask = input[i] >> 31; 616 immutable int carry = -((input[i] & mask) >> 26); 617 input[i] = input[i] + (carry << 26); 618 input[i+1] = input[i+1] - carry; 619 } 620 } 621 622 /* There's no greater limb for input[9] to borrow from, but we can multiply 623 * by 19 and borrow from input[0], which is valid mod 2^255-19. */ 624 { 625 immutable int mask = input[9] >> 31; 626 immutable int carry = -((input[9] & mask) >> 25); 627 input[9] = input[9] + (carry << 25); 628 input[0] = input[0] - (carry * 19); 629 } 630 631 /* After the first iteration, input[1..9] are non-negative and fit within 632 * 25 or 26 bits, depending on position. However, input[0] may be 633 * negative. */ 634 } 635 636 /* The first borrow-propagation pass above ended with every limb 637 except (possibly) input[0] non-negative. 638 639 If input[0] was negative after the first pass, then it was because of a 640 carry from input[9]. On entry, input[9] < 2^26 so the carry was, at most, 641 one, since (2**26-1) >> 25 = 1. Thus input[0] >= -19. 642 643 In the second pass, each limb is decreased by at most one. Thus the second 644 borrow-propagation pass could only have wrapped around to decrease 645 input[0] again if the first pass left input[0] negative *and* input[1] 646 through input[9] were all zero. In that case, input[1] is now 2^25 - 1, 647 and this last borrow-propagation step will leave input[1] non-negative. */ 648 { 649 immutable int mask = input[0] >> 31; 650 immutable int carry = -((input[0] & mask) >> 26); 651 input[0] = input[0] + (carry << 26); 652 input[1] = input[1] - carry; 653 } 654 655 /* All input[i] are now non-negative. However, there might be values between 656 * 2^25 and 2^26 in a limb which is, nominally, 25 bits wide. */ 657 for (uint j = 0; j < 2; j++) { 658 for (uint i = 0; i < 9; i++) { 659 if ((i & 1) == 1) { 660 immutable int carry = input[i] >> 25; 661 input[i] &= 0x1ffffff; 662 input[i+1] += carry; 663 } else { 664 immutable int carry = input[i] >> 26; 665 input[i] &= 0x3ffffff; 666 input[i+1] += carry; 667 } 668 } 669 670 { 671 immutable int carry = input[9] >> 25; 672 input[9] &= 0x1ffffff; 673 input[0] += 19*carry; 674 } 675 } 676 677 /* If the first carry-chain pass, just above, ended up with a carry from 678 * input[9], and that caused input[0] to be out-of-bounds, then input[0] was 679 * < 2^26 + 2*19, because the carry was, at most, two. 680 * 681 * If the second pass carried from input[9] again then input[0] is < 2*19 and 682 * the input[9] -> input[0] carry didn't push input[0] out of bounds. */ 683 684 /* It still remains the case that input might be between 2^255-19 and 2^255. 685 * In this case, input[1..9] must take their maximum value and input[0] must 686 * be >= (2^255-19) & 0x3ffffff, which is 0x3ffffed. */ 687 _mask = int_gte(input[0], 0x3ffffed); 688 for (uint i = 1; i < 10; i++) { 689 if ((i & 1) == 1) { 690 _mask &= int_eq(input[i], 0x1ffffff); 691 } else { 692 _mask &= int_eq(input[i], 0x3ffffff); 693 } 694 } 695 696 /* mask is either 0xffffffff (if input >= 2^255-19) and zero otherwise. Thus 697 * this conditionally subtracts 2^255-19. */ 698 input[0] -= _mask & 0x3ffffed; 699 700 for (uint i = 1; i < 10; i++) { 701 if ((i & 1) == 1) { 702 input[i] -= _mask & 0x1ffffff; 703 } else { 704 input[i] -= _mask & 0x3ffffff; 705 } 706 } 707 708 input[1] <<= 2; 709 input[2] <<= 3; 710 input[3] <<= 5; 711 input[4] <<= 6; 712 input[6] <<= 1; 713 input[7] <<= 3; 714 input[8] <<= 4; 715 input[9] <<= 6; 716 717 void F(uint i, uint s)() { 718 output[s+0] |= input[i] & 0xff; 719 output[s+1] = (input[i] >> 8) & 0xff; 720 output[s+2] = (input[i] >> 16) & 0xff; 721 output[s+3] = (input[i] >> 24) & 0xff; 722 } 723 724 output[0] = 0; 725 output[16] = 0; 726 F!(0,0)(); 727 F!(1,3)(); 728 F!(2,6)(); 729 F!(3,9)(); 730 F!(4,12)(); 731 F!(5,16)(); 732 F!(6,19)(); 733 F!(7,22)(); 734 F!(8,25)(); 735 F!(9,28)(); 736 } 737 738 /* Input: Q, Q', Q-Q' 739 * Output: 2Q, Q+Q' 740 * 741 * x2 z3: long form 742 * x3 z3: long form 743 * x z: short form, destroyed 744 * xprime zprime: short form, destroyed 745 * qmqp: short form, preserved 746 * 747 * On entry and exit, the absolute value of the limbs of all inputs and outputs 748 * are < 2^26. */ 749 void fmonty(ref limb[19] x2, ref limb[19] z2, /* output 2Q */ 750 ref limb[19] x3, ref limb[19] z3, /* output Q + Q' */ 751 ref limb[19] x, ref limb[19] z, /* input Q */ 752 ref limb[19] xprime, ref limb[19] zprime, /* input Q' */ 753 in ref limb[10] qmqp /* input Q - Q' */) 754 { 755 756 limb[10] origx, origxprime; 757 limb[19] zzz, xx, zz, xxprime, 758 zzprime, zzzprime, xxxprime; 759 760 //memcpy(origx, x, 10 * sizeofcast(limb)); 761 origx[0..10] = x[0..10]; 762 763 fsum(x, z); 764 /* |x[i]| < 2^27 */ 765 fdifference(z, origx); /* does x - z */ 766 /* |z[i]| < 2^27 */ 767 768 //memcpy(origxprime, xprime, sizeofcast(limb) * 10); 769 origxprime[0..10] = xprime[0..10]; 770 771 fsum(xprime, zprime); 772 /* |xprime[i]| < 2^27 */ 773 fdifference(zprime, origxprime); 774 /* |zprime[i]| < 2^27 */ 775 fproduct(xxprime, xprime, z); 776 /* |xxprime[i]| < 14*2^54: the largest product of two limbs will be < 777 * 2^(27+27) and fproduct adds together, at most, 14 of those products. 778 * (Approximating that to 2^58 doesn't work out.) */ 779 fproduct(zzprime, x, zprime); 780 /* |zzprime[i]| < 14*2^54 */ 781 freduce_degree(xxprime); 782 freduce_coefficients(xxprime); 783 /* |xxprime[i]| < 2^26 */ 784 freduce_degree(zzprime); 785 freduce_coefficients(zzprime); 786 /* |zzprime[i]| < 2^26 */ 787 //memcpy(origxprime, xxprime, sizeofcast(limb) * 10); 788 origxprime[0..10] = xxprime[0..10]; 789 fsum(xxprime, zzprime); 790 /* |xxprime[i]| < 2^27 */ 791 fdifference(zzprime, origxprime); 792 /* |zzprime[i]| < 2^27 */ 793 fsquare(xxxprime, xxprime); 794 /* |xxxprime[i]| < 2^26 */ 795 fsquare(zzzprime, zzprime); 796 /* |zzzprime[i]| < 2^26 */ 797 fproduct(zzprime, zzzprime, qmqp); 798 /* |zzprime[i]| < 14*2^52 */ 799 freduce_degree(zzprime); 800 freduce_coefficients(zzprime); 801 /* |zzprime[i]| < 2^26 */ 802 //memcpy(x3, xxxprime, sizeofcast(limb) * 10); 803 x3[0..10] = xxxprime[0..10]; 804 //memcpy(z3, zzprime, sizeofcast(limb) * 10); 805 z3[0..10] = zzprime[0..10]; 806 807 fsquare(xx, x); 808 /* |xx[i]| < 2^26 */ 809 fsquare(zz, z); 810 /* |zz[i]| < 2^26 */ 811 fproduct(x2, xx, zz); 812 /* |x2[i]| < 14*2^52 */ 813 freduce_degree(x2); 814 freduce_coefficients(x2); 815 /* |x2[i]| < 2^26 */ 816 fdifference(zz, xx); // does zz = xx - zz 817 /* |zz[i]| < 2^27 */ 818 //memset(zzz + 10, 0, sizeofcast(limb) * 9); 819 zzz[10..19] = 0; 820 821 fscalar_product(zzz, zz, 121665); 822 /* |zzz[i]| < 2^(27+17) */ 823 /* No need to call freduce_degree here: 824 fscalar_product doesn't increase the degree of its input. */ 825 freduce_coefficients(zzz); 826 /* |zzz[i]| < 2^26 */ 827 fsum(zzz, xx); 828 /* |zzz[i]| < 2^27 */ 829 fproduct(z2, zz, zzz); 830 /* |z2[i]| < 14*2^(26+27) */ 831 freduce_degree(z2); 832 freduce_coefficients(z2); 833 /* |z2|i| < 2^26 */ 834 } 835 836 /* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave 837 * them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid 838 * side-channel attacks. 839 * 840 * NOTE that this function requires that 'iswap' be 1 or 0; other values give 841 * wrong results. Also, the two limb arrays must be in reduced-coefficient, 842 * reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped, 843 * and all all values in a[0..9],b[0..9] must have magnitude less than 844 * INT32_MAX. */ 845 void swap_conditional(ref limb[19] a, ref limb[19] b, in limb iswap) 846 in { 847 assert(iswap == 0 || iswap == 1, "requires 'iswap' to be 1 or 0"); 848 } 849 body { 850 immutable int swap = cast(int) -iswap; 851 852 for (uint i = 0; i < 10; ++i) { 853 immutable int x = swap & ( (cast(int)a[i]) ^ (cast(int)b[i]) ); 854 a[i] = (cast(int)a[i]) ^ x; 855 b[i] = (cast(int)b[i]) ^ x; 856 } 857 } 858 859 /* Calculates nQ where Q is the x-coordinate of a point on the curve 860 * 861 * resultx/resultz: the x coordinate of the resulting curve point (short form) 862 * n: a little endian, 32-byte number 863 * q: a point of the curve (short form) */ 864 void cmult(ref limb[10] resultx, ref limb[10] resultz, in ref ubyte[32] n, in ref limb[10] q) 865 { 866 limb[19] a, b, c, d; 867 b[0] = 1; 868 c[0] = 1; 869 870 alias nqpqx = a, nqpqz = b, nqx = c, nqz = d; 871 limb[] t; 872 limb[19] e, f, g, h; 873 f[0] = 1; 874 h[0] = 1; 875 alias nqpqx2 = e, nqpqz2 = f, nqx2 = g, nqz2 = h; 876 877 nqpqx[0..10] = q[0..10]; 878 879 for (uint i = 0; i < 32; ++i) { 880 ubyte byt = n[31 - i]; 881 for (uint j = 0; j < 8; ++j) { 882 immutable limb bit = byt >> 7; 883 884 swap_conditional(nqx, nqpqx, bit); 885 swap_conditional(nqz, nqpqz, bit); 886 fmonty(nqx2, nqz2, 887 nqpqx2, nqpqz2, 888 nqx, nqz, 889 nqpqx, nqpqz, 890 q); 891 swap_conditional(nqx2, nqpqx2, bit); 892 swap_conditional(nqz2, nqpqz2, bit); 893 894 t = nqx; 895 nqx = nqx2; 896 nqx2 = t; 897 t = nqz; 898 nqz = nqz2; 899 nqz2 = t; 900 t = nqpqx; 901 nqpqx = nqpqx2; 902 nqpqx2 = t; 903 t = nqpqz; 904 nqpqz = nqpqz2; 905 nqpqz2 = t; 906 907 byt <<= 1; 908 } 909 } 910 911 resultx[0..10] = nqx[0..10]; 912 resultz[0..10] = nqz[0..10]; 913 } 914 915 // ----------------------------------------------------------------------------- 916 // Shamelessly copied from djb's code :-) 917 // ----------------------------------------------------------------------------- 918 void crecip(ref limb[10] output, in ref limb[10] z) { 919 limb[10] z2, z9, z11, z2_5_0, z2_10_0, z2_20_0, z2_50_0, z2_100_0, t0, t1; 920 921 /* 2 */ fsquare(z2,z); 922 /* 4 */ fsquare(t1,z2); 923 /* 8 */ fsquare(t0,t1); 924 /* 9 */ fmul(z9,t0,z); 925 /* 11 */ fmul(z11,z9,z2); 926 /* 22 */ fsquare(t0,z11); 927 /* 2^5 - 2^0 = 31 */ fmul(z2_5_0,t0,z9); 928 929 /* 2^6 - 2^1 */ fsquare(t0,z2_5_0); 930 /* 2^7 - 2^2 */ fsquare(t1,t0); 931 /* 2^8 - 2^3 */ fsquare(t0,t1); 932 /* 2^9 - 2^4 */ fsquare(t1,t0); 933 /* 2^10 - 2^5 */ fsquare(t0,t1); 934 /* 2^10 - 2^0 */ fmul(z2_10_0,t0,z2_5_0); 935 936 /* 2^11 - 2^1 */ fsquare(t0,z2_10_0); 937 /* 2^12 - 2^2 */ fsquare(t1,t0); 938 /* 2^20 - 2^10 */ for (uint i = 2;i < 10;i += 2) { fsquare(t0,t1); fsquare(t1,t0); } 939 /* 2^20 - 2^0 */ fmul(z2_20_0,t1,z2_10_0); 940 941 /* 2^21 - 2^1 */ fsquare(t0,z2_20_0); 942 /* 2^22 - 2^2 */ fsquare(t1,t0); 943 /* 2^40 - 2^20 */ for (uint i = 2;i < 20;i += 2) { fsquare(t0,t1); fsquare(t1,t0); } 944 /* 2^40 - 2^0 */ fmul(t0,t1,z2_20_0); 945 946 /* 2^41 - 2^1 */ fsquare(t1,t0); 947 /* 2^42 - 2^2 */ fsquare(t0,t1); 948 /* 2^50 - 2^10 */ for (uint i = 2;i < 10;i += 2) { fsquare(t1,t0); fsquare(t0,t1); } 949 /* 2^50 - 2^0 */ fmul(z2_50_0,t0,z2_10_0); 950 951 /* 2^51 - 2^1 */ fsquare(t0,z2_50_0); 952 /* 2^52 - 2^2 */ fsquare(t1,t0); 953 /* 2^100 - 2^50 */ for (uint i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); } 954 /* 2^100 - 2^0 */ fmul(z2_100_0,t1,z2_50_0); 955 956 /* 2^101 - 2^1 */ fsquare(t1,z2_100_0); 957 /* 2^102 - 2^2 */ fsquare(t0,t1); 958 /* 2^200 - 2^100 */ for (uint i = 2;i < 100;i += 2) { fsquare(t1,t0); fsquare(t0,t1); } 959 /* 2^200 - 2^0 */ fmul(t1,t0,z2_100_0); 960 961 /* 2^201 - 2^1 */ fsquare(t0,t1); 962 /* 2^202 - 2^2 */ fsquare(t1,t0); 963 /* 2^250 - 2^50 */ for (uint i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); } 964 /* 2^250 - 2^0 */ fmul(t0,t1,z2_50_0); 965 966 /* 2^251 - 2^1 */ fsquare(t1,t0); 967 /* 2^252 - 2^2 */ fsquare(t0,t1); 968 /* 2^253 - 2^3 */ fsquare(t1,t0); 969 /* 2^254 - 2^4 */ fsquare(t0,t1); 970 /* 2^255 - 2^5 */ fsquare(t1,t0); 971 /* 2^255 - 21 */ fmul(output,t1,z11); 972 }