~apreiml/bigc

812fe218a79860d1ec4790d020b69b349486d2d9 — Armin Preiml 1 year, 11 months ago b69533c master
move bigint to https://git.sr.ht/~apreiml/hare-wipcrypto
12 files changed, 0 insertions(+), 1807 deletions(-)

D crypto/bigint/+test/arithm.ha
D crypto/bigint/+test/encoding.ha
D crypto/bigint/+test/monty.ha
D crypto/bigint/+test/utils.ha
D crypto/bigint/TODO
D crypto/bigint/arithm.ha
D crypto/bigint/encoding.ha
D crypto/bigint/moddiv.ha
D crypto/bigint/monty.ha
D crypto/bigint/types+ct32.ha
D crypto/bigint/types.ha
D crypto/bigint/util.ha
D crypto/bigint/+test/arithm.ha => crypto/bigint/+test/arithm.ha +0 -128
@@ 1,128 0,0 @@
use bytes;
use fmt;

@test fn tadd() void = {
	let result: [4]u8 = [0...];

	let bx = fromhex("002132a0");
	let by = fromhex("00ff3201");

	let carry = add(bx, by, 0);

	assert(carry == 0);

	decode(result, bx);
	assert(bytes::equal([0x00, 0x21, 0x32, 0xa0], result));

	let carry = add(bx, by, 1);

	assert(carry == 0);
	decode(result, bx);
	assert(bytes::equal([0x01, 0x20, 0x64, 0xa1], result));
};

@test fn muladd_small() void = {
	const mod = fromhex("0100");
	let x = fromhex("0100");

	muladd_small(x, 3, mod);
	assert(equalshex(x, "03"));

	free(mod);
	free(x);

	const mod = fromhex("1000000000");
	let x = fromhex("0000000001");
	muladd_small(x, 27, mod);
	assert(equalshex(x, "8000001b"));

	free(mod);
	free(x);
};

@test fn mulacc() void = {
	let d = fromhex("0000000000000000");
	let a = fromhex("0010");
	let b = fromhex("0014");

	defer free(d);
	defer free(a);
	defer free(b);

	mulacc(d, a, b);
	assert(equalshex(d, "0140"));

	mulacc(d, a, b);
	assert(equalshex(d, "0280"));
};

@test fn rshift() void = {
	let x = fromhex("1020304050");
	rshift(x, 1);
	assert(equalshex(x, "0810182028"));
	free(x);

	let x = fromhex("00");
	rshift(x, 1);
	assert(equalshex(x, ""));
	free(x);
};

@test fn reduce() void = {
	let m = fromhex("87654321");

	let a = fromhex("1234");
	let x = fromhex("00000000");
	reduce(x, a, m);
	assert(equalshex(x, "1234"));
	free(a);
	free(x);

	let a = fromhex("0123456789abcdef");
	let x = fromhex("00000000");
	reduce(x, a, m);
	assert(equalshex(x, "14786ead"));
	free(a);
	free(x);

	free(m);
};

@test fn modpow() void = {
	let m = fromhex("87654321");
	let x = fromhexmod("00f03202", m);

	let e: [_]u8 = [0x00, 0x00, 0xc1, 0xf4];
	const m0i = ninv31(m[1]);

	let t1 = fromhex("00000000");
	let t2 = fromhex("00000000");

	modpow(x, e, m, m0i, t1, t2);

	assert(equalshex(x, "3de073fc"));

	free(m);
	free(x);
	free(t1);
	free(t2);
};

@test fn modpow_opt() void = {
	let m = fromhex("87654321");
	let x = fromhexmod("00f03202", m);

	let e: [_]u8 = [0x00, 0x00, 0xc1, 0xf4];
	const m0i = ninv31(m[1]);

	let tmp = fromhex("0000000000000000");

	modpow_opt(x, e, m, m0i, tmp, len(tmp));

	fmt::println("x:", tohex(x))!;
	assert(equalshex(x, "3de073fc"));

	free(m);
	free(x);
	free(tmp);
};

D crypto/bigint/+test/encoding.ha => crypto/bigint/+test/encoding.ha +0 -78
@@ 1,78 0,0 @@
use bytes;
use encoding::hex;
use fmt;


@test fn encdec() void = {
	const decoded: [12]u8 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
	let result: [12]u8 = [0...];

	let buf = bigint(len(decoded) * 8);
	defer free(buf);

	encode(buf, decoded);
	decode(result, buf);

	assert(bytes::equal(result, decoded));
};


@test fn encmoddec() void = {
	const input: [4]u8 = [0, 0, 0, 10];

	let mod = fromhex("00190000");
	let resultbuf = bigint(len(input) * 8);
	let result: [4]u8 = [0...];

	let ret = encodemod(resultbuf[..], input, mod);

	decode(result[..], resultbuf);
	assert(ret == 1);
	assert(bytes::equal(result, input));

	const input: [4]u8 = [0, 25, 0, 0];
	let ret = encodemod(resultbuf[..], input, mod);
	assert(ret == 0);
	assert(iszero(resultbuf) == 1);

	const input: [4]u8 = [0, 26, 0, 0];
	let ret = encodemod(resultbuf[..], input, mod);
	assert(ret == 0);
	assert(iszero(resultbuf) == 1);
};


@test fn encreddec() void = {
	const input: [4]u8 = [0, 0, 0, 0x0a];

	let mod = fromhex("190000");
	defer free(mod);
	let resultbuf = bigint(len(input) * 8);
	let result: [4]u8 = [0...];

	encodereduce(resultbuf, input, mod);
	decode(result, resultbuf);
	assert(bytes::equal(result, input));

	const input: [4]u8 = [0, 0x19, 0, 0];
	let resultbuf = bigint(len(input) * 8);
	encodereduce(resultbuf, input, mod);
	decode(result, resultbuf);
	assert(iszero(resultbuf) == 1);

	const input: [4]u8 = [0x24, 0x17, 0x01, 0x05];
	let resultbuf = bigint(len(input) * 8);
	encodereduce(resultbuf, input, mod);
	decode(result, resultbuf);
	assert(bytes::equal(result, [0x00, 0x0e, 0x01, 0x05]));
};

@test fn word_countbits() void = {
	assert(word_countbits(0) == 0);
	assert(word_countbits(1) == 1);
	assert(word_countbits(2) == 2);
	assert(word_countbits(4) == 3);
	assert(word_countbits(7) == 3);
	assert(word_countbits(8) == 4);
	assert(word_countbits(1131) == 11);
};

D crypto/bigint/+test/monty.ha => crypto/bigint/+test/monty.ha +0 -52
@@ 1,52 0,0 @@
use fmt;

@test fn montyencode() void = {
	let m = fromhex("0010000061");
	let x = fromhexmod("0000010064", m);

	defer free(x);
	defer free(m);

	const m0i = ninv31(m[1]);

	to_monty(x, m);
	from_monty(x, m, m0i);

	assert(equalshex(x, "010064"));
};

@test fn montymul() void = {
	let x = fromhex("00000123");
	let y = fromhex("000003cf");
	let m = fromhex("10000061");
	let d = fromhex("00000000");

	const m0i = ninv31(m[1]);

	to_monty(x, m);
	to_monty(y, m);
	montymul(d, x, y, m, m0i);
	from_monty(d, m, m0i);

	assert(equalshex(d, "04544d"));

	free(d);
	free(x);
	free(y);

	d = fromhex("00000000");
	x = fromhex("0f98b7cf");
	y = fromhex("04216b9c");

	to_monty(x, m);
	to_monty(y, m);
	montymul(d, x, y, m, m0i);
	from_monty(d, m, m0i);

	assert(equalshex(d, "0d031f49"));

	free(x);
	free(y);
	free(m);
	free(d);
};

D crypto/bigint/+test/utils.ha => crypto/bigint/+test/utils.ha +0 -42
@@ 1,42 0,0 @@
use encoding::hex;
use fmt;

// The caller must free the result.
fn fromhex(h: str) []word = {
	let n: []u8 = hex::decodestr(h)!;
	defer free(n);

	let i = bigint(len(n) * 8);
	encode(i, n);
	return i;
};

// 'h' must be lower than 'm'
fn fromhexmod(h: str, m: []word) []word = {
	let r = fromhex(h);
	r[0] = m[0];
	return r;
};

// The caller must free the result.
fn tohex(x: []word) str = {
	let buf: []u8 = alloc([0...], (len(x) - 1) * size(word));
	defer free(buf);

	decode(buf, x);

	let i = 0z;
	for (i < len(buf); i += 1) {
		if (buf[i] != 0) {
			break;
		};
	};

	return hex::encodestr(buf[i..]);
};

fn equalshex(x: []word, h: str) bool = {
	let result = tohex(x);
	defer free(result);
	return result == h;
};

D crypto/bigint/TODO => crypto/bigint/TODO +0 -11
@@ 1,11 0,0 @@

 * Organize .ha files better: encoding.ha, monty.ha ...
 * default i31. Offer i15 and 64 -> 128 multiplication support
   * document the difference properly
 * change length encoding to only store bitlen since length is already part of
   the slice in hare.
 * proper tests
 * check which utils can go to crypto::math




D crypto/bigint/arithm.ha => crypto/bigint/arithm.ha +0 -506
@@ 1,506 0,0 @@
// The following code was initially ported from BearSSL.
//
// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
// 
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// 
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
// 
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

use fmt;
use encoding::hex;
use bytes;

// Adds 'b' to 'a' and stores the result into 'a' if 'do' is 1. Returns the
// carry, if the addition overflows.
//
// 'a' and 'b' must be of the same length.
export fn add(a: []word, b: []word, do: u32) u32 = {
	assert(len(a) == len(b));

	let carry: u32 = 0;
	for (let i = 1z; i < len(b); i += 1) {
		const n: u32 = a[i] + b[i] + carry;
		carry = n >> BITSIZE;
		a[i] = mux(do, n & BITMASK, a[i]): word;
	};
	return carry;
};

// Subtracts 'b' from 'a' and stores the result into 'a' if 'do' is 1. Returns
// the carry, if the subtraction overflows.
//
// 'a' and 'b' must be of the same length.
export fn sub(a: []word, b: []word, do: u32) u32 = {
	assert(len(a) == len(b));

	let carry: u32 = 0;
	for (let i = 1z; i < len(b); i += 1) {
		let n: u32 = a[i] - b[i] - carry;
		carry = n >> BITSIZE;
		a[i] = mux(do, n & BITMASK, a[i]): word;
	};
	return carry;
};

// Right-shift an integer 'x' by 'count'. The shift amount must by lower than
// [[BITSIZE]].
export fn rshift(x: []word, count: word) void = {
	assert(count < BITSIZE);

	const l = ewordlen(x);
	if (l == 0) {
		return;
	};

	let r = x[1] >> count;
	for (let i = 2z; i <= l; i += 1) {
		const w = x[i];
		x[i - 1] = (w << (BITSIZE - count) | r) & BITMASK;
		r = w >> count;
	};
	x[l] = r;
};

// Constant-time division. The dividend hi:lo is divided by the
// divisor d; the quotient is returned and the remainder is written
// in *r. If hi == d, then the quotient does not fit on 32 bits;
// returned value is thus truncated. If hi > d, returned values are
// indeterminate.
export fn divu32(hi: u32, lo: u32, y: u32) (u32, u32) = {
	assert(y != 0);
	assert(y > hi);

	// TODO: optimize this 
	let q: u32 = 0;
	const ch: u32 = eq(hi, y);
	hi = mux(ch, 0, hi);
	for (let k: u32 = 31; k > 0; k -= 1) {
		const j = (32 - k);
		const w = (hi << j) | (lo >> k);
		const ctl = ge(w, y) | (hi >> k);
		const hi2 = (w - y) >> j;
		const lo2 = lo - (y << k);
		hi = mux(ctl, hi2, hi);
		lo = mux(ctl, lo2, lo);
		q |= ctl << k;
	};
	let cf = ge(lo, y) | hi;
	q |= cf;
	const r = mux(cf, lo - y, lo);
	return (q, r);
};

// Multiply x[] by 2^BITSIZE and then add integer z, modulo m[]. This
// function assumes that x[] and m[] have the same announced bit
// length, and the announced bit length of m[] matches its true
// bit length.
// 
// x[] and m[] MUST be distinct arrays.
// 
// CT: only the common announced bit length of x and m leaks, not
// the values of x, z or m.
//
fn muladd_small(x: []word, z: word, m: const []word) void = {
	let hi: u32 = 0;
	// We can test on the modulus bit length since we accept to
	// leak that length.
	const m_bitlen = m[0];
	if (m_bitlen == 0) {
		return;
	};
	if (m_bitlen <= BITSIZE) {
		hi = x[1] >> 1;
		const lo = (x[1] << BITSIZE) | z;
		x[1] = divu32(hi, lo, m[1]).1: word;
		return;
	};
	let mlen = ewordlen(m);
	let mblr = m_bitlen & BITSIZE: word;

	// Principle: we estimate the quotient (x*2^31+z)/m by
	// doing a 64/32 division with the high words.
	// 
	// Let:
	//   w = 2^31
	//   a = (w*a0 + a1) * w^N + a2
	//   b = b0 * w^N + b2
	// such that:
	//   0 <= a0 < w
	//   0 <= a1 < w
	//   0 <= a2 < w^N
	//   w/2 <= b0 < w
	//   0 <= b2 < w^N
	//   a < w*b
	// I.e. the two top words of a are a0:a1, the top word of b is
	// b0, we ensured that b0 is "full" (high bit set), and a is
	// such that the quotient q = a/b fits on one word (0 <= q < w).
	// 
	// If a = b*q + r (with 0 <= r < q), we can estimate q by
	// doing an Euclidean division on the top words:
	//   a0*w+a1 = b0*u + v  (with 0 <= v < b0)
	// Then the following holds:
	//   0 <= u <= w
	//   u-2 <= q <= u

	let a0: u32 = 0, a1: u32 = 0, b0: u32 = 0;
	hi = x[mlen];
	if (mblr == 0) {
		a0 = x[mlen];
		x[2..mlen+1] = x[1..mlen];
		x[1] = z;
		a1 = x[mlen];
		b0 = m[mlen];
	} else {
		a0 = ((x[mlen] << (BITSIZE: word - mblr)) | (x[mlen - 1] >> mblr))
			& BITMASK;
		x[2..mlen+1] = x[1..mlen];
		x[1] = z;
		a1 = ((x[mlen] << (BITSIZE: word - mblr)) | (x[mlen - 1] >> mblr))
			& BITMASK;
		b0 = ((m[mlen] << (BITSIZE: word - mblr)) | (m[mlen - 1] >> mblr))
			& BITMASK;
	};

	// We estimate a divisor q. If the quotient returned by br_div()
	// is g:
	// -- If a0 == b0 then g == 0; we want q = 0x7FFFFFFF.
	// -- Otherwise:
	//    -- if g == 0 then we set q = 0;
	//    -- otherwise, we set q = g - 1.
	// The properties described above then ensure that the true
	// quotient is q-1, q or q+1.
	// 
	// Take care that a0, a1 and b0 are 31-bit words, not 32-bit. We
	// must adjust the parameters to br_div() accordingly.
	// 
	const (g, _) = divu32(a0 >> 1, a1 | (a0 << 31), b0);
	const q = mux(eq(a0, b0), BITMASK, mux(eq(g, 0), 0, g - 1));

	// We subtract q*m from x (with the extra high word of value 'hi').
	// Since q may be off by 1 (in either direction), we may have to
	// add or subtract m afterwards.
	// 
	// The 'tb' flag will be true (1) at the end of the loop if the
	// result is greater than or equal to the modulus (not counting
	// 'hi' or the carry).
	let cc: u32 = 0;
	let tb: u32 = 1;
	for (let u = 1z; u <= mlen; u += 1) {
		const mw = m[u];
		const zl = mul31(mw, q) + cc;
		cc = (zl >> BITSIZE): word;
		const zw = zl: word & BITMASK;
		const xw = x[u];
		let nxw = xw - zw;
		cc += nxw >> BITSIZE;
		nxw &= BITMASK;
		x[u] = nxw;
		tb = mux(eq(nxw, mw), tb, gt(nxw, mw));
	};

	// If we underestimated q, then either cc < hi (one extra bit
	// beyond the top array word), or cc == hi and tb is true (no
	// extra bit, but the result is not lower than the modulus). In
	// these cases we must subtract m once.
	// 
	// Otherwise, we may have overestimated, which will show as
	// cc > hi (thus a negative result). Correction is adding m once.
	const over = gt(cc, hi);
	const under = ~over & (tb | lt(cc, hi));
	add(x, m, over);
	sub(x, m, under);
};


// Multiply two 31-bit integers, with a 62-bit result. This default
// implementation assumes that the basic multiplication operator
// yields constant-time code.
// TODO non const mul?
fn mul31(x: u32, y: u32) u64 = x: u64 * y: u64;
fn mul31_lo(x: u32, y: u32) u32 = (x: u32 * y: u32) & 0x7FFFFFFF;

fn ninv31(x: u32) u32 = {
	let y: u32 = 2 - x;
	y *= 2 - y * x;
	y *= 2 - y * x;
	y *= 2 - y * x;
	y *= 2 - y * x;
	return mux(x & 1, -y, 0) & 0x7FFFFFFF;
};


// Compute d+a*b, result in d. The initial announced bit length of d[]
// MUST match that of a[]. The d[] array MUST be large enough to
// accommodate the full result, plus (possibly) an extra word. The
// resulting announced bit length of d[] will be the sum of the announced
// bit lengths of a[] and b[] (therefore, it may be larger than the actual
// bit length of the numerical result).
//
// a[] and b[] may be the same array. d[] must be disjoint from both a[]
// and b[].
export fn mulacc(d: []word, a: const []word, b: const []word) void = {
	const alen = ewordlen(a);
	const blen = ewordlen(b);

	// We want to add the two bit lengths, but these are encoded,
	// which requires some extra care.
	const dl: u32 = (a[0] & 31) + (b[0] & 31);
	const dh: u32 = (a[0] >> 5) + (b[0] >> 5);
	d[0] = (dh << 5) + dl + (~(dl - 31): u32 >> 31);

	for (let u = 0z; u < blen; u += 1) {
		// Carry always fits on 31 bits; we want to keep it in a
		// 32-bit register on 32-bit architectures (on a 64-bit
		// architecture, cast down from 64 to 32 bits means
		// clearing the high bits, which is not free; on a 32-bit
		// architecture, the same operation really means ignoring
		// the top register, which has negative or zero cost).
		const f: u32 = b[1 + u];
		let cc: u64 = 0; // TODO #if BR_64 u32

		for (let v = 0z; v < alen; v += 1) {

			let z: u64 = d[1 + u + v]: u64 + mul31(f, a[1 + v]) + cc;
			cc = z >> 31;
			d[1 + u + v] = z: word & BITMASK;
		};
		d[1 + u + alen] = cc: u32;
	};
};

// Reduce an integer (a[]) modulo another (m[]). The result is written
// in x[] and its announced bit length is set to be equal to that of m[].
//
// x[] MUST be distinct from a[] and m[].
//
// CT: only announced bit lengths leak, not values of x, a or m.
// TODO it may also leak, if a is < m
export fn reduce(x: []word, a: const []word, m: const []word) void = {
	const m_bitlen = m[0];
	const mlen = ewordlen(m);

	x[0] = m_bitlen;
	if (m_bitlen == 0) {
		return;
	};

	// If the source is shorter, then simply copy all words from a[]
	// and zero out the upper words.
	const a_bitlen = a[0];
	const alen = ewordlen(a);
	if (a_bitlen < m_bitlen) {
		x[1..1 + alen] = a[1..1 + alen];
		for (let u = alen; u < mlen; u += 1) {
			x[u + 1] = 0;
		};
		return;
	};

	// The source length is at least equal to that of the modulus.
	// We must thus copy N-1 words, and input the remaining words
	// one by one.
	x[1..mlen] = a[2 + (alen - mlen).. 1 + mlen];
	x[mlen] = 0;
	for (let u = 1 + alen - mlen; u > 0; u -= 1) {
		muladd_small(x, a[u], m);
	};
};

// Compute a modular exponentiation. x[] MUST be an integer modulo m[]
// (same announced bit length, lower value). m[] MUST be odd. The
// exponent is in big-endian unsigned notation, over 'elen' bytes. The
// "m0i" parameter is equal to -(1/m0) mod 2^31, where m0 is the least
// significant value word of m[] (this works only if m[] is an odd
// integer). The t1[] and t2[] parameters must be temporary arrays,
// each large enough to accommodate an integer with the same size as m[].
export fn modpow(
	x: []word,
	e: const []u8,
	m: const []word,
	m0i: u32,
	t1: []word,
	t2: []word,
) void = {
	// effective word length including the "length" word
	const mlen = ewordlen(m) + 1;

	// Throughout the algorithm:
	// -- t1[] is in Montgomery representation; it contains x, x^2,
	// x^4, x^8...
	// -- The result is accumulated, in normal representation, in
	// the x[] array.
	// -- t2[] is used as destination buffer for each multiplication.
	//
	// Note that there is no need to call br_i32_from_monty().
	// memcpy(t1, x, mlen);

	t1[..mlen] = x[..mlen];
	to_monty(t1, m);
	zero(x, m[0]);
	x[1] = 1;

	for (let k: u32 = 0; k < (len(e): u32 << 3); k += 1) {
		let ctl: u32 = (e[len(e) - 1 - (k >> 3)] >> (k & 7)) & 1;

		montymul(t2, x, t1, m, m0i);
		ccopy(
			ctl,
			(x: *[*]u8)[..len(x) * size(word)], // TODO mlen?
			(t2: *[*]u8)[..len(t2) * size(word)],
		);
		montymul(t2, t1, t1, m, m0i);
		t1[..mlen] = t2[..mlen];
	};
};

// TODO dst len or src len?
fn ccopy(ctl: u32, dst: []u8, src: const []u8) void = {
	for (let i = 0z; i < len(dst); i += 1) {
		const x = src[i];
		const y = dst[i];

		dst[i] = mux(ctl, x, y): u8;
	};
};

// Compute a modular exponentiation. x[] MUST be an integer modulo m[]
// (same announced bit length, lower value). m[] MUST be odd. The
// exponent is in big-endian unsigned notation, over 'elen' bytes. The
// "m0i" parameter is equal to -(1/m0) mod 2^31, where m0 is the least
// significant value word of m[] (this works only if m[] is an odd
// integer). The tmp[] array is used for temporaries, and has size
// 'twlen' words; it must be large enough to accommodate at least two
// temporary values with the same size as m[] (including the leading
// "bit length" word). If there is room for more temporaries, then this
// function may use the extra room for window-based optimisation,
// resulting in faster computations.
//
// Returned value is 1 on success, 0 on error. An error is reported if
// THE PROVIDED TMP[] ARRAY IS TOO SHORT.
// TODO figure out why redundant modpow?
fn modpow_opt(
	x: []word,
	e: []u8, // TODO const?
	m: const []word,
	m0i: u32,
	tmp: []word,
	twlen: size,
) u32 = {
	// Get modulus size.
	let mwlen: size = (m[0] + 63) >> 5;
	const mlen: size = mwlen * size(word);
	mwlen += (mwlen & 1);
	let t1 = tmp;
	let t2 = tmp[mwlen..];

	// Compute possible window size, with a maximum of 5 bits.
	// When the window has size 1 bit, we use a specific code
	// that requires only two temporaries. Otherwise, for a
	// window of k bits, we need 2^k+1 temporaries.
	if (twlen < (mwlen << 1)) {
		return 0;
	};

	let win_len: int = 5;
	for (win_len > 1; win_len -= 1) {
		if (((1u32 << win_len: u32) + 1) * mwlen <= twlen) {
			break;
		};
	};

	// Everything is done in Montgomery representation.
	to_monty(x, m);

	// Compute window contents. If the window has size one bit only,
	// then t2 is set to x; otherwise, t2[0] is left untouched, and
	// t2[k] is set to x^k (for k >= 1).
	if (win_len == 1) {
		//memcpy(t2, x, mlen);
		t2[..mlen] = x[..mlen];
	} else {
		//memcpy(t2 + mwlen, x, mlen);
		t2[mwlen..mwlen + mlen] = x[..mlen];
		let base = t2[mwlen..];
		for (let u = 2z; u < (1u << win_len: uint); u += 1) {
			montymul(base[mwlen..], base, x, m, m0i);
			base = base[mwlen..];
		};
	};

	// We need to set x to 1, in Montgomery representation. This can
	// be done efficiently by setting the high word to 1, then doing
	// one word-sized shift.
	zero(x, m[0]);
	x[(m[0] + 31) >> 5] = 1;
	muladd_small(x, 0, m);

	let elen = len(e);

	// We process bits from most to least significant. At each
	// loop iteration, we have acc_len bits in acc.
	let acc: word = 0;
	let acc_len = 0i;
	for (acc_len > 0 || elen > 0) {
		// Get the next bits.
		let k = win_len;
		if (acc_len < win_len) {
			if (elen > 0) {
				acc = (acc << 8) | e[0];
				e = e[1..];
				elen -= 1;
				acc_len += 8;
			} else {
				k = acc_len;
			};
		};
		// TODO:
		let bits: u32 = (acc: u32 >> (acc_len: u32 - k: u32)): u32 & ((1u32 << k: u32) - 1u32);
		acc_len -= k;

		// We could get exactly k bits. Compute k squarings.
		for (let i = 0z; i < k: size; i += 1) {
			montymul(t1, x, x, m, m0i);
			// memcpy(x, t1, mlen);
			x[..mlen] = t1[..mlen];
		};

		// Window lookup: we want to set t2 to the window
		// lookup value, assuming the bits are non-zero. If
		// the window length is 1 bit only, then t2 is
		// already set; otherwise, we do a constant-time lookup.
		if (win_len > 1) {
			zero(t2, m[0]);
			let base = t2[mwlen..];
			for (let u = 1u32; u < (1 << k): size; u += 1) {
				let mask: u32 = -eq(u, bits);
				for (let v = 1z; v < mwlen; v += 1) {
					t2[v] |= mask & base[v];
				};
				base = base[mwlen..];
			};
		};

		// Multiply with the looked-up value. We keep the
		// product only if the exponent bits are not all-zero.
		montymul(t1, x, t2, m, m0i);
		ccopy(neq(bits, 0), (x: *[*]u8)[..mlen], (t1: *[*]u8)[..mlen]);
	};

	// Convert back from Montgomery representation, and exit.
	from_monty(x, m, m0i);
	return 1;
};

D crypto/bigint/encoding.ha => crypto/bigint/encoding.ha +0 -298
@@ 1,298 0,0 @@
// The following code was initially ported from BearSSL.
//
// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
// 
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// 
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
// 
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

use fmt;
use bytes;
use encoding::hex;

// Allocates a new big integer that will fit given 'bitlen' of bits. A big 
// integer may be allocated statically using the following size:
//
//     def wordlen = ((bitlen + BITSIZE - 1) / BITSIZE) + 1);
//     let i: [wordlen]word = [0...];
//
// The caller must free the value by using [[free]].
export fn bigint(bitlen: size) []word = {
	return alloc([0...], ((bitlen + BITSIZE - 1) / BITSIZE) + 1);
};

// Encode 'src' from its big-endian unsigned encoding into the internal
// representation. The caller must make sure that 'dest' has enough space to
// store 'src'.
export fn encode(dest: []word, src: []u8) void = {
	// The internal representation uses an array of words wheres the first
	// word contains the encodes the announced bit length. The subsequend
	// words will encode the bits in little-endian order. The BITMASK tells
	// what bits are used in the words.

	let acc: u32 = 0;
	let accbits: u32 = 0;
	let didx: size = 1;

	for (let i = len(src) - 1; i < len(src); i -= 1) {
		acc |= (src[i] << accbits);
		accbits += 8;

		if (accbits >= BITSIZE) {
			dest[didx] = (acc & BITMASK): word;
			accbits -= BITSIZE;
			acc >>= BITSIZE;
			didx += 1;
		};
	};

	dest[didx] = acc: word;

	// TODO only store announced bitlength
	dest[0] = countbits(dest[1..]): word;
};

// calculates an encoded bit length of 'n'
fn countbits(n: []word) u32 = {
	let tw: u32 = 0;
	let twk: u32 = 0;

	for (let i = len(n) - 1; i < len(n); i -= 1) {
		const c = eq(tw, 0);
		const w = n[i];
		tw = mux(c, w, tw);
		twk = mux(c, i: u32, twk);
	};

	return (twk << BITLENSHIFT) + word_countbits(tw);
};

fn word_countbits(x: u32) u32 = {
	let k: u32 = neq(x, 0);
	let c: u32 = 0;

	c = gt(x, 0xFFFF);
	x = mux(c, x >> 16, x);
	k += c << 4;

	c = gt(x, 0x00FF);
	x = mux(c, x >>  8, x);
	k += c << 3;

	c = gt(x, 0x000F);
	x = mux(c, x >>  4, x);
	k += c << 2;

	c = gt(x, 0x0003);
	x = mux(c, x >>  2, x);
	k += c << 1;

	k += gt(x, 0x0001);

	return k;
};

fn ebitlen(x: []word) u32 = {
	return x[0];
};

// Get the effective wordlen of 'x'. The effective wordlen is the number of
// words that are filled with effective bits.
fn ewordlen(x: []word) u32 = {
	return (x[0] + BITSIZE) >> BITLENSHIFT;
};

// Decode 'src' into a big-endian unsigned byte array. The caller must ensure
// that 'dest' has enought space to store the decoded value of 'src'.
export fn decode(dest: []u8, src: []word) void = {
	let acc: u64 = 0;
	let accbits: u64 = 0;
	let sidx: size = 1;
	for (let i = len(dest) - 1; i < len(dest); i -= 1) {
		if (accbits < 8) {
			if (sidx < len(src)) {
				acc |= ((src[sidx]: u64) << accbits: u64): u64;
				sidx += 1;
			};
			accbits += BITSIZE;
		};
		dest[i] = acc: u8;
		acc >>= 8;
		accbits -= 8;
	};
};

// Encodes an integer from its big-endian unsigned encoding. 'src' must
// be lower than 'm'. If not 'dest' will be set to 0. 'dest' will have the
// announced bit length of 'm' after decoding.
//
// The return value is 1 if the decoded value fits within 'm' or 0 otherwise.
//
// TODO dest must be same size of m?
// TODO constant time info
export fn encodemod(dest: []word, src: []u8, m: []word) u32 = {
	// Two-pass algorithm: in the first pass, we determine whether the
	// value fits; in the second pass, we do the actual write.
	//
	// During the first pass, 'r' contains the comparison result so
	// far:
	//  0x00000000   value is equal to the modulus
	//  0x00000001   value is greater than the modulus
	//  0xFFFFFFFF   value is lower than the modulus
	//
	// Since we iterate starting with the least significant bytes (at
	// the end of src[]), each new comparison overrides the previous
	// except when the comparison yields 0 (equal).
	//
	// During the second pass, 'r' is either 0xFFFFFFFF (value fits)
	// or 0x00000000 (value does not fit).
	//
	// We must iterate over all bytes of the source, _and_ possibly
	// some extra virtual bytes (with value 0) so as to cover the
	// complete modulus as well. We also add 4 such extra bytes beyond
	// the modulus length because it then guarantees that no accumulated
	// partial word remains to be processed.

	let mlen = 0z, tlen = 0z;

	mlen = ewordlen(m);
	tlen = (mlen << 1); // TODO tlen? it's 2 at i31
	if (tlen < len(src)) {
		tlen = len(src);
	};
	tlen += 4; // TODO why + 4?
	let r: u32 = 0;
	for (let pass = 0z; pass < 2; pass += 1) {
		let v: size = 1;
		let acc: u32 = 0, acc_len: u32 = 0;

		for (let u = 0z; u < tlen; u += 1) {
			let b: u32 = 0;

			if (u < len(src)) {
				b = src[len(src) - 1 - u];
			};
			acc |= (b << acc_len);
			acc_len += 8;
			if (acc_len >= BITSIZE) {
				const xw: u32 = acc & BITMASK;
				acc_len -= BITSIZE;
				acc = b >> (8 - acc_len);
				if (v <= mlen) {
					if (pass == 1) {
						dest[v] = (r & xw): word;
					} else {
						let cc = cmp(xw, m[v]: u32): u32;
						r = mux(eq(cc, 0), r, cc);
					};
				} else {
					if (pass == 0) {
						r = mux(eq(xw, 0), r, 1);
					};
				};
				v += 1;
			};
		};

		// When we reach this point at the end of the first pass:
		// r is either 0, 1 or -1; we want to set r to 0 if it
		// is equal to 0 or 1, and leave it to -1 otherwise.
		// 
		// When we reach this point at the end of the second pass:
		// r is either 0 or -1; we want to leave that value
		// untouched. This is a subcase of the previous.
		// 
		r >>= 1;
		r |= (r << 1);
	};

	dest[0] = m[0];
	return r & 1;
};

fn ds(s: []u8, name: str) void = {
	fmt::printfln("{} (slice): {}", name, hex::encodestr(s))!;
	return;
};

fn dn(s: []word, name: str) void = {
	const n = len(s) * (BITSIZE / 8) + 1; // TODO fix this
	let d: []u8 = alloc([0...], n);
	decode(d, s);
	fmt::printfln("{} (num): {}", name, hex::encodestr(d))!;

	fmt::print("bin: ")!;
	for (let i = len(s) - 1; i > 0 ; i -= 1) {
		fmt::printf("{:031b} ", s[i]: u32)!;
	};
	fmt::println()!;
	fmt::println("bitlen: ", s[0]: u32)!;
	return;
};

export fn encodereduce(dest: []word, src: []u8, m: []word) void = {
	// Get the encoded bit length.
	const m_ebitlen = m[0];

	// Special case for an invalid (null) modulus.
	if (m_ebitlen == 0) {
		dest[0] = 0;
		return;
	};

	zero(dest, m_ebitlen);

	// First decode directly as many bytes as possible. This requires
	// computing the actual bit length.
	let m_rbitlen = m_ebitlen >> 5;
	let m_rbitlen = (m_ebitlen & 31) + (m_rbitlen << 5) - m_rbitlen;
	const mblen: size = (m_rbitlen + 7) >> 3;
	let k: size = mblen - 1;
	if (k >= len(src)) {
		encode(dest, src);
		dest[0] = m_ebitlen;
		return;
	};
	encode(dest, src[..k]);
	dest[0] = m_ebitlen;

	// Input remaining bytes, using 31-bit words.
	let acc: word = 0;
	let acc_len: word = 0;
	for (k < len(src)) {
		const v = src[k];
		k += 1;
		if (acc_len >= 23) {
			acc_len -= 23;
			acc <<= (8 - acc_len);
			acc |= v >> acc_len;
			muladd_small(dest, acc, m);
			acc = v & (0xFF >> (8 - acc_len));
		} else {
			acc = (acc << 8) | v;
			acc_len += 8;
		};
	};

	// We may have some bits accumulated. We then perform a shift to
	// be able to inject these bits as a full 31-bit word.
	if (acc_len != 0) {
		acc = (acc | (dest[1] << acc_len)) & BITMASK;
		rshift(dest, 31 - acc_len);
		muladd_small(dest, acc, m);
	};
};

D crypto/bigint/moddiv.ha => crypto/bigint/moddiv.ha +0 -411
@@ 1,411 0,0 @@
// The following code was initially ported from BearSSL.
//
// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
// 
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// 
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
// 
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

// In this file, we handle big integers with a custom format, i.e.
// without the usual one-word header. Value is split into 31-bit words,
// each stored in a 32-bit slot (top bit is zero) in little-endian
// order. The length (in words) is provided explicitly. In some cases,
// the value can be negative (using two's complement representation). In
// some cases, the top word is allowed to have a 32th bit.

// Negate big integer conditionally. The value consists of 'len' words,
// with 31 bits in each word (the top bit of each word should be 0,
// except possibly for the last word). If 'ctl' is 1, the negation is
// computed; otherwise, if 'ctl' is 0, then the value is unchanged.
fn cond_negate(a: []word, alen: size, ctl: u32) void = {
	let cc = ctl;
	const xm: u32 = -ctl >> 1;
	for (let k = 0z; k < alen; k += 1) {
		let aw = a[k];
		aw = (aw ^ xm) + cc;
		a[k] = aw & 0x7FFFFFFF;
		cc = aw >> 31;
	};
};

// Finish modular reduction. Rules on input parameters:
// 
//   if neg = 1, then -m <= a < 0
//   if neg = 0, then 0 <= a < 2*m
// 
// If neg = 0, then the top word of a[] may use 32 bits.
// 
// Also, modulus m must be odd.
fn finish_mod(a: []word, alen: size, m: const []word, neg: u32) void = {
	// First pass: compare a (assumed nonnegative) with m.
	// Note that if the final word uses the top extra bit, then
	// subtracting m must yield a value less than 2^31, since we
	// assumed that a < 2*m.
	let cc: u32 = 0;
	for (let k = 0z; k < alen; k += 1) {
		const aw = a[k];
		const mw = m[k];
		cc = (aw - mw - cc) >> 31;
	};

	// At this point:
	//   if neg = 1, then we must add m (regardless of cc)
	//   if neg = 0 and cc = 0, then we must subtract m
	//   if neg = 0 and cc = 1, then we must do nothing
	const xm: u32 = -neg >> 1;
	const ym: u32 = -(neg | (1 - cc));
	cc = neg;
	for (let k = 0z; k < alen; k += 1) {
		let aw = a[k];
		const mw = (m[k] ^ xm) & ym;
		aw = aw - mw - cc;
		a[k] = aw & 0x7FFFFFFF;
		cc = aw >> 31;
	};
};

// Compute:
//   a <- (a*pa+b*pb)/(2^31)
//   b <- (a*qa+b*qb)/(2^31)
// The division is assumed to be exact (i.e. the low word is dropped).
// If the final a is negative, then it is negated. Similarly for b.
// Returned value is the combination of two bits:
//   bit 0: 1 if a had to be negated, 0 otherwise
//   bit 1: 1 if b had to be negated, 0 otherwise
//
// Factors pa, pb, qa and qb must be at most 2^31 in absolute value.
// Source integers a and b must be nonnegative; top word is not allowed
// to contain an extra 32th bit.
fn co_reduce(
	a: []word,
	b: []word,
	alen: size, // TODO
	pa: i64,
	pb: i64,
	qa: i64,
	qb: i64,
) u32 = {
	let cca: i64 = 0;
	let ccb: i64 = 0;
	for (let k = 0z; k < alen; k += 1) {
		// Since:
		//   |pa| <= 2^31
		//   |pb| <= 2^31
		//   0 <= wa <= 2^31 - 1
		//   0 <= wb <= 2^31 - 1
		//   |cca| <= 2^32 - 1
		// Then:
		//   |za| <= (2^31-1)*(2^32) + (2^32-1) = 2^63 - 1
		//
		// Thus, the new value of cca is such that |cca| <= 2^32 - 1.
		// The same applies to ccb.
		const wa = a[k];
		const wb = b[k];
		const za: u64 = wa * pa: u64 + wb * pb: u64 + cca: u64;
		const zb: u64 = wa * qa: u64 + wb * qb: u64 + ccb: u64;
		if (k > 0) {
			a[k - 1] = za: word & 0x7FFFFFFF;
			b[k - 1] = zb: word & 0x7FFFFFFF;
		};

		// For the new values of cca and ccb, we need a signed
		// right-shift; since, in C, right-shifting a signed
		// negative value is implementation-defined, we use a
		// custom portable sign extension expression.
		const m: u64 = 1 << 32;
		let tta: u64 = za >> 31;
		let ttb: u64 = zb >> 31;
		tta = (tta ^ m) - m;
		ttb = (ttb ^ m) - m;
		// cca = *(int64_t *)&tta;
		// ccb = *(int64_t *)&ttb;
		cca = tta: i64; // TODO test this
		ccb = ttb: i64;
	};
	a[alen - 1] = cca: u32;
	b[alen - 1] = ccb: u32;

	const nega: u32 = (cca: u64 >> 63): u32;
	const negb: u32 = (ccb: u64 >> 63): u32;
	cond_negate(a, alen, nega);
	cond_negate(b, alen, negb);
	return nega | (negb << 1);
};

// Compute:
//   a <- (a*pa+b*pb)/(2^31) mod m
//   b <- (a*qa+b*qb)/(2^31) mod m
//
// m0i is equal to -1/m[0] mod 2^31.
//
// Factors pa, pb, qa and qb must be at most 2^31 in absolute value.
// Source integers a and b must be nonnegative; top word is not allowed
// to contain an extra 32th bit.
fn co_reduce_mod(
	a: []word,
	b: []word,
	alen: size, // TODO
	pa: i64,
	pb: i64,
	qa: i64,
	qb: i64,
	m: const []word,
	m0i: u32,
) void = {
	let cca: i64 = 0;
	let ccb: i64 = 0;
	let fa: u32 = ((a[0] * pa: u32 + b[0] * pb: u32) * m0i) & 0x7FFFFFFF;
	let fb: u32 = ((a[0] * qa: u32 + b[0] * qb: u32) * m0i) & 0x7FFFFFFF;
	for (let k = 0z; k < alen; k += 1) {

		// In this loop, carries 'cca' and 'ccb' always fit on
		// 33 bits (in absolute value).
		let wa = a[k];
		let wb = b[k];
		let za: u64 = wa * pa: u64 + wb * pb: u64
			+ m[k] * fa: u64 + cca: u64;
		let zb: u64 = wa * qa: u64 + wb * qb: u64
			+ m[k] * fb: u64 + ccb: u64;
		if (k > 0) {
			a[k - 1] = za: word & 0x7FFFFFFF;
			b[k - 1] = zb: word & 0x7FFFFFFF;
		};

		const m: u64 = 1 << 32;
		let tta: u64 = za >> 31;
		let ttb: u64 = zb >> 31;
		tta = (tta ^ m) - m;
		ttb = (ttb ^ m) - m;
		// cca = *(int64_t *)&tta;
		// ccb = *(int64_t *)&ttb;
		cca = tta: i64; // TODO test this
		ccb = ttb: i64;
	};
	a[alen - 1] = cca: u32;
	b[alen - 1] = ccb: u32;

	// At this point:
	//   -m <= a < 2*m
	//   -m <= b < 2*m
	// (this is a case of Montgomery reduction)
	// The top word of 'a' and 'b' may have a 32-th bit set.
	// We may have to add or subtract the modulus.
	finish_mod(a, alen, m, (cca: u64 >> 63): u32);
	finish_mod(b, alen, m, (ccb: u64 >> 63): u32);
};

fn moddiv(
	x: []word,
	y: const []word,
	m: const []word,
	m0i: u32,
	t: []word,
) u32 = {
	// Algorithm is an extended binary GCD. We maintain four values
	// a, b, u and v, with the following invariants:
	//
	//   a * x = y * u mod m
	//   b * x = y * v mod m
	//
	// Starting values are:
	//
	//   a = y
	//   b = m
	//   u = x
	//   v = 0
	//
	// The formal definition of the algorithm is a sequence of steps:
	//
	//   - If a is even, then a <- a/2 and u <- u/2 mod m.
	//   - Otherwise, if b is even, then b <- b/2 and v <- v/2 mod m.
	//   - Otherwise, if a > b, then a <- (a-b)/2 and u <- (u-v)/2 mod m.
	//   - Otherwise, b <- (b-a)/2 and v <- (v-u)/2 mod m.
	//
	// Algorithm stops when a = b. At that point, they both are equal
	// to GCD(y,m); the modular division succeeds if that value is 1.
	// The result of the modular division is then u (or v: both are
	// equal at that point).
	//
	// Each step makes either a or b shrink by at least one bit; hence,
	// if m has bit length k bits, then 2k-2 steps are sufficient.
	//
	//
	// Though complexity is quadratic in the size of m, the bit-by-bit
	// processing is not very efficient. We can speed up processing by
	// remarking that the decisions are taken based only on observation
	// of the top and low bits of a and b.
	//
	// In the loop below, at each iteration, we use the two top words
	// of a and b, and the low words of a and b, to compute reduction
	// parameters pa, pb, qa and qb such that the new values for a
	// and b are:
	//
	//   a' = (a*pa + b*pb) / (2^31)
	//   b' = (a*qa + b*qb) / (2^31)
	//
	// the division being exact.
	//
	// Since the choices are based on the top words, they may be slightly
	// off, requiring an optional correction: if a' < 0, then we replace
	// pa with -pa, and pb with -pb. The total length of a and b is
	// thus reduced by at least 30 bits at each iteration.
	//
	// The stopping conditions are still the same, though: when a
	// and b become equal, they must be both odd (since m is odd,
	// the GCD cannot be even), therefore the next operation is a
	// subtraction, and one of the values becomes 0. At that point,
	// nothing else happens, i.e. one value is stuck at 0, and the
	// other one is the GCD.

	const mlen: size = (m[0] + 31) >> 5;
	let a = t;
	let b = a[mlen..];
	let u = x[1..];
	let v = b[mlen..];
	//memcpy(a, y + 1, mlen * sizeof *y);
	a[..mlen] = y[1..mlen + 1];
	// memcpy(b, m + 1, mlen * sizeof *m);
	b[..mlen] = m[1..mlen + 1];
	// memset(v, 0, mlen * sizeof *v);
	v = [0...];

	// Loop below ensures that a and b are reduced by some bits each,
	// for a total of at least 30 bits.
	for (let num: u32 = ((m[0] - (m[0] >> 5)) << 1) + 30; num >= 30; num -= 30) {
		// Extract top words of a and b. If j is the highest
		// index >= 1 such that a[j] != 0 or b[j] != 0, then we want
		// (a[j] << 31) + a[j - 1], and (b[j] << 31) + b[j - 1].
		// If a and b are down to one word each, then we use a[0]
		// and b[0].
		let c0: u32 = -1: u32;
		let c1 = -1: u32;
		let a0: u32 = 0;
		let a1: u32 = 0;
		let b0: u32 = 0;
		let b1: u32 = 0;
		let j = mlen;
		for (j > 0; j -= 1) {
			const aw = a[j];
			const bw = b[j];
			a0 ^= (a0 ^ aw) & c0;
			a1 ^= (a1 ^ aw) & c1;
			b0 ^= (b0 ^ bw) & c0;
			b1 ^= (b1 ^ bw) & c1;
			c1 = c0;
			c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - 1: u32;
		};

		// If c1 = 0, then we grabbed two words for a and b.
		// If c1 != 0 but c0 = 0, then we grabbed one word. It
		// is not possible that c1 != 0 and c0 != 0, because that
		// would mean that both integers are zero.
		a1 |= a0 & c1;
		a0 &= ~c1;
		b1 |= b0 & c1;
		b0 &= ~c1;
		let a_hi: u64 = (a0: u64 << 31) + a1;
		let b_hi: u64 = (b0: u64 << 31) + b1;
		let a_lo = a[0];
		let b_lo = b[0];

		// Compute reduction factors:
		//
		//   a' = a*pa + b*pb
		//   b' = a*qa + b*qb
		//
		// such that a' and b' are both multiple of 2^31, but are
		// only marginally larger than a and b.
		let pa: i64 = 1;
		let pb: i64 = 0;
		let qa: i64 = 0;
		let qb: i64 = 1;
		for (let i = 0i; i < 31; i += 1) {
			// At each iteration:
			//
			//   a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
			//   b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
			//   a <- a/2 if: a is even
			//   b <- b/2 if: a is odd, b is even
			//
			// We multiply a_lo and b_lo by 2 at each
			// iteration, thus a division by 2 really is a
			// non-multiplication by 2.

			// r = GT(a_hi, b_hi)
			// But the GT() function works on uint32_t operands,
			// so we inline a 64-bit version here.
			let rz: u64 = b_hi - a_hi;
			const r: u32 = ((rz ^ ((a_hi ^ b_hi)
				& (a_hi ^ rz))) >> 63): u32;

			// cAB = 1 if b must be subtracted from a
			// cBA = 1 if a must be subtracted from b
			// cA = 1 if a is divided by 2, 0 otherwise
			//
			// Rules:
			//
			//   cAB and cBA cannot be both 1.
			//   if a is not divided by 2, b is.
			let oa: word = (a_lo >> i: word): word & 1;
			let ob: word = (b_lo >> i: word): word & 1;
			let cAB: word = oa & ob & r;
			let cBA: word = oa & ob & not(r);
			let cA: word = cAB | not(oa);

			// Conditional subtractions.
			a_lo -= b_lo & -cAB;
			a_hi -= b_hi & -cAB: u64;
			pa -= qa & -cAB: i64;
			pb -= qb & -cAB: i64;
			b_lo -= a_lo & -cBA;
			b_hi -= a_hi & -cBA: u64;
			qa -= pa & -cBA: i64;
			qb -= pb & -cBA: i64;

			// Shifting.
			a_lo += a_lo & (cA - 1);
			pa += pa & (cA: i64 - 1);
			pb += pb & (cA: i64 - 1);
			a_hi ^= (a_hi ^ (a_hi >> 1)) & -cA: u64;
			b_lo += b_lo & -cA;
			qa += qa & -cA: i64;
			qb += qb & -cA: i64;
			b_hi ^= (b_hi ^ (b_hi >> 1)) & (cA: u64 - 1);
		};

		// Replace a and b with new values a' and b'.
		const r: u32 = co_reduce(a, b, mlen, pa, pb, qa, qb);
		pa -= pa * ((r & 1) << 1): i64;
		pb -= pb * ((r & 1) << 1): i64;
		qa -= qa * (r & 2): i64;
		qb -= qb * (r & 2): i64;
		co_reduce_mod(u, v, mlen, pa, pb, qa, qb, m[1..], m0i);
	};

	// Now one of the arrays should be 0, and the other contains
	// the GCD. If a is 0, then u is 0 as well, and v contains
	// the division result.
	// Result is correct if and only if GCD is 1.
	let r = (a[0] | b[0]) ^ 1;
	u[0] |= v[0];
	for (let k = 1z; k < mlen; k += 1) {
		r |= a[k] | b[k];
		u[k] |= v[k];
	};
	return eq0(r: i32);
};


D crypto/bigint/monty.ha => crypto/bigint/monty.ha +0 -151
@@ 1,151 0,0 @@
// The following code was initially ported from BearSSL.
//
// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
// 
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// 
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
// 
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

// Convert a modular integer to Montgomery representation. The integer x[]
// MUST be lower than m[], but with the same announced bit length.
fn to_monty(dest: []word, m: const []word) void = {
	for (let k: u32 = (m[0] + 31) >> 5; k > 0; k -= 1) {
		muladd_small(dest, 0, m);
	};
};

// Convert a modular integer back from Montgomery representation. The
// integer x[] MUST be lower than m[], but with the same announced bit
// length. The "m0i" parameter is equal to -(1/m0) mod 2^32, where m0 is
// the least significant value word of m[] (this works only if m[] is
// an odd integer).
// TODO mention ::ninv
fn from_monty(x: []word, m: const []word, m0i: u32) void = {
	assert(m[1] & 1 == 1, "m must be an odd integer");

	const mlen = (m[0] + 31) >> 5;
	for (let u = 0z; u < mlen; u += 1) {
		const f: u32 = mul31_lo(x[1], m0i);
		let cc: u64 = 0;
		for (let v = 0z; v < mlen; v += 1) {
			const z: u64 = x[v + 1]: u64 + mul31(f, m[v + 1]) + cc;
			cc = z >> 31;
			if (v != 0) {
				x[v] = z: u32 & 0x7FFFFFFF;
			};
		};
		x[mlen] = cc: u32;
	};

	// We may have to do an extra subtraction, but only if the
	// value in x[] is indeed greater than or equal to that of m[],
	// which is why we must do two calls (first call computes the
	// carry, second call performs the subtraction only if the carry
	// is 0).
	sub(x, m, not(sub(x, m, 0)));
};

// Compute a modular Montgomery multiplication. d[] is filled with the
// value of x*y/R modulo m[] (where R is the Montgomery factor). The
// array d[] MUST be distinct from x[], y[] and m[]. x[] and y[] MUST be
// numerically lower than m[]. x[] and y[] MAY be the same array. The
// "m0i" parameter is equal to -(1/m0) mod 2^32, where m0 is the least
// significant value word of m[] (this works only if m[] is an odd
// integer).
fn montymul(
	d: []word,
	x: const []word,
	y: const []word,
	m: const []word,
	m0i: u32
) void = {
	// Each outer loop iteration computes:
	//   d <- (d + xu*y + f*m) / 2^31
	// We have xu <= 2^31-1 and f <= 2^31-1.
	// Thus, if d <= 2*m-1 on input, then:
	//   2*m-1 + 2*(2^31-1)*m <= (2^32)*m-1
	// and the new d value is less than 2*m.
	//
	// We represent d over 31-bit words, with an extra word 'dh'
	// which can thus be only 0 or 1.
	let v: size = 0;

	const mlen = (m[0] + 31) >> 5;
	const len4 = mlen & ~3: size;
	zero(d, m[0]);
	let dh: u32 = 0;
	for (let u = 0z; u < mlen; u += 1) {
		// The carry for each operation fits on 32 bits:
		//   d[v+1] <= 2^31-1
		//   xu*y[v+1] <= (2^31-1)*(2^31-1)
		//   f*m[v+1] <= (2^31-1)*(2^31-1)
		//   r <= 2^32-1
		//   (2^31-1) + 2*(2^31-1)*(2^31-1) + (2^32-1) = 2^63 - 2^31
		// After division by 2^31, the new r is then at most 2^32-1
		//
		// Using a 32-bit carry has performance benefits on 32-bit
		// systems; however, on 64-bit architectures, we prefer to
		// keep the carry (r) in a 64-bit register, thus avoiding some
		// "clear high bits" operations.

		const xu: u32 = x[u + 1];
		const f: u32 = mul31_lo((d[1] + mul31_lo(x[u + 1], y[1])), m0i);

		let r: u64 = 0; // TODO if !BR_64 u32
		v = 0;
		for (v < len4; v += 4) {
			let z: u64 = d[v + 1]: u64 + mul31(xu, y[v + 1])
				+ mul31(f, m[v + 1]) + r;
			r = z >> 31;
			d[v + 0] = z: u32 & 0x7FFFFFFF;
			z = d[v + 2]: u64 + mul31(xu, y[v + 2])
				+ mul31(f, m[v + 2]) + r;
			r = z >> 31;
			d[v + 1] = z: u32 & 0x7FFFFFFF;
			z = d[v + 3]: u64 + mul31(xu, y[v + 3])
				+ mul31(f, m[v + 3]) + r;
			r = z >> 31;
			d[v + 2] = z: u32 & 0x7FFFFFFF;
			z = d[v + 4]: u64 + mul31(xu, y[v + 4])
				+ mul31(f, m[v + 4]) + r;
			r = z >> 31;
			d[v + 3] = z: u32 & 0x7FFFFFFF;
		};
		for (v < mlen; v += 1) {
			const z: u64 = d[v + 1]: u64 + mul31(xu, y[v + 1])
				+ mul31(f, m[v + 1]) + r;
			r = z >> 31;
			d[v] = z: u32 & 0x7FFFFFFF;
		};

		// Since the new dh can only be 0 or 1, the addition of
		// the old dh with the carry MUST fit on 32 bits, and
		// thus can be done into dh itself.
		dh += r: u32;
		d[mlen] = dh & 0x7FFFFFFF;
		dh >>= 31;
	};

	// We must write back the bit length because it was overwritten in
	// the loop (not overwriting it would require a test in the loop,
	// which would yield bigger and slower code).
	d[0] = m[0];

	// d[] may still be greater than m[] at that point; notably, the
	// 'dh' word may be non-zero.
	sub(d, m, neq(dh, 0) | not(sub(d, m, 0)));
};

D crypto/bigint/types+ct32.ha => crypto/bigint/types+ct32.ha +0 -5
@@ 1,5 0,0 @@
def BITSIZE: u32 = 15;
def BITMASK: u32 = 0x7FFF;
def BITLENSHIFT: u32 = 4;

export type word = u16;

D crypto/bigint/types.ha => crypto/bigint/types.ha +0 -5
@@ 1,5 0,0 @@
export type word = u32;

def BITSIZE: u32 = 31;
def BITMASK: word = 0x7FFFFFFF;
def BITLENSHIFT: u32 = 5;

D crypto/bigint/util.ha => crypto/bigint/util.ha +0 -120
@@ 1,120 0,0 @@
// The following code was initially ported from BearSSL.
//
// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
// 
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// 
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
// 
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
use fmt;
use bytes;

// returns x if ctl == 1 and y if ctl == 0
fn mux(ctl: u32, x: u32, y: u32) u32 = y ^ ((-(ctl: i32)): u32 & (x ^ y));

@test fn mux() void = {
	assert(mux(1, 0x4, 0xff) == 0x4);
	assert(mux(0, 0x4, 0xff) == 0xff);
};

fn not(x: u32) u32 = x ^ 1;

fn eq(x: u32, y: u32) u32 = {
	let q = x ^ y;
	return ((q | -(q: i32): u32) >> 31) ^ 1;
};

@test fn eq() void = {
	assert(eq(0x4f, 0x4f) == 1);
	assert(eq(0x4f, 0x0) == 0);
	assert(eq(0x2, 0x6) == 0);
};

fn eq0(x: i32) u32 = {
	const q: u32 = x: u32;
	return ~(q | -q) >> 31;
};

@test fn eq0() void = {
	assert(eq0(0) == 1);
	assert(eq0(1) == 0);
	assert(eq0(0x1234) == 0);
};


// Returns 1 if x != y and 0 otherwise.
fn neq(x: u32, y: u32) u32 = {
	let q = x ^ y;
	return (q | -(q: i32): u32) >> 31;
};

fn gt(x: u32, y: u32) u32 = {
	let z: u32 = y - x;
	return (z ^ ((x ^ y) & (x ^ z))) >> 31;
};

@test fn gt() void = {
	assert(gt(1, 0) == 1);
	assert(gt(0, 1) == 0);
	assert(gt(0, 0) == 0);

	assert(gt(0xf3, 0xf2) == 1);
	assert(gt(0x20, 0xff) == 0);
	assert(gt(0x23, 0x23) == 0);
};

fn ge(x: u32, y: u32) u32 = not(gt(y, x));
fn lt(x: u32, y: u32) u32 = gt(y, x);
fn le(x: u32, y: u32) u32 = not(gt(x, y));

fn cmp(x: u32, y: u32) i32 = gt(x, y): i32 | -(gt(y, x): i32);

@test fn cmp() void = {
	assert(cmp(0, 0) == 0);
	assert(cmp(0x34, 0x34) == 0);

	assert(cmp(0x12, 0x34) == -1);
	assert(cmp(0x87, 0x34) == 1);
};

// Zeroes effective words of 'x' and sets its encoded bitlen 'ebitlen'.
export fn zero(x: []word, ebitlen: word) void = {
	x[0] = ebitlen;
	const ewordlen = (ebitlen + BITSIZE) >> BITLENSHIFT;
	bytes::zero((x: *[*]u8)[1..(ewordlen + 1) * size(word)]);
};

fn iszero(x: []word) u32 = {
	let z: u32 = 0;

	for (let i = ewordlen(x); i > 0; i -= 1) {
		z |= x[i];
	};
	return ~(z | -(z: i32): u32) >> 31;
};

@test fn iszero() void = {
	let x = fromhex("210032a0");
	let y = fromhex("00000000");

	assert(iszero(x) == 0);
	assert(iszero(y) == 1);

	free(x);
	free(y);
};