Commit 1d073430 authored by Mike Hamburg's avatar Mike Hamburg
Browse files

p521 testing, 803kcy ecdh

parent 65466601
October 27, 2014:
Added more support for >512-bit primes. Changed shared secret
to not overflow the buffer in this case. Changed hashing to
SHA512-PRNG; this doesn't change the behavior in the case that
only one block is required.
E-521 appears to be working. Needs more testing, and maybe some
careful analysis since the carry-handling bounds are awfully tight
under the current analysis (the "< 5<<57" bound asserted in the
#if 0 blocks is not actually tight enough; it needs to be
< (1+epsilon) << 59).
So you actually do need to reduce often, at least in the x86_64_r12
version.
p521/arch_ref64: simple and relatively slow impl. Like
p448/arch_ref64, this arch reduces after every add or sub.
p521/arch_x86_64_r12: aggressive, fast implementation. This impl
stores 521 bits not in 9 limbs, but 12! Limbs 3,7,11 are 0, and
are there only for vector alignment. (TODO: remove those limbs
from precomputed tables, so that we don't have to look them up!).
The carry handling on this build is very tight, and probably could
stand more analysis. This is why I have the careful balancing of
"hexad" and "nonad" multiplies in its Chung-Hasan mul routine.
The 'r12 build is a work in progress, and currently only works on
clang (because it rearranges vectors in the timesW function).
Timings for the fast, aggressive arch on Haswell:
mul: 146cy
sqr: 111cy
invert: 62kcy
keygen: 270kcy
ecdh: 803kcy
sign: 283kcy
verif: 907kcy
Same rules as other Goldi benchmarks. Turbo off, HT off,
timing-channel protected (no dataflow from secrets to branches,
memory lookups or known vt instructions), compressed points.
October 23, 2014:
Pushing through changes for curve flexibility. First up is
Ed480-Ridinghood, because it has the same number of words. Next
......
......@@ -12,7 +12,7 @@
#include "ec_point.h"
#include "magic.h"
#define is32 (GOLDI_BITS == 32 || FIELD_BITS == 480)
#define is32 (GOLDI_BITS == 32 || FIELD_BITS != 448)
/* TODO XXX PERF FIXME: better detection of overflow conditions */
/* I wanted to just use if (is32)
......
......@@ -243,6 +243,8 @@ goldilocks_shared_secret_core (
const struct goldilocks_public_key_t *your_pubkey,
const struct goldilocks_precomputed_public_key_t *pre
) {
uint8_t gxy[GOLDI_FIELD_BYTES];
/* This function doesn't actually need anything in goldilocks_global,
* so it doesn't check init.
*/
......@@ -277,7 +279,7 @@ goldilocks_shared_secret_core (
#endif
field_serialize(shared,&pk);
field_serialize(gxy,&pk);
/* obliterate records of our failure by adjusting with obliteration key */
struct sha512_ctx_t ctx;
......@@ -305,7 +307,7 @@ goldilocks_shared_secret_core (
#endif
/* stir in the shared key and finish */
sha512_update(&ctx, shared, GOLDI_FIELD_BYTES);
sha512_update(&ctx, gxy, GOLDI_FIELD_BYTES);
sha512_final(&ctx, shared);
return (GOLDI_ECORRUPT & ~msucc)
......
......@@ -154,6 +154,14 @@ typedef word_t vecmask_t __attribute__((vector_size(32)));
return (big_register_t)x;
}
#endif
/*
 * Wrappers that drop the natural alignment requirement of the wide vector
 * types, so values can be loaded/stored at arbitrary addresses.
 * NOTE(review): relies on the GCC/Clang packed-struct extension applied to
 * vector members -- confirm the supported compilers honor this.
 */
typedef struct {
uint64xn_t unaligned;
} __attribute__((packed)) unaligned_uint64xn_t;
typedef struct {
uint32xn_t unaligned;
} __attribute__((packed)) unaligned_uint32xn_t;
/**
* Return -1 if x==0, and 0 otherwise.
......
......@@ -4,41 +4,20 @@
#include "p521.h"
/* Three limbs carried in a 4-lane vector; the fourth lane is unused padding. */
typedef uint64x4_t uint64x3_t; /* fit it in a vector register */
/* Per-lane mask selecting the low 58 bits of each of the three active lanes. */
static const uint64x3_t mask58 = { (1ull<<58) - 1, (1ull<<58) - 1, (1ull<<58) - 1, 0 };
/* Double-width partial product: low and high halves, three limbs each. */
typedef struct {
uint64x3_t lo, hi;
} hexad_t;
/*
 * Multiply by the radix weight "W" via a lane permutation plus add.
 * Uses Clang's OpenCL-style vector swizzles (u.zxyw, u.zwww), so this
 * currently requires CLANG. Sorry.
 * NOTE(review): the exact weight semantics depend on the limb schedule
 * defined elsewhere in this file -- not verifiable from this chunk.
 */
static inline uint64x3_t timesW (uint64x3_t u) {
return u.zxyw + u.zwww;
}
/* Store the three active 64-bit lanes of v to x[0..2]; the masked-off
* fourth lane leaves x[3] untouched. Currently requires AVX2 for the
* masked store (TODO: remove) */
static const uint64x4_t ls_mask_3 = { -1ull, -1ull, -1ull, 0 };
static void store3 (uint64_t *x, uint64x3_t v) {
_mm256_maskstore_epi64((long long *) x, ls_mask_3, v);
}
uint64x3_t lo, hi, hier;
} nonad_t;
/*
 * Branchless zero test: returns the all-ones mask (~0ull) when a == 0,
 * and 0 otherwise.  (In 128 bits, 0-1 underflows to 2^128-1, whose high
 * half is all ones; for any a > 0 the high half of a-1 is 0.)
 */
static __inline__ uint64_t is_zero(uint64_t a) {
/* let's hope the compiler isn't clever enough to optimize this
* back into a branch -- the branchless form is the point (timing). */
return (((__uint128_t)a)-1)>>64;
}
/*
 * Full-width product of two unsigned 64-bit operands.
 * Returns the exact 128-bit value a*b, with no truncation.
 */
static __inline__ __uint128_t widemul(
    const uint64_t a,
    const uint64_t b
) {
    __uint128_t wide = a;
    wide *= b;
    return wide;
}
static inline __uint128_t widemulu(const uint64_t a, const uint64_t b) {
static inline __uint128_t widemulu(uint64_t a, uint64_t b) {
return ((__uint128_t)(a)) * b;
}
static inline __int128_t widemuls(const int64_t a, const int64_t b) {
static inline __int128_t widemuls(int64_t a, int64_t b) {
return ((__int128_t)(a)) * b;
}
......@@ -48,8 +27,9 @@ static inline uint64_t opacify(uint64_t x) {
return x;
}
static inline void hexad_mul (
hexad_t *hex,
/* These used to be hexads, leading to 10% better performance, but there were overflow issues */
static inline void nonad_mul (
nonad_t *hex,
const uint64_t *a,
const uint64_t *b
) {
......@@ -76,15 +56,13 @@ static inline void hexad_mul (
lo = { (uint64_t)(xu), (uint64_t)(xv), (uint64_t)(xw), 0 },
hi = { (uint64_t)(xu>>64), (uint64_t)(xv>>64), (uint64_t)(xw>>64), 0 };
hi = hi<<6 | lo>>58;
lo &= mask58;
hex->lo = lo;
hex->hi = hi;
hex->hier = hi>>52;
hex->hi = (hi<<12)>>6 | lo>>58;
hex->lo = lo & mask58;
}
static inline void hexad_mul_signed (
hexad_t *hex,
nonad_t *hex,
const int64_t *a,
const int64_t *b
) {
......@@ -111,15 +89,18 @@ static inline void hexad_mul_signed (
lo = { (uint64_t)(xu), (uint64_t)(xv), (uint64_t)(xw), 0 },
hi = { (uint64_t)(xu>>64), (uint64_t)(xv>>64), (uint64_t)(xw>>64), 0 };
hi = hi<<6 | lo>>58;
lo &= mask58;
hex->lo = lo;
hex->hi = hi;
/*
hex->hier = (uint64x4_t)((int64x4_t)hi>>52);
hex->hi = (hi<<12)>>6 | lo>>58;
hex->lo = lo & mask58;
*/
hex->hi = hi<<6 | lo>>58;
hex->lo = lo & mask58;
}
static inline void hexad_sqr (
hexad_t *hex,
static inline void nonad_sqr (
nonad_t *hex,
const uint64_t *a
) {
__uint128_t xu, xv, xw;
......@@ -143,15 +124,13 @@ static inline void hexad_sqr (
lo = { (uint64_t)(xu), (uint64_t)(xv), (uint64_t)(xw), 0 },
hi = { (uint64_t)(xu>>64), (uint64_t)(xv>>64), (uint64_t)(xw>>64), 0 };
hi = hi<<6 | lo>>58;
lo &= mask58;
hex->lo = lo;
hex->hi = hi;
hex->hier = hi>>52;
hex->hi = (hi<<12)>>6 | lo>>58;
hex->lo = lo & mask58;
}
static inline void hexad_sqr_signed (
hexad_t *hex,
nonad_t *hex,
const int64_t *a
) {
__uint128_t xu, xv, xw;
......@@ -175,11 +154,15 @@ static inline void hexad_sqr_signed (
lo = { (uint64_t)(xu), (uint64_t)(xv), (uint64_t)(xw), 0 },
hi = { (uint64_t)(xu>>64), (uint64_t)(xv>>64), (uint64_t)(xw>>64), 0 };
hi = hi<<6 | lo>>58;
lo &= mask58;
hex->lo = lo;
hex->hi = hi;
/*
hex->hier = (uint64x4_t)((int64x4_t)hi>>52);
hex->hi = (hi<<12)>>6 | lo>>58;
hex->lo = lo & mask58;
*/
hex->hi = hi<<6 | lo>>58;
hex->lo = lo & mask58;
}
......@@ -190,51 +173,83 @@ p521_mul (
const p521_t *as,
const p521_t *bs
) {
int i;
#if 0
assert(as->limb[3] == 0 && as->limb[7] == 0 && as->limb[11] == 0);
assert(bs->limb[3] == 0 && bs->limb[7] == 0 && bs->limb[11] == 0);
for (i=0; i<12; i++) {
assert(as->limb[i] < 5ull<<57);
assert(bs->limb[i] < 5ull<<57);
}
#endif
/* Bounds on the hexads and nonads.
*
* Limbs < 2<<58 + ep.
* Nonad mul < 1<<58, 1<<58, tiny
* -> t0 < (3,2,2)<<58 + tiny
* t1,t2 < 2<<58 + tiny
* * w < (4,2,2)
* Hexad mul < +- (5,4,3) * 4<<116 -> 2^58 lo, +- (5,4,3) * 4<<58+ep
* TimesW < (2,1,1)<<58, (6,5,4)*4<<58 + ep
* ot2 = t0 + timesW(t2 + t1 - acdf.hi - bcef.lo);
== (3,2,2) + (4,2,2) + (4,2,2) +- (6,5,4)*4 - (1) << 58
in (-25, +35) << 58
uint64x3_t ot0 = t0 + timesW(t2 + t1 - acdf.hi - bcef.lo);
uint64x3_t ot1 = t0 + t1 - abde.lo + timesW(t2 - bcef.hi);
uint64x3_t ot2 = t0 + t1 + t2 - abde.hi - acdf.lo + vhi2;
*/
uint64_t *c = cs->limb;
const uint64_t *a = as->limb, *b = bs->limb;
hexad_t ad, be, cf, abde, bcef, acdf;
hexad_mul(&ad, &a[0], &b[0]);
hexad_mul(&be, &a[3], &b[3]);
hexad_mul(&cf, &a[6], &b[6]);
uint64_t amt = 32;
uint64x3_t vhi = { amt*((1ull<<58)-1), amt*((1ull<<58)-1), amt*((1ull<<58)-1), 0 },
nonad_t ad, be, cf, abde, bcef, acdf;
nonad_mul(&ad, &a[0], &b[0]);
nonad_mul(&be, &a[4], &b[4]);
nonad_mul(&cf, &a[8], &b[8]);
uint64_t amt = 26;
uint64x3_t vhi = { amt*((1ull<<58)-1), amt*((1ull<<58)-1), amt*((1ull<<58)-1), 0 },
vhi2 = { 0, 0, -amt<<57, 0 };
uint64x3_t t0 = cf.lo + be.hi, t1 = ad.lo + timesW(cf.hi) + vhi, t2 = ad.hi + be.lo;
int64_t ta[3], tb[3];
// it seems to be faster not to vectorize these loops
for (int i=0; i<3; i++) {
ta[i] = a[i]-a[i+3];
tb[i] = b[i]-b[i+3];
}
hexad_mul_signed(&abde,ta,tb);
for (int i=0; i<3; i++) {
ta[i] = a[i+3]-a[i+6];
tb[i] = b[i+3]-b[i+6];
}
hexad_mul_signed(&bcef,ta,tb);
for (int i=0; i<3; i++) {
ta[i] = a[i]-a[i+6];
tb[i] = b[i]-b[i+6];
}
hexad_mul_signed(&acdf,ta,tb);
uint64x3_t ot0 = t1 + timesW(t0 + t2 - acdf.hi - bcef.lo);
uint64x3_t ot1 = t1 + t2 - abde.lo + timesW(t0 - bcef.hi);
uint64x3_t ot2 = t1 + t2 + t0 - abde.hi - acdf.lo + vhi2;
uint64x3_t out0 = (ot0 & mask58) + timesW(ot2>>58);
uint64x3_t out1 = (ot1 & mask58) + (ot0>>58);
uint64x3_t out2 = (ot2 & mask58) + (ot1>>58);
store3(&c[0], out0);
store3(&c[3], out1);
store3(&c[6], out2);
uint64x3_t t2 = cf.lo + be.hi + ad.hier, t0 = ad.lo + timesW(cf.hi + be.hier) + vhi, t1 = ad.hi + be.lo + timesW(cf.hier);
int64_t ta[4] VECTOR_ALIGNED, tb[4] VECTOR_ALIGNED;
// it seems to be faster not to vectorize these loops
for (i=0; i<3; i++) {
ta[i] = a[i]-a[i+4];
tb[i] = b[i]-b[i+4];
}
hexad_mul_signed(&abde,ta,tb);
for (i=0; i<3; i++) {
ta[i] = a[i+4]-a[i+8];
tb[i] = b[i+4]-b[i+8];
}
hexad_mul_signed(&bcef,ta,tb);
for (i=0; i<3; i++) {
ta[i] = a[i]-a[i+8];
tb[i] = b[i]-b[i+8];
}
hexad_mul_signed(&acdf,ta,tb);
uint64x3_t ot0 = t0 + timesW(t2 + t1 - acdf.hi - bcef.lo);
uint64x3_t ot1 = t0 + t1 - abde.lo + timesW(t2 - bcef.hi);
uint64x3_t ot2 = t0 + t1 + t2 - abde.hi - acdf.lo + vhi2;
uint64x3_t out0 = (ot0 & mask58) + timesW(ot2>>58);
uint64x3_t out1 = (ot1 & mask58) + (ot0>>58);
uint64x3_t out2 = (ot2 & mask58) + (ot1>>58);
*(uint64x4_t *)&c[0] = out0;
*(uint64x4_t *)&c[4] = out1;
*(uint64x4_t *)&c[8] = out2;
}
......@@ -243,48 +258,58 @@ p521_sqr (
p521_t *__restrict__ cs,
const p521_t *as
) {
int i;
#if 0
assert(as->limb[3] == 0 && as->limb[7] == 0 && as->limb[11] == 0);
for (i=0; i<12; i++) {
assert(as->limb[i] < 5ull<<57);
}
#endif
uint64_t *c = cs->limb;
const uint64_t *a = as->limb;
hexad_t ad, be, cf, abde, bcef, acdf;
hexad_sqr(&ad, &a[0]);
hexad_sqr(&be, &a[3]);
hexad_sqr(&cf, &a[6]);
uint64_t amt = 32;
uint64x3_t vhi = { amt*((1ull<<58)-1), amt*((1ull<<58)-1), amt*((1ull<<58)-1), 0 },
nonad_t ad, be, cf, abde, bcef, acdf;
nonad_sqr(&ad, &a[0]);
nonad_sqr(&be, &a[4]);
nonad_sqr(&cf, &a[8]);
uint64_t amt = 26;
uint64x3_t vhi = { amt*((1ull<<58)-1), amt*((1ull<<58)-1), amt*((1ull<<58)-1), 0 },
vhi2 = { 0, 0, -amt<<57, 0 };
uint64x3_t t2 = cf.lo + be.hi + ad.hier, t0 = ad.lo + timesW(cf.hi + be.hier) + vhi, t1 = ad.hi + be.lo + timesW(cf.hier);
int64_t ta[4] VECTOR_ALIGNED;
// it seems to be faster not to vectorize these loops
for (i=0; i<3; i++) {
ta[i] = a[i]-a[i+4];
}
hexad_sqr_signed(&abde,ta);
for (i=0; i<3; i++) {
ta[i] = a[i+4]-a[i+8];
}
hexad_sqr_signed(&bcef,ta);
for (i=0; i<3; i++) {
ta[i] = a[i]-a[i+8];
}
hexad_sqr_signed(&acdf,ta);
uint64x3_t ot0 = t0 + timesW(t2 + t1 - acdf.hi - bcef.lo);
uint64x3_t ot1 = t0 + t1 - abde.lo + timesW(t2 - bcef.hi);
uint64x3_t ot2 = t0 + t1 + t2 - abde.hi - acdf.lo + vhi2;
uint64x3_t out0 = (ot0 & mask58) + timesW(ot2>>58);
uint64x3_t out1 = (ot1 & mask58) + (ot0>>58);
uint64x3_t out2 = (ot2 & mask58) + (ot1>>58);
uint64x3_t t0 = cf.lo + be.hi, t1 = ad.lo + timesW(cf.hi) + vhi, t2 = ad.hi + be.lo;
int64_t ta[3];
// it seems to be faster not to vectorize these loops
for (int i=0; i<3; i++) {
ta[i] = a[i]-a[i+3];
}
hexad_sqr_signed(&abde,ta);
for (int i=0; i<3; i++) {
ta[i] = a[i+3]-a[i+6];
}
hexad_sqr_signed(&bcef,ta);
for (int i=0; i<3; i++) {
ta[i] = a[i]-a[i+6];
}
hexad_sqr_signed(&acdf,ta);
uint64x3_t ot0 = t1 + timesW(t0 + t2 - acdf.hi - bcef.lo);
uint64x3_t ot1 = t1 + t2 - abde.lo + timesW(t0 - bcef.hi);
uint64x3_t ot2 = t1 + t2 + t0 - abde.hi - acdf.lo + vhi2;
uint64x3_t out0 = (ot0 & mask58) + timesW(ot2>>58);
uint64x3_t out1 = (ot1 & mask58) + (ot0>>58);
uint64x3_t out2 = (ot2 & mask58) + (ot1>>58);
store3(&c[0], out0);
store3(&c[3], out1);
store3(&c[6], out2);
*(uint64x4_t *)&c[0] = out0;
*(uint64x4_t *)&c[4] = out1;
*(uint64x4_t *)&c[8] = out2;
}
void
......@@ -293,37 +318,59 @@ p521_mulw (
const p521_t *as,
uint64_t b
) {
#if 0
int i;
assert(as->limb[3] == 0 && as->limb[7] == 0 && as->limb[11] == 0);
for (i=0; i<12; i++) {
assert(as->limb[i] < 1ull<<61);
}
assert(b < 1ull<<61);
#endif
const uint64_t *a = as->limb;
uint64_t *c = cs->limb;
__uint128_t accum0 = 0, accum3 = 0, accum6 = 0;
uint64_t mask = (1ull<<58) - 1;
int i;
for (i=0; i<3; i++) {
accum0 += widemul(b, a[LIMBPERM(i)]);
accum3 += widemul(b, a[LIMBPERM(i+3)]);
accum6 += widemul(b, a[LIMBPERM(i+6)]);
c[LIMBPERM(i)] = accum0 & mask; accum0 >>= 58;
c[LIMBPERM(i+3)] = accum3 & mask; accum3 >>= 58;
if (i==2) {
c[LIMBPERM(i+6)] = accum6 & (mask>>1); accum6 >>= 57;
} else {
c[LIMBPERM(i+6)] = accum6 & mask; accum6 >>= 58;
}
}
uint64_t mask = (1ull<<58) - 1;
accum0 += widemulu(b, a[0]);
accum3 += widemulu(b, a[1]);
accum6 += widemulu(b, a[2]);
c[0] = accum0 & mask; accum0 >>= 58;
c[1] = accum3 & mask; accum3 >>= 58;
c[2] = accum6 & mask; accum6 >>= 58;
accum0 += widemulu(b, a[4]);
accum3 += widemulu(b, a[5]);
accum6 += widemulu(b, a[6]);
c[4] = accum0 & mask; accum0 >>= 58;
c[5] = accum3 & mask; accum3 >>= 58;
c[6] = accum6 & mask; accum6 >>= 58;
accum0 += widemulu(b, a[8]);
accum3 += widemulu(b, a[9]);
accum6 += widemulu(b, a[10]);
c[8] = accum0 & mask; accum0 >>= 58;
c[9] = accum3 & mask; accum3 >>= 58;
c[10] = accum6 & (mask>>1); accum6 >>= 57;
accum0 += c[LIMBPERM(3)];
c[LIMBPERM(3)] = accum0 & mask;
c[LIMBPERM(4)] += accum0 >> 58;
accum0 += c[1];
c[1] = accum0 & mask;
c[5] += accum0 >> 58;
accum3 += c[LIMBPERM(6)];
c[LIMBPERM(6)] = accum3 & mask;
c[LIMBPERM(7)] += accum3 >> 58;
accum3 += c[2];
c[2] = accum3 & mask;
c[6] += accum3 >> 58;
accum6 += c[LIMBPERM(0)];
c[LIMBPERM(0)] = accum6 & mask;
c[LIMBPERM(1)] += accum6 >> 58;
accum6 += c[0];
c[0] = accum6 & mask;
c[4] += accum6 >> 58;
c[3] = c[7] = c[11] = 0;
}
......@@ -366,6 +413,8 @@ p521_strong_reduce (
}
assert(is_zero(carry + scarry));
a->limb[3] = a->limb[7] = a->limb[11] = 0;
}
mask_t
......@@ -377,8 +426,8 @@ p521_is_zero (
p521_strong_reduce(&b);
uint64_t any = 0;
int i;
for (i=0; i<9; i++) {
unsigned int i;
for (i=0; i<sizeof(b)/sizeof(b.limb[0]); i++) {
any |= b.limb[i];
}
return is_zero(any);
......@@ -389,7 +438,7 @@ p521_serialize (
uint8_t *serial,
const struct p521_t *x
) {
int i,k=0;
unsigned int i,k=0;
p521_t red;
p521_copy(&red, x);
p521_strong_reduce(&red);
......@@ -430,10 +479,12 @@ p521_deserialize (
uint64_t and = -1ull;
for (i=0; i<8; i++) {
and &= x->limb[i];
and &= x->limb[LIMBPERM(i)];
}
and &= (2*out+1);
good &= is_zero((and+1)>>58);
x->limb[3] = x->limb[7] = x->limb[11] = 0;
return good;
}
......@@ -9,13 +9,14 @@
#include <string.h>
#include "word.h"
#include "constant_time.h"
#define LIMBPERM(x) (((x)%3)*3 + (x)/3)
#define LIMBPERM(x) (((x)%3)*4 + (x)/3)
#define USE_P521_3x3_TRANSPOSE