Commit 04b955ea authored by Mike Hamburg's avatar Mike Hamburg
Browse files

Added really_memset, thanks David Leon Gil.

Trying to work around an apparent GCC bug on SSE2, thanks Samuel
Neves.

Added an experimental NEON arch.  It's fast.  It's not yet GCC clean.
It needs some more work on general cleanliness too.
parent 5847031d
August 4, 2014:
Experiments and bug fixes.
Add really_memset = memset_s (except not because I'm setting -std=c99),
thanks David Leon Gil. I think I put it in the right places.
Try to work around what I think is a compiler bug in GCC -O3 on non-AVX
platforms. I can't seem to work around it at -Os, so I'm just flagging
a warning (-Werror makes it an error) for now. Will take more
investigation. Thanks Samuel Neves.
Added an experimental (not ready yet!) ARM NEON implementation in
arch_neon_experimental. This implementation seems to work, but needs
more testing. It is currently asm-heavy and not GCC clean. I am
planning to have a flag for it to use intrinsics instead of asm;
currently the intrinsics are commented out. On clang this does ECDH
in 1850kcy on my BeagleBone Black, comparable to Curve41417. Once this
is ready, I will probably move it to arch_neon proper, since arch_neon
isn't particularly tuned.
July 11, 2014:
This is mostly a cleanup release.
......
......@@ -22,7 +22,7 @@ endif
WARNFLAGS = -pedantic -Wall -Wextra -Werror -Wunreachable-code \
-Wmissing-declarations -Wunused-function $(EXWARN)
-Wmissing-declarations -Wunused-function -Wno-overlength-strings $(EXWARN)
INCFLAGS = -Isrc/include -Iinclude -Isrc/$(ARCH)
......@@ -36,8 +36,8 @@ ARCHFLAGS += -mfpu=neon
else
ARCHFLAGS += -mfpu=vfpv3-d16
endif
ARCHFLAGS += -mcpu=cortex-a9 # FIXME
GENFLAGS = -DN_TESTS_BASE=1000 # sooooo sloooooow
ARCHFLAGS += -mcpu=cortex-a8 # FIXME
GENFLAGS += -DN_TESTS_BASE=1000 # sooooo sloooooow
else
ARCHFLAGS += -maes -mavx2 -mbmi2 #TODO
endif
......
......@@ -13,7 +13,7 @@ game protection system out of Stanford, and are (c) 2011 Stanford
University. All of these files are usable under the MIT license contained in
LICENSE.txt.
The Makefile is set for my 2013 MacBook Air. You can `make runbench` to run
The Makefile is set for my 2013 MacBook Air. You can `make bench` to run
a completely arbitrary set of benchmarks and tests, or `make
build/goldilocks.so` to build a stripped-down version of the library. For
non-Haswell platforms, you need to replace -mavx2 -mbmi2 by an appropriate
......
......@@ -39,7 +39,7 @@ xx_vaddup_s64(int64x2_t x) {
#include "neon_emulation.h"
#endif /* ARM_NEON */
static inline void __attribute__((gnu_inline,always_inline))
static inline void __attribute__((gnu_inline,always_inline,unused))
smlal (
uint64_t *acc,
const uint32_t a,
......@@ -48,7 +48,7 @@ smlal (
*acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b;
}
static inline void __attribute__((gnu_inline,always_inline))
static inline void __attribute__((gnu_inline,always_inline,unused))
smlal2 (
uint64_t *acc,
const uint32_t a,
......@@ -57,7 +57,7 @@ smlal2 (
*acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2;
}
static inline void __attribute__((gnu_inline,always_inline))
static inline void __attribute__((gnu_inline,always_inline,unused))
smull (
uint64_t *acc,
const uint32_t a,
......@@ -66,7 +66,7 @@ smull (
*acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b;
}
static inline void __attribute__((gnu_inline,always_inline))
static inline void __attribute__((gnu_inline,always_inline,unused))
smull2 (
uint64_t *acc,
const uint32_t a,
......@@ -84,6 +84,7 @@ p448_mul (
const uint32_t *a = as->limb, *b = bs->limb;
uint32_t *c = cs->limb;
const int32x2_t
*val = (const int32x2_t *)a,
*vbl = (const int32x2_t *)b,
......@@ -109,155 +110,170 @@ p448_mul (
accumx0a = vmull_lane_s32( delta = val[1] + vah[1], vbh[3], 0);
accumx1a = vmull_lane_s32( delta, vbh[3], 1);
accumx2a = vmull_lane_s32( delta = val[2] + vah[2], vbh[3], 0);
accumx3a = vmull_lane_s32( delta, vbh[3], 1);
accumx0a = vmlal_lane_s32(accumx0a, delta, vbh[2], 0);
accumx0a = vmlal_lane_s32(accumx0a, delta = val[2] + vah[2], vbh[2], 0);
accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[2], 1);
accumx2a = vmlal_lane_s32(accumx2a, delta = val[3] + vah[3], vbh[2], 0);
accumx3a = vmlal_lane_s32(accumx3a, delta, vbh[2], 1);
accumx0a = vmlal_lane_s32(accumx0a, delta, vbh[1], 0);
accumx0a = vmlal_lane_s32(accumx0a, delta = val[3] + vah[3], vbh[1], 0);
accumx1a = vmlal_lane_s32(accumx1a, delta, vbh[1], 1);
accumx2b = vmull_lane_s32( delta = val[0] + vah[0], vbh[1], 0);
accumx3b = vmull_lane_s32( delta, vbh[1], 1);
accumx0b = vmull_lane_s32( delta, vbh[0], 0);
accumx0b = vmull_lane_s32( delta = val[0] + vah[0], vbh[0], 0);
accumx1b = vmull_lane_s32( delta, vbh[0], 1);
accumx2b = vmlal_lane_s32(accumx2b, delta = val[1] + vah[1], vbh[0], 0);
accumx3b = vmlal_lane_s32(accumx3b, delta, vbh[0], 1);
accumx0b = vmlal_lane_s32(accumx0b, vah[1], vbl[3], 0);
accumx1b = vmlal_lane_s32(accumx1b, vah[1], vbl[3], 1);
accumx2b = vmlal_lane_s32(accumx2b, vah[2], vbl[3], 0);
accumx3b = vmlal_lane_s32(accumx3b, vah[2], vbl[3], 1);
accumx0b = vmlal_lane_s32(accumx0b, vah[2], vbl[2], 0);
accumx1b = vmlal_lane_s32(accumx1b, vah[2], vbl[2], 1);
accumx2b = vmlal_lane_s32(accumx2b, vah[3], vbl[2], 0);
accumx3b = vmlal_lane_s32(accumx3b, vah[3], vbl[2], 1);
accumx0b = vmlal_lane_s32(accumx0b, vah[3], vbl[1], 0);
accumx1b = vmlal_lane_s32(accumx1b, vah[3], vbl[1], 1);
accumx2b += accumx2a;
accumx3b += accumx3a;
accumx2a = vmlal_lane_s32(accumx2a, vah[0], vbl[1], 0);
accumx3a = vmlal_lane_s32(accumx3a, vah[0], vbl[1], 1);
accumx0b += accumx0a;
accumx1b += accumx1a;
accumx0a = vmlal_lane_s32(accumx0a, vah[0], vbl[0], 0);
accumx1a = vmlal_lane_s32(accumx1a, vah[0], vbl[0], 1);
accumx2a = vmlal_lane_s32(accumx2a, vah[1], vbl[0], 0);
accumx3a = vmlal_lane_s32(accumx3a, vah[1], vbl[0], 1);
accumx0a = vmlal_lane_s32(accumx0a, val[1], delta = vbl[3] - vbh[3], 0);
accumx1a = vmlal_lane_s32(accumx1a, val[1], delta, 1);
accumx2a = vmlal_lane_s32(accumx2a, val[2], delta, 0);
accumx3a = vmlal_lane_s32(accumx3a, val[2], delta, 1);
accumx0a = vmlal_lane_s32(accumx0a, val[2], delta = vbl[2] - vbh[2], 0);
accumx1a = vmlal_lane_s32(accumx1a, val[2], delta, 1);
accumx2a = vmlal_lane_s32(accumx2a, val[3], delta, 0);
accumx3a = vmlal_lane_s32(accumx3a, val[3], delta, 1);
accumx0a = vmlal_lane_s32(accumx0a, val[3], delta = vbl[1] - vbh[1], 0);
accumx1a = vmlal_lane_s32(accumx1a, val[3], delta, 1);
accumx2a += accumx2b;
accumx3a += accumx3b;
accumx2b = vmlal_lane_s32(accumx2b, val[0], delta, 0);
accumx3b = vmlal_lane_s32(accumx3b, val[0], delta, 1);
accumx0a += accumx0b;
accumx1a += accumx1b;
accumx0b = vmlal_lane_s32(accumx0b, val[0], delta = vbl[0] - vbh[0], 0);
accumx1b = vmlal_lane_s32(accumx1b, val[0], delta, 1);
accumx2b = vmlal_lane_s32(accumx2b, val[1], delta, 0);
accumx3b = vmlal_lane_s32(accumx3b, val[1], delta, 1);
xx_vtrnq_s64(&accumx0a, &accumx0b);
xx_vtrnq_s64(&accumx1a, &accumx1b);
xx_vtrnq_s64(&accumx2a, &accumx2b);
xx_vtrnq_s64(&accumx3a, &accumx3b);
accumx0b += accumx1a;
accumx0b = vsraq_n_s64(accumx0b,accumx0a,28);
accumx1b = vsraq_n_s64(accumx1b,accumx0b,28);
trn_res = vtrn_s32(vmovn_s64(accumx0a), vmovn_s64(accumx0b));
vcl[0] = trn_res.val[1] & vmask;
vch[0] = trn_res.val[0] & vmask;
accumx2a = vmull_lane_s32( delta = val[2] + vah[2], vbh[3], 0);
accumx3a = vmull_lane_s32( delta, vbh[3], 1);
accumx2a = vmlal_lane_s32(accumx2a, delta = val[3] + vah[3], vbh[2], 0);
accumx3a = vmlal_lane_s32(accumx3a, delta, vbh[2], 1);
accumx2b = vmull_lane_s32( delta = val[0] + vah[0], vbh[1], 0);
accumx3b = vmull_lane_s32( delta, vbh[1], 1);
accumx2b = vmlal_lane_s32(accumx2b, delta = val[1] + vah[1], vbh[0], 0);
accumx3b = vmlal_lane_s32(accumx3b, delta, vbh[0], 1);
accumx2b = vmlal_lane_s32(accumx2b, vah[2], vbl[3], 0);
accumx3b = vmlal_lane_s32(accumx3b, vah[2], vbl[3], 1);
accumx2b = vmlal_lane_s32(accumx2b, vah[3], vbl[2], 0);
accumx3b = vmlal_lane_s32(accumx3b, vah[3], vbl[2], 1);
accumx2b += accumx2a;
accumx3b += accumx3a;
accumx2a = vmlal_lane_s32(accumx2a, vah[0], vbl[1], 0);
accumx3a = vmlal_lane_s32(accumx3a, vah[0], vbl[1], 1);
accumx2a = vmlal_lane_s32(accumx2a, vah[1], vbl[0], 0);
accumx3a = vmlal_lane_s32(accumx3a, vah[1], vbl[0], 1);
accumx2a = vmlal_lane_s32(accumx2a, val[2], delta = vbl[3] - vbh[3], 0);
accumx3a = vmlal_lane_s32(accumx3a, val[2], delta, 1);
accumx2a = vmlal_lane_s32(accumx2a, val[3], delta = vbl[2] - vbh[2], 0);
accumx3a = vmlal_lane_s32(accumx3a, val[3], delta, 1);
accumx2a += accumx2b;
accumx3a += accumx3b;
accumx2b = vmlal_lane_s32(accumx2b, val[0], delta = vbl[1] - vbh[1], 0);
accumx3b = vmlal_lane_s32(accumx3b, val[0], delta, 1);
accumx2b = vmlal_lane_s32(accumx2b, val[1], delta = vbl[0] - vbh[0], 0);
accumx3b = vmlal_lane_s32(accumx3b, val[1], delta, 1);
xx_vtrnq_s64(&accumx2a, &accumx2b);
xx_vtrnq_s64(&accumx3a, &accumx3b);
accumx2a += accumx1b;
accumx2b += accumx3a;
accumx2b = vsraq_n_s64(accumx2b,accumx2a,28);
accumx3b = vsraq_n_s64(accumx3b,accumx2b,28);
trn_res = vtrn_s32(vmovn_s64(accumx0a), vmovn_s64(accumx0b));
vcl[0] = trn_res.val[1] & vmask;
vch[0] = trn_res.val[0] & vmask;
trn_res = vtrn_s32(vmovn_s64(accumx2a), vmovn_s64(accumx2b));
vcl[1] = trn_res.val[1] & vmask;
vch[1] = trn_res.val[0] & vmask;
carry = accumx3b;
accumx4a = vmull_lane_s32( delta = val[3] + vah[3], vbh[3], 0);
accumx5a = vmull_lane_s32( delta, vbh[3], 1);
accumx6b = vmull_lane_s32( delta = val[0] + vah[0], vbh[3], 0);
accumx7b = vmull_lane_s32( delta, vbh[3], 1);
accumx4b = accumx4a;
accumx5b = accumx5a;
accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[2], 0);
accumx4b = vmlal_lane_s32(accumx4b, delta = val[0] + vah[0], vbh[2], 0);
accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[2], 1);
accumx6b = vmlal_lane_s32(accumx6b, delta = val[1] + vah[1], vbh[2], 0);
accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[2], 1);
accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[1], 0);
accumx4b = vmlal_lane_s32(accumx4b, delta = val[1] + vah[1], vbh[1], 0);
accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[1], 1);
accumx6b = vmlal_lane_s32(accumx6b, delta = val[2] + vah[2], vbh[1], 0);
accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[1], 1);
accumx4b = vmlal_lane_s32(accumx4b, delta, vbh[0], 0);
accumx4b = vmlal_lane_s32(accumx4b, delta = val[2] + vah[2], vbh[0], 0);
accumx5b = vmlal_lane_s32(accumx5b, delta, vbh[0], 1);
accumx6b = vmlal_lane_s32(accumx6b, delta = val[3] + vah[3], vbh[0], 0);
accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[0], 1);
accumx4b = vmlal_lane_s32(accumx4b, vah[3], vbl[3], 0);
accumx5b = vmlal_lane_s32(accumx5b, vah[3], vbl[3], 1);
accumx6a = accumx6b;
accumx7a = accumx7b;
accumx6a = vmlal_lane_s32(accumx6a, vah[0], vbl[3], 0);
accumx7a = vmlal_lane_s32(accumx7a, vah[0], vbl[3], 1);
accumx4a += accumx4b;
accumx5a += accumx5b;
accumx4a = vmlal_lane_s32(accumx4a, vah[0], vbl[2], 0);
accumx5a = vmlal_lane_s32(accumx5a, vah[0], vbl[2], 1);
accumx6a = vmlal_lane_s32(accumx6a, vah[1], vbl[2], 0);
accumx7a = vmlal_lane_s32(accumx7a, vah[1], vbl[2], 1);
accumx4a = vmlal_lane_s32(accumx4a, vah[1], vbl[1], 0);
accumx5a = vmlal_lane_s32(accumx5a, vah[1], vbl[1], 1);
accumx6a = vmlal_lane_s32(accumx6a, vah[2], vbl[1], 0);
accumx7a = vmlal_lane_s32(accumx7a, vah[2], vbl[1], 1);
accumx4a = vmlal_lane_s32(accumx4a, vah[2], vbl[0], 0);
accumx5a = vmlal_lane_s32(accumx5a, vah[2], vbl[0], 1);
accumx6a = vmlal_lane_s32(accumx6a, vah[3], vbl[0], 0);
accumx7a = vmlal_lane_s32(accumx7a, vah[3], vbl[0], 1);
accumx4a = vmlal_lane_s32(accumx4a, val[3], delta = vbl[3] - vbh[3], 0);
accumx5a = vmlal_lane_s32(accumx5a, val[3], delta, 1);
/**/
accumx6b = vmlal_lane_s32(accumx6b, val[0], delta, 0);
accumx7b = vmlal_lane_s32(accumx7b, val[0], delta, 1);
accumx4b = vmlal_lane_s32(accumx4b, val[0], delta = vbl[2] - vbh[2], 0);
accumx5b = vmlal_lane_s32(accumx5b, val[0], delta, 1);
accumx6b = vmlal_lane_s32(accumx6b, val[1], delta, 0);
accumx7b = vmlal_lane_s32(accumx7b, val[1], delta, 1);
accumx4b = vmlal_lane_s32(accumx4b, val[1], delta = vbl[1] - vbh[1], 0);
accumx5b = vmlal_lane_s32(accumx5b, val[1], delta, 1);
accumx6b = vmlal_lane_s32(accumx6b, val[2], delta, 0);
accumx7b = vmlal_lane_s32(accumx7b, val[2], delta, 1);
accumx4b = vmlal_lane_s32(accumx4b, val[2], delta = vbl[0] - vbh[0], 0);
accumx5b = vmlal_lane_s32(accumx5b, val[2], delta, 1);
accumx6b = vmlal_lane_s32(accumx6b, val[3], delta, 0);
accumx7b = vmlal_lane_s32(accumx7b, val[3], delta, 1);
xx_vtrnq_s64(&accumx4a, &accumx4b);
xx_vtrnq_s64(&accumx5a, &accumx5b);
xx_vtrnq_s64(&accumx6a, &accumx6b);
xx_vtrnq_s64(&accumx7a, &accumx7b);
accumx4a += carry;
accumx4b += accumx5a;
accumx4b = vsraq_n_s64(accumx4b,accumx4a,28);
accumx5b = vsraq_n_s64(accumx5b,accumx4b,28);
accumx6a += accumx5b;
accumx6b += accumx7a;
trn_res = vtrn_s32(vmovn_s64(accumx4a), vmovn_s64(accumx4b));
vcl[2] = trn_res.val[1] & vmask;
vch[2] = trn_res.val[0] & vmask;
accumx6b = vmull_lane_s32( delta = val[0] + vah[0], vbh[3], 0);
accumx7b = vmull_lane_s32( delta, vbh[3], 1);
accumx6b = vmlal_lane_s32(accumx6b, delta = val[1] + vah[1], vbh[2], 0);
accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[2], 1);
accumx6b = vmlal_lane_s32(accumx6b, delta = val[2] + vah[2], vbh[1], 0);
accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[1], 1);
accumx6b = vmlal_lane_s32(accumx6b, delta = val[3] + vah[3], vbh[0], 0);
accumx7b = vmlal_lane_s32(accumx7b, delta, vbh[0], 1);
accumx6a = accumx6b;
accumx7a = accumx7b;
accumx6a = vmlal_lane_s32(accumx6a, vah[0], vbl[3], 0);
accumx7a = vmlal_lane_s32(accumx7a, vah[0], vbl[3], 1);
accumx6a = vmlal_lane_s32(accumx6a, vah[1], vbl[2], 0);
accumx7a = vmlal_lane_s32(accumx7a, vah[1], vbl[2], 1);
accumx6a = vmlal_lane_s32(accumx6a, vah[2], vbl[1], 0);
accumx7a = vmlal_lane_s32(accumx7a, vah[2], vbl[1], 1);
accumx6a = vmlal_lane_s32(accumx6a, vah[3], vbl[0], 0);
accumx7a = vmlal_lane_s32(accumx7a, vah[3], vbl[0], 1);
/**/
accumx6b = vmlal_lane_s32(accumx6b, val[0], delta = vbl[3] - vbh[3], 0);
accumx7b = vmlal_lane_s32(accumx7b, val[0], delta, 1);
accumx6b = vmlal_lane_s32(accumx6b, val[1], delta = vbl[2] - vbh[2], 0);
accumx7b = vmlal_lane_s32(accumx7b, val[1], delta, 1);
accumx6b = vmlal_lane_s32(accumx6b, val[2], delta = vbl[1] - vbh[1], 0);
accumx7b = vmlal_lane_s32(accumx7b, val[2], delta, 1);
accumx6b = vmlal_lane_s32(accumx6b, val[3], delta = vbl[0] - vbh[0], 0);
accumx7b = vmlal_lane_s32(accumx7b, val[3], delta, 1);
xx_vtrnq_s64(&accumx6a, &accumx6b);
xx_vtrnq_s64(&accumx7a, &accumx7b);
accumx6a += accumx5b;
accumx6b += accumx7a;
accumx6b = vsraq_n_s64(accumx6b,accumx6a,28);
accumx7b = vsraq_n_s64(accumx7b,accumx6b,28);
trn_res = vtrn_s32(vmovn_s64(accumx6a), vmovn_s64(accumx6b));
vcl[3] = trn_res.val[1] & vmask;
vch[3] = trn_res.val[0] & vmask;
accumx7b = xx_vaddup_s64(accumx7b);
int32x2_t t0 = vcl[0], t1 = vch[0];
......
This diff is collapsed.
This diff is collapsed.
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/
#ifndef __P448_H__
#define __P448_H__ 1
#include "word.h"
#include <stdint.h>
#include <assert.h>
/** A p448 field element: 16 limbs of (nominally) 28 bits each, held in
 * the permuted order given by LIMBPERM (this header is the NEON arch's
 * layout — see USE_NEON_PERM).  32-byte alignment allows full-width
 * vector loads and stores. */
typedef struct p448_t {
uint32_t limb[16];
} __attribute__((aligned(32))) p448_t;
/* Maps logical limb index x (0..15) to its physical position in limb[]:
 * ((x<<1 | x>>3) & 15) sends logical 0..7 to the even slots and logical
 * 8..15 to the odd slots, interleaving the two halves for vector lanes. */
#define LIMBPERM(x) (((x)<<1 | (x)>>3) & 15)
/* Signals generic code that limbs are stored in permuted order. */
#define USE_NEON_PERM 1
#ifdef __cplusplus
extern "C" {
#endif
/** Set *out to the scalar x (at most 56 significant bits).  Limbs are
 * stored in LIMBPERM order, so x lands in physical limbs 0 and 2. */
static __inline__ void
p448_set_ui (
p448_t *out,
uint64_t x
) __attribute__((unused,always_inline));

/** Constant-time conditional swap: exchanges *a and *b when do_swap is
 * the all-ones mask, leaves both unchanged when it is zero. */
static __inline__ void
p448_cond_swap (
p448_t *a,
p448_t *b,
mask_t do_swap
) __attribute__((unused,always_inline));

/** out = a + b, limbwise; no carry propagation, so the result may be
 * unreduced. */
static __inline__ void
p448_add (
p448_t *out,
const p448_t *a,
const p448_t *b
) __attribute__((unused,always_inline));

/** out = a - b, limbwise; no borrow propagation.  NOTE(review): callers
 * presumably bias a first (see p448_bias) so limbs cannot underflow —
 * confirm against call sites. */
static __inline__ void
p448_sub (
p448_t *out,
const p448_t *a,
const p448_t *b
) __attribute__((unused,always_inline));

/** out = -a, limbwise negation; result is unreduced. */
static __inline__ void
p448_neg (
p448_t *out,
const p448_t *a
) __attribute__((unused,always_inline));

/** Negate *a in place iff doNegate is the all-ones mask (constant
 * time). */
static __inline__ void
p448_cond_neg (
p448_t *a,
mask_t doNegate
) __attribute__((unused,always_inline));

/** Add the word x into *a.  NOTE(review): presumably into the low limb
 * only — confirm against the implementation. */
static __inline__ void
p448_addw (
p448_t *a,
uint32_t x
) __attribute__((unused,always_inline));

/** Subtract the word x from *a (see the p448_addw note). */
static __inline__ void
p448_subw (
p448_t *a,
uint32_t x
) __attribute__((unused,always_inline));

/** Copy: *out = *a. */
static __inline__ void
p448_copy (
p448_t *out,
const p448_t *a
) __attribute__((unused,always_inline));

/** Partially reduce *inout in place: bring limbs back under their
 * radix without computing the canonical representative. */
static __inline__ void
p448_weak_reduce (
p448_t *inout
) __attribute__((unused,always_inline));

/** Fully reduce *inout to its canonical representative.  Out-of-line
 * (defined in the arch's .c file). */
void
p448_strong_reduce (
p448_t *inout
);

/** Returns the all-ones mask if in represents zero, else zero. */
mask_t
p448_is_zero (
const p448_t *in
);

/** Bias *inout by `amount` (add a multiple of the modulus) so that a
 * following subtraction cannot underflow. */
static __inline__ void
p448_bias (
p448_t *inout,
int amount
) __attribute__((unused,always_inline));

/** Field multiply: out = a * b.  out must not alias a or b
 * (__restrict__). */
void
p448_mul (
p448_t *__restrict__ out,
const p448_t *a,
const p448_t *b
);

/** Multiply by a 64-bit scalar: out = a * b.  out must not alias a. */
void
p448_mulw (
p448_t *__restrict__ out,
const p448_t *a,
uint64_t b
);

/** Field square: out = a^2.  out must not alias a. */
void
p448_sqr (
p448_t *__restrict__ out,
const p448_t *a
);

/** n successive squarings: y = x^(2^n).  y must not alias x. */
static __inline__ void
p448_sqrn (
p448_t *__restrict__ y,
const p448_t *x,
int n
) __attribute__((unused,always_inline));

/** Serialize x into its 56-byte wire form (cf. p448_deserialize's
 * serial[56]). */
void
p448_serialize (
uint8_t *serial,
const struct p448_t *x
);

/** Parse 56 bytes into x.  Returns a mask — presumably all-ones on
 * success, zero for an out-of-range encoding; confirm in the .c file. */
mask_t
p448_deserialize (
p448_t *x,
const uint8_t serial[56]
);

/** a = b masked by `mask` (constant-time select helper — confirm exact
 * semantics in the implementation). */
static __inline__ void
p448_mask(
struct p448_t *a,
const struct p448_t *b,
mask_t mask
) __attribute__((unused,always_inline));

/**
 * Returns 1/x.
 *
 * If x=0, returns 0.
 */
void
p448_inverse (
struct p448_t* a,
const struct p448_t* x
);

/** Invert n field elements at once, amortizing the cost of a single
 * inversion (presumably Montgomery's trick — confirm in the .c file). */
void
simultaneous_invert_p448 (
struct p448_t *__restrict__ out,
const struct p448_t *in,
unsigned int n
);

/** Returns the all-ones mask if a == b as field elements, else zero. */
static inline mask_t
p448_eq (
const struct p448_t *a,
const struct p448_t *b
) __attribute__((always_inline,unused));
/* -------------- Inline functions begin here -------------- */
/*
 * Set *out to the scalar x (at most 56 significant bits).
 * Limbs are stored in LIMBPERM order: the bits above 28 go into
 * physical limb 2, which is logical limb 1 under the permutation.
 */
void
p448_set_ui (
p448_t *out,
uint64_t x
) {
    /* Clear every limb first. */
    unsigned int k = 0;
    while (k < 16) {
        out->limb[k] = 0;
        k++;
    }
    /* Low 28 bits in physical limb 0, the rest in physical limb 2. */
    out->limb[0] = (uint32_t)(x & ((1<<28)-1));
    out->limb[2] = (uint32_t)(x>>28);
}
/*
 * Constant-time conditional swap of *a and *b, controlled by doswap.
 * The same instruction sequence executes regardless of doswap's value,
 * so no secret-dependent branch or memory access occurs.
 */
void
p448_cond_swap (
p448_t *a,
p448_t *b,
mask_t doswap
) {
/* br_set_to_mask presumably expands doswap into an all-zeros or
 * all-ones big_register_t — defined elsewhere; confirm. */
big_register_t *aa = (big_register_t*)a;
big_register_t *bb = (big_register_t*)b;
big_register_t m = br_set_to_mask(doswap);
unsigned int i;
for (i=0; i<sizeof(*a)/sizeof(*aa); i++) {
/* XOR-swap: x is (aa^bb) when m is all-ones, 0 when m is zero. */
big_register_t x = m & (aa[i]^bb[i]);
aa[i] ^= x;
bb[i] ^= x;
}
}
/*
 * out = a + b, limbwise, using wide vector lanes (uint32xn_t).
 * No carry propagation is performed; the result may be unreduced.
 */
void
p448_add (
p448_t *out,
const p448_t *a,
const p448_t *b
) {
    uint32xn_t *vout = (uint32xn_t *)out;
    const uint32xn_t *va = (const uint32xn_t *)a;
    const uint32xn_t *vb = (const uint32xn_t *)b;
    unsigned int k;
    for (k = 0; k < sizeof(*out)/sizeof(uint32xn_t); k++) {
        vout[k] = va[k] + vb[k];
    }
}
/*
 * out = a - b, limbwise, using wide vector lanes (uint32xn_t).
 * No borrow propagation is performed.  NOTE(review): callers
 * presumably bias a first (see p448_bias) so limbs cannot
 * underflow — confirm at call sites.
 */
void
p448_sub (
p448_t *out,
const p448_t *a,
const p448_t *b
) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
        ((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] - ((const uint32xn_t*)b)[i];
    }
    /* Dead commented-out scalar fallback removed; the vector loop is
     * the sole implementation. */
}
/*
 * out = -a, limbwise negation using wide vector lanes (uint32xn_t).
 * The result is unreduced; limbs wrap modulo 2^32, so callers must
 * bias/reduce appropriately afterwards.
 */
void
p448_neg (
p448_t *out,
const p448_t *a
) {
    unsigned int i;
    for (i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
        ((uint32xn_t*)out)[i] = -((const uint32xn_t*)a)[i];
    }
    /* Dead commented-out scalar fallback removed; the vector loop is
     * the sole implementation. */
}
void
p448_cond_neg(