Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OTRv4
libgoldilocks
Commits
a9c72b5a
Commit
a9c72b5a
authored
Sep 18, 2014
by
Mike Hamburg
Browse files
Begin ref impl, currently an arch option (arch_ref64).
parent
4ca664e6
Changes
6
Expand all
Hide whitespace changes
Inline
Side-by-side
HISTORY.txt
View file @
a9c72b5a
September 18, 2014:
Begin work on a "ref" implementation. Currently this is just the
arch_ref64 architecture. The ref implementation always weak_reduces
after arithmetic, and doesn't use vectors or other hackery. Currently
it still must declare field elements as vector aligned, though,
other code outside the arch directory can be vectorized.
Change goldilocks.c to use field_eq instead of calling deep into field
apis.
September 6, 2014:
Pull in minor changes from David Leon Gil and Nicholas Wilson, with
some adjustments. I hope the adjustments don't break their compiles.
...
...
src/arch_ref64/ec_point.c
0 → 100644
View file @
a9c72b5a
This diff is collapsed.
Click to expand it.
src/arch_ref64/p448.c
0 → 100644
View file @
a9c72b5a
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/
#include "p448.h"
static
__inline__
__uint128_t
widemul
(
const
uint64_t
a
,
const
uint64_t
b
)
{
return
((
__uint128_t
)
a
)
*
((
__uint128_t
)
b
);
}
static
__inline__
uint64_t
is_zero
(
uint64_t
a
)
{
/* let's hope the compiler isn't clever enough to optimize this. */
return
(((
__uint128_t
)
a
)
-
1
)
>>
64
;
}
void
p448_mul
(
p448_t
*
__restrict__
cs
,
const
p448_t
*
as
,
const
p448_t
*
bs
)
{
const
uint64_t
*
a
=
as
->
limb
,
*
b
=
bs
->
limb
;
uint64_t
*
c
=
cs
->
limb
;
__uint128_t
accum0
=
0
,
accum1
=
0
,
accum2
;
uint64_t
mask
=
(
1ull
<<
56
)
-
1
;
uint64_t
aa
[
4
],
bb
[
4
],
bbb
[
4
];
unsigned
int
i
;
for
(
i
=
0
;
i
<
4
;
i
++
)
{
aa
[
i
]
=
a
[
i
]
+
a
[
i
+
4
];
bb
[
i
]
=
b
[
i
]
+
b
[
i
+
4
];
bbb
[
i
]
=
bb
[
i
]
+
b
[
i
+
4
];
}
int
I_HATE_UNROLLED_LOOPS
=
0
;
if
(
I_HATE_UNROLLED_LOOPS
)
{
/* The compiler probably won't unroll this,
* so it's like 80% slower.
*/
for
(
i
=
0
;
i
<
4
;
i
++
)
{
accum2
=
0
;
unsigned
int
j
;
for
(
j
=
0
;
j
<=
i
;
j
++
)
{
accum2
+=
widemul
(
a
[
j
],
b
[
i
-
j
]);
accum1
+=
widemul
(
aa
[
j
],
bb
[
i
-
j
]);
accum0
+=
widemul
(
a
[
j
+
4
],
b
[
i
-
j
+
4
]);
}
for
(;
j
<
4
;
j
++
)
{
accum2
+=
widemul
(
a
[
j
],
b
[
i
-
j
+
8
]);
accum1
+=
widemul
(
aa
[
j
],
bbb
[
i
-
j
+
4
]);
accum0
+=
widemul
(
a
[
j
+
4
],
bb
[
i
-
j
+
4
]);
}
accum1
-=
accum2
;
accum0
+=
accum2
;
c
[
i
]
=
((
uint64_t
)(
accum0
))
&
mask
;
c
[
i
+
4
]
=
((
uint64_t
)(
accum1
))
&
mask
;
accum0
>>=
56
;
accum1
>>=
56
;
}
}
else
{
accum2
=
widemul
(
a
[
0
],
b
[
0
]);
accum1
+=
widemul
(
aa
[
0
],
bb
[
0
]);
accum0
+=
widemul
(
a
[
4
],
b
[
4
]);
accum2
+=
widemul
(
a
[
1
],
b
[
7
]);
accum1
+=
widemul
(
aa
[
1
],
bbb
[
3
]);
accum0
+=
widemul
(
a
[
5
],
bb
[
3
]);
accum2
+=
widemul
(
a
[
2
],
b
[
6
]);
accum1
+=
widemul
(
aa
[
2
],
bbb
[
2
]);
accum0
+=
widemul
(
a
[
6
],
bb
[
2
]);
accum2
+=
widemul
(
a
[
3
],
b
[
5
]);
accum1
+=
widemul
(
aa
[
3
],
bbb
[
1
]);
accum0
+=
widemul
(
a
[
7
],
bb
[
1
]);
accum1
-=
accum2
;
accum0
+=
accum2
;
c
[
0
]
=
((
uint64_t
)(
accum0
))
&
mask
;
c
[
4
]
=
((
uint64_t
)(
accum1
))
&
mask
;
accum0
>>=
56
;
accum1
>>=
56
;
accum2
=
widemul
(
a
[
0
],
b
[
1
]);
accum1
+=
widemul
(
aa
[
0
],
bb
[
1
]);
accum0
+=
widemul
(
a
[
4
],
b
[
5
]);
accum2
+=
widemul
(
a
[
1
],
b
[
0
]);
accum1
+=
widemul
(
aa
[
1
],
bb
[
0
]);
accum0
+=
widemul
(
a
[
5
],
b
[
4
]);
accum2
+=
widemul
(
a
[
2
],
b
[
7
]);
accum1
+=
widemul
(
aa
[
2
],
bbb
[
3
]);
accum0
+=
widemul
(
a
[
6
],
bb
[
3
]);
accum2
+=
widemul
(
a
[
3
],
b
[
6
]);
accum1
+=
widemul
(
aa
[
3
],
bbb
[
2
]);
accum0
+=
widemul
(
a
[
7
],
bb
[
2
]);
accum1
-=
accum2
;
accum0
+=
accum2
;
c
[
1
]
=
((
uint64_t
)(
accum0
))
&
mask
;
c
[
5
]
=
((
uint64_t
)(
accum1
))
&
mask
;
accum0
>>=
56
;
accum1
>>=
56
;
accum2
=
widemul
(
a
[
0
],
b
[
2
]);
accum1
+=
widemul
(
aa
[
0
],
bb
[
2
]);
accum0
+=
widemul
(
a
[
4
],
b
[
6
]);
accum2
+=
widemul
(
a
[
1
],
b
[
1
]);
accum1
+=
widemul
(
aa
[
1
],
bb
[
1
]);
accum0
+=
widemul
(
a
[
5
],
b
[
5
]);
accum2
+=
widemul
(
a
[
2
],
b
[
0
]);
accum1
+=
widemul
(
aa
[
2
],
bb
[
0
]);
accum0
+=
widemul
(
a
[
6
],
b
[
4
]);
accum2
+=
widemul
(
a
[
3
],
b
[
7
]);
accum1
+=
widemul
(
aa
[
3
],
bbb
[
3
]);
accum0
+=
widemul
(
a
[
7
],
bb
[
3
]);
accum1
-=
accum2
;
accum0
+=
accum2
;
c
[
2
]
=
((
uint64_t
)(
accum0
))
&
mask
;
c
[
6
]
=
((
uint64_t
)(
accum1
))
&
mask
;
accum0
>>=
56
;
accum1
>>=
56
;
accum2
=
widemul
(
a
[
0
],
b
[
3
]);
accum1
+=
widemul
(
aa
[
0
],
bb
[
3
]);
accum0
+=
widemul
(
a
[
4
],
b
[
7
]);
accum2
+=
widemul
(
a
[
1
],
b
[
2
]);
accum1
+=
widemul
(
aa
[
1
],
bb
[
2
]);
accum0
+=
widemul
(
a
[
5
],
b
[
6
]);
accum2
+=
widemul
(
a
[
2
],
b
[
1
]);
accum1
+=
widemul
(
aa
[
2
],
bb
[
1
]);
accum0
+=
widemul
(
a
[
6
],
b
[
5
]);
accum2
+=
widemul
(
a
[
3
],
b
[
0
]);
accum1
+=
widemul
(
aa
[
3
],
bb
[
0
]);
accum0
+=
widemul
(
a
[
7
],
b
[
4
]);
accum1
-=
accum2
;
accum0
+=
accum2
;
c
[
3
]
=
((
uint64_t
)(
accum0
))
&
mask
;
c
[
7
]
=
((
uint64_t
)(
accum1
))
&
mask
;
accum0
>>=
56
;
accum1
>>=
56
;
}
/* !I_HATE_UNROLLED_LOOPS */
accum0
+=
accum1
;
accum0
+=
c
[
4
];
accum1
+=
c
[
0
];
c
[
4
]
=
((
uint64_t
)(
accum0
))
&
mask
;
c
[
0
]
=
((
uint64_t
)(
accum1
))
&
mask
;
accum0
>>=
56
;
accum1
>>=
56
;
c
[
5
]
+=
((
uint64_t
)(
accum0
));
c
[
1
]
+=
((
uint64_t
)(
accum1
));
}
void
p448_mulw
(
p448_t
*
__restrict__
cs
,
const
p448_t
*
as
,
uint64_t
b
)
{
const
uint64_t
*
a
=
as
->
limb
;
uint64_t
*
c
=
cs
->
limb
;
__uint128_t
accum0
=
0
,
accum4
=
0
;
uint64_t
mask
=
(
1ull
<<
56
)
-
1
;
int
i
;
for
(
i
=
0
;
i
<
4
;
i
++
)
{
accum0
+=
widemul
(
b
,
a
[
i
]);
accum4
+=
widemul
(
b
,
a
[
i
+
4
]);
c
[
i
]
=
accum0
&
mask
;
accum0
>>=
56
;
c
[
i
+
4
]
=
accum4
&
mask
;
accum4
>>=
56
;
}
accum0
+=
accum4
+
c
[
4
];
c
[
4
]
=
accum0
&
mask
;
c
[
5
]
+=
accum0
>>
56
;
accum4
+=
c
[
0
];
c
[
0
]
=
accum4
&
mask
;
c
[
1
]
+=
accum4
>>
56
;
}
void
p448_sqr
(
p448_t
*
__restrict__
cs
,
const
p448_t
*
as
)
{
const
uint64_t
*
a
=
as
->
limb
;
uint64_t
*
c
=
cs
->
limb
;
__uint128_t
accum0
=
0
,
accum1
=
0
,
accum2
;
uint64_t
mask
=
(
1ull
<<
56
)
-
1
;
uint64_t
aa
[
4
];
/* For some reason clang doesn't vectorize this without prompting? */
unsigned
int
i
;
for
(
i
=
0
;
i
<
4
;
i
++
)
{
aa
[
i
]
=
a
[
i
]
+
a
[
i
+
4
];
}
accum2
=
widemul
(
a
[
0
],
a
[
3
]);
accum0
=
widemul
(
aa
[
0
],
aa
[
3
]);
accum1
=
widemul
(
a
[
4
],
a
[
7
]);
accum2
+=
widemul
(
a
[
1
],
a
[
2
]);
accum0
+=
widemul
(
aa
[
1
],
aa
[
2
]);
accum1
+=
widemul
(
a
[
5
],
a
[
6
]);
accum0
-=
accum2
;
accum1
+=
accum2
;
c
[
3
]
=
((
uint64_t
)(
accum1
))
<<
1
&
mask
;
c
[
7
]
=
((
uint64_t
)(
accum0
))
<<
1
&
mask
;
accum0
>>=
55
;
accum1
>>=
55
;
accum0
+=
widemul
(
2
*
aa
[
1
],
aa
[
3
]);
accum1
+=
widemul
(
2
*
a
[
5
],
a
[
7
]);
accum0
+=
widemul
(
aa
[
2
],
aa
[
2
]);
accum1
+=
accum0
;
accum0
-=
widemul
(
2
*
a
[
1
],
a
[
3
]);
accum1
+=
widemul
(
a
[
6
],
a
[
6
]);
accum2
=
widemul
(
a
[
0
],
a
[
0
]);
accum1
-=
accum2
;
accum0
+=
accum2
;
accum0
-=
widemul
(
a
[
2
],
a
[
2
]);
accum1
+=
widemul
(
aa
[
0
],
aa
[
0
]);
accum0
+=
widemul
(
a
[
4
],
a
[
4
]);
c
[
0
]
=
((
uint64_t
)(
accum0
))
&
mask
;
c
[
4
]
=
((
uint64_t
)(
accum1
))
&
mask
;
accum0
>>=
56
;
accum1
>>=
56
;
accum2
=
widemul
(
2
*
aa
[
2
],
aa
[
3
]);
accum0
-=
widemul
(
2
*
a
[
2
],
a
[
3
]);
accum1
+=
widemul
(
2
*
a
[
6
],
a
[
7
]);
accum1
+=
accum2
;
accum0
+=
accum2
;
accum2
=
widemul
(
2
*
a
[
0
],
a
[
1
]);
accum1
+=
widemul
(
2
*
aa
[
0
],
aa
[
1
]);
accum0
+=
widemul
(
2
*
a
[
4
],
a
[
5
]);
accum1
-=
accum2
;
accum0
+=
accum2
;
c
[
1
]
=
((
uint64_t
)(
accum0
))
&
mask
;
c
[
5
]
=
((
uint64_t
)(
accum1
))
&
mask
;
accum0
>>=
56
;
accum1
>>=
56
;
accum2
=
widemul
(
aa
[
3
],
aa
[
3
]);
accum0
-=
widemul
(
a
[
3
],
a
[
3
]);
accum1
+=
widemul
(
a
[
7
],
a
[
7
]);
accum1
+=
accum2
;
accum0
+=
accum2
;
accum2
=
widemul
(
2
*
a
[
0
],
a
[
2
]);
accum1
+=
widemul
(
2
*
aa
[
0
],
aa
[
2
]);
accum0
+=
widemul
(
2
*
a
[
4
],
a
[
6
]);
accum2
+=
widemul
(
a
[
1
],
a
[
1
]);
accum1
+=
widemul
(
aa
[
1
],
aa
[
1
]);
accum0
+=
widemul
(
a
[
5
],
a
[
5
]);
accum1
-=
accum2
;
accum0
+=
accum2
;
c
[
2
]
=
((
uint64_t
)(
accum0
))
&
mask
;
c
[
6
]
=
((
uint64_t
)(
accum1
))
&
mask
;
accum0
>>=
56
;
accum1
>>=
56
;
accum0
+=
c
[
3
];
accum1
+=
c
[
7
];
c
[
3
]
=
((
uint64_t
)(
accum0
))
&
mask
;
c
[
7
]
=
((
uint64_t
)(
accum1
))
&
mask
;
/* we could almost stop here, but it wouldn't be stable, so... */
accum0
>>=
56
;
accum1
>>=
56
;
c
[
4
]
+=
((
uint64_t
)(
accum0
))
+
((
uint64_t
)(
accum1
));
c
[
0
]
+=
((
uint64_t
)(
accum1
));
}
void
p448_strong_reduce
(
p448_t
*
a
)
{
uint64_t
mask
=
(
1ull
<<
56
)
-
1
;
/* first, clear high */
a
->
limb
[
4
]
+=
a
->
limb
[
7
]
>>
56
;
a
->
limb
[
0
]
+=
a
->
limb
[
7
]
>>
56
;
a
->
limb
[
7
]
&=
mask
;
/* now the total is less than 2^448 - 2^(448-56) + 2^(448-56+8) < 2p */
/* compute total_value - p. No need to reduce mod p. */
__int128_t
scarry
=
0
;
int
i
;
for
(
i
=
0
;
i
<
8
;
i
++
)
{
scarry
=
scarry
+
a
->
limb
[
i
]
-
((
i
==
4
)
?
mask
-
1
:
mask
);
a
->
limb
[
i
]
=
scarry
&
mask
;
scarry
>>=
56
;
}
/* uncommon case: it was >= p, so now scarry = 0 and this = x
* common case: it was < p, so now scarry = -1 and this = x - p + 2^448
* so let's add back in p. will carry back off the top for 2^448.
*/
assert
(
is_zero
(
scarry
)
|
is_zero
(
scarry
+
1
));
uint64_t
scarry_mask
=
scarry
&
mask
;
__uint128_t
carry
=
0
;
/* add it back */
for
(
i
=
0
;
i
<
8
;
i
++
)
{
carry
=
carry
+
a
->
limb
[
i
]
+
((
i
==
4
)
?
(
scarry_mask
&~
1
)
:
scarry_mask
);
a
->
limb
[
i
]
=
carry
&
mask
;
carry
>>=
56
;
}
assert
(
is_zero
(
carry
+
scarry
));
}
mask_t
p448_is_zero
(
const
struct
p448_t
*
a
)
{
struct
p448_t
b
;
p448_copy
(
&
b
,
a
);
p448_strong_reduce
(
&
b
);
uint64_t
any
=
0
;
int
i
;
for
(
i
=
0
;
i
<
8
;
i
++
)
{
any
|=
b
.
limb
[
i
];
}
return
is_zero
(
any
);
}
void
p448_serialize
(
uint8_t
*
serial
,
const
struct
p448_t
*
x
)
{
int
i
,
j
;
p448_t
red
;
p448_copy
(
&
red
,
x
);
p448_strong_reduce
(
&
red
);
for
(
i
=
0
;
i
<
8
;
i
++
)
{
for
(
j
=
0
;
j
<
7
;
j
++
)
{
serial
[
7
*
i
+
j
]
=
red
.
limb
[
i
];
red
.
limb
[
i
]
>>=
8
;
}
assert
(
red
.
limb
[
i
]
==
0
);
}
}
mask_t
p448_deserialize
(
p448_t
*
x
,
const
uint8_t
serial
[
56
]
)
{
int
i
,
j
;
for
(
i
=
0
;
i
<
8
;
i
++
)
{
uint64_t
out
=
0
;
for
(
j
=
0
;
j
<
7
;
j
++
)
{
out
|=
((
uint64_t
)
serial
[
7
*
i
+
j
])
<<
(
8
*
j
);
}
x
->
limb
[
i
]
=
out
;
}
/* Check for reduction.
*
* The idea is to create a variable ge which is all ones (rather, 56 ones)
* if and only if the low $i$ words of $x$ are >= those of p.
*
* Remember p = little_endian(1111,1111,1111,1111,1110,1111,1111,1111)
*/
uint64_t
ge
=
-
1
,
mask
=
(
1ull
<<
56
)
-
1
;
for
(
i
=
0
;
i
<
4
;
i
++
)
{
ge
&=
x
->
limb
[
i
];
}
/* At this point, ge = 1111 iff bottom are all 1111. Now propagate if 1110, or set if 1111 */
ge
=
(
ge
&
(
x
->
limb
[
4
]
+
1
))
|
is_zero
(
x
->
limb
[
4
]
^
mask
);
/* Propagate the rest */
for
(
i
=
5
;
i
<
8
;
i
++
)
{
ge
&=
x
->
limb
[
i
];
}
return
~
is_zero
(
ge
^
mask
);
}
void
simultaneous_invert_p448
(
struct
p448_t
*
__restrict__
out
,
const
struct
p448_t
*
in
,
unsigned
int
n
)
{
if
(
n
==
0
)
{
return
;
}
else
if
(
n
==
1
)
{
p448_inverse
(
out
,
in
);
return
;
}
p448_copy
(
&
out
[
1
],
&
in
[
0
]);
int
i
;
for
(
i
=
1
;
i
<
(
int
)
(
n
-
1
);
i
++
)
{
p448_mul
(
&
out
[
i
+
1
],
&
out
[
i
],
&
in
[
i
]);
}
p448_mul
(
&
out
[
0
],
&
out
[
n
-
1
],
&
in
[
n
-
1
]);
struct
p448_t
tmp
;
p448_inverse
(
&
tmp
,
&
out
[
0
]);
p448_copy
(
&
out
[
0
],
&
tmp
);
/* at this point, out[0] = product(in[i]) ^ -1
* out[i] = product(in[0]..in[i-1]) if i != 0
*/
for
(
i
=
n
-
1
;
i
>
0
;
i
--
)
{
p448_mul
(
&
tmp
,
&
out
[
i
],
&
out
[
0
]);
p448_copy
(
&
out
[
i
],
&
tmp
);
p448_mul
(
&
tmp
,
&
out
[
0
],
&
in
[
i
]);
p448_copy
(
&
out
[
0
],
&
tmp
);
}
}
src/arch_ref64/p448.h
0 → 100644
View file @
a9c72b5a
/* Copyright (c) 2014 Cryptography Research, Inc.
* Released under the MIT License. See LICENSE.txt for license information.
*/
#ifndef __P448_H__
#define __P448_H__ 1
#include <stdint.h>
#include <assert.h>
#include <string.h>
#include "word.h"
typedef
struct
p448_t
{
uint64_t
limb
[
8
];
}
__attribute__
((
aligned
(
32
)))
p448_t
;
#ifdef __cplusplus
extern
"C"
{
#endif
static
__inline__
void
p448_set_ui
(
p448_t
*
out
,
uint64_t
x
)
__attribute__
((
unused
));
static
__inline__
void
p448_cond_swap
(
p448_t
*
a
,
p448_t
*
b
,
mask_t
do_swap
)
__attribute__
((
unused
));
static
__inline__
void
p448_add
(
p448_t
*
out
,
const
p448_t
*
a
,
const
p448_t
*
b
)
__attribute__
((
unused
));
static
__inline__
void
p448_sub
(
p448_t
*
out
,
const
p448_t
*
a
,
const
p448_t
*
b
)
__attribute__
((
unused
));
static
__inline__
void
p448_neg
(
p448_t
*
out
,
const
p448_t
*
a
)
__attribute__
((
unused
));
static
__inline__
void
p448_cond_neg
(
p448_t
*
a
,
mask_t
doNegate
)
__attribute__
((
unused
));
static
__inline__
void
p448_addw
(
p448_t
*
a
,
uint64_t
x
)
__attribute__
((
unused
));
static
__inline__
void
p448_subw
(
p448_t
*
a
,
uint64_t
x
)
__attribute__
((
unused
));
static
__inline__
void
p448_copy
(
p448_t
*
out
,
const
p448_t
*
a
)
__attribute__
((
unused
));
static
__inline__
void
p448_weak_reduce
(
p448_t
*
inout
)
__attribute__
((
unused
));
void
p448_strong_reduce
(
p448_t
*
inout
);
mask_t
p448_is_zero
(
const
p448_t
*
in
);
static
__inline__
void
p448_bias
(
p448_t
*
inout
,
int
amount
)
__attribute__
((
unused
));
static
__inline__
void
p448_really_bias
(
p448_t
*
inout
,
int
amount
)
__attribute__
((
unused
));
void
p448_mul
(
p448_t
*
__restrict__
out
,
const
p448_t
*
a
,
const
p448_t
*
b
);
void
p448_mulw
(
p448_t
*
__restrict__
out
,
const
p448_t
*
a
,
uint64_t
b
);
void