-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfk20_poly2toeplitz_coefficients.cu
60 lines (46 loc) · 1.79 KB
/
fk20_poly2toeplitz_coefficients.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
// bls12_381: Arithmetic for BLS12-381
// Copyright 2022-2023 Dag Arne Osvik
// Copyright 2022-2023 Luan Cardoso dos Santos
#include <cassert>
#include <cstddef>
#include "fr.cuh"
#include "g1.cuh"
#include "fk20.cuh"
/**
 * @brief polynomial -> toeplitz_coefficients
 *
 * Each block converts one row. For every i in [0, 16), the block fills the
 * 512-element segment toeplitz_coefficients[512*i .. 512*i + 511] of its row:
 *   - index 0          <- polynomial[255*16 + 15 - i]   (written by thread 255)
 *   - indices 1..257   <- zero
 *   - indices 258..511 <- polynomial[tid*16 + 15 - i]   for tid = 1..254
 * The first 16 elements of each polynomial row are never read (thread 0 only
 * writes a zero).
 *
 * @param[out] toeplitz_coefficients array with dimensions [gridDim.x * 16 * 512]
 * @param[in] polynomial array with dimensions [gridDim.x * 4096]
 * @return void
 *
 * Grid must be 1-D, 256 threads per block.
 *
 * IMPORTANT: This function does not need shared memory. Making the kernel call with a dynamic shared memory allocation
 * is known to cause some subtle bugs, that do not always show during normal execution.
 * A similar comment is present in fk20test_poly.cu and fk20_512test_poly.cu. In case this function changes and starts
 * needing shared memory, correct the tests in those two files.
 */
__global__ void fk20_poly2toeplitz_coefficients(fr_t *toeplitz_coefficients, const fr_t *polynomial) {

    // gridDim.x is the number of rows

    assert(gridDim.y  ==   1);
    assert(gridDim.z  ==   1);
    assert(blockDim.x == 256);
    assert(blockDim.y ==   1);
    assert(blockDim.z ==   1);

    const unsigned tid = threadIdx.x;   // Thread number
    const size_t   bid = blockIdx.x;    // Block number, widened so the row offsets below cannot wrap at 32 bits

    const unsigned segment      = 512;          // Output elements written per loop iteration
    const unsigned segments     = 16;           // Loop iterations (output segments) per row
    const size_t   polyRow      = 4096;         // Input elements per row
    const size_t   toeplitzRow  = 16 * 512;     // Output elements per row

    // Advance both pointers to the row handled by this block (64-bit arithmetic)

    polynomial            += polyRow     * bid;
    toeplitz_coefficients += toeplitzRow * bid;

    for (unsigned i = 0; i < segments; i++) {

        fr_t *out = toeplitz_coefficients + segment * i;

        // Copy from the polynomial into half of the coefficient array.
        // dst is 257..511 for threads 0..254 and wraps around to 0 for thread 255.

        const unsigned src = tid * 16 + 15 - i;
        const unsigned dst = (tid + 257) % segment;

        if (tid > 0)
            fr_cpy(out[dst], polynomial[src]);
        else
            fr_zero(out[dst]);  // Thread 0 does not copy; its slot (257) is zero

        // Only the warp containing thread 0 diverges above; bring its lanes back together

        __syncwarp(0xffffffff);

        // The other half of coefficients (indices 1..256) is all zero

        fr_zero(out[tid + 1]);
    }
}
// vim: ts=4 et sw=4 si