-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfftTest.cu
125 lines (96 loc) · 3.33 KB
/
fftTest.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
// Tests for the fk20 FFT, loosely based on fk20test_kat.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <cuda.h>
#include "g1.cuh"
#include "fk20.cuh"
extern "C"{
#include "parseFFTTest.h"
}
// Managed buffers holding one 512-element FFT each.
// g1p_input is filled by unpackffttest() and replicated into the kernel input by FFTTest().
// g1p_output and g1p_expected are not referenced by the code visible in this file;
// presumably reserved for the (not yet implemented) result comparison -- TODO confirm.
__managed__ g1p_t g1p_input[512], g1p_output[512], g1p_expected[512];
/**
 * Host-side test for an all-zero affine point.
 *
 * @param a  affine point whose x and y coordinates are inspected (6 limbs each)
 * @return   true when every limb of a.x and a.y is zero, false otherwise
 */
bool g1a_iszeroHost(const g1a_t &a) {
    for (int limb = 0; limb < 6; limb++) {
        // A single non-zero limb in either coordinate is enough to answer "no".
        if (a.x[limb] != 0 || a.y[limb] != 0) {
            return false;
        }
    }
    return true;
}
/**
 * Host-side conversion from affine (x, y) to projective (x, y, z) coordinates.
 *
 * An all-zero affine input (see g1a_iszeroHost) is mapped to the encoding
 * x = 0, y = 1, z = 0. Any other input keeps its x and y and gets z = 1,
 * written as limb 0 = 1 with all remaining limbs cleared.
 *
 * @param p  destination, fully overwritten
 * @param a  source affine point
 */
void g1p_fromG1aHost(g1p_t &p, const g1a_t &a) {
    if (g1a_iszeroHost(a)) {
        p = { { 0, 0, 0, 0, 0, 0 }, { 1, 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0, 0 } };
        // Must stop here: falling through would overwrite y with 0 and z with 1,
        // destroying the special encoding that was just stored.
        return;
    }

    for (int i = 0; i < 6; i++) {
        p.x[i] = a.x[i];
        p.y[i] = a.y[i];
    }

    // z = 1 (host replacement for fp_one(p.z)). Every upper limb has to be
    // cleared, including limb 1, because p may hold stale data on entry.
    p.z[0] = 1;
    for (int i = 1; i < 6; i++) p.z[i] = 0;
}
/**
 * Expands one parsed test case into the 512-entry projective input of the FFT.
 *
 * @param testInputs  parsed test data
 * @param testIDX     index of the test case to unpack (not range-checked here)
 * @param g1p_input   destination array of 512 projective points
 */
void unpackffttest(ffttest_t testInputs, int testIDX, g1p_t g1p_input[512]){
    const int nStored = 256;    // entries supplied by the test case
    const int fftLength = 512;  // entries expected in g1p_input

    int slot = 0;

    /* The test-case generator only has access to the affine representation of
     * G1 elements (a limitation of the BLST API): two Fp elements per point,
     * stored as 12 consecutive words -- 6 for x followed by 6 for y.
     * The FFT works on the projective representation, which carries one extra
     * Fp element, so every point is converted on the way in.
     * Note that FFTTestCase.fftInputp is TODO.
     */
    while (slot < nStored){
        const auto &packed = testInputs.testCase[testIDX].fftInput[slot];
        g1a_t affine;

        for (int w = 0; w < 6; w++){
            affine.x[w] = packed.word[w];
            affine.y[w] = packed.word[w+6];
        }

        g1p_fromG1aHost(g1p_input[slot], affine);
        slot++;
    }

    // The remaining half is set to infinity, following the design of the
    // reference Python implementation.
    for (; slot < fftLength; slot++){
        g1p_inf(g1p_input[slot]);
    }
}
/**
 * Placeholder for a test driven by randomly generated inputs.
 * Not implemented: the function currently does nothing.
 */
void FFTTest_random(){
    /* intentionally empty */
}
// Reports a failed CUDA runtime call on stderr; returns true when err is cudaSuccess.
static bool FFTTest_cudaOk(cudaError_t err, const char *what){
    if (err == cudaSuccess) return true;
    fprintf(stderr, "<FFTTest> %s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

/**
 * Runs g1p_fft_wrapper on inputs picked from actual use-cases, extracted from
 * the instrumented Python implementation.
 *
 * Launch layout: 512 blocks of 256 threads, 72 KiB of dynamic shared memory
 * per block. Every block receives its own copy of the same 512-point input.
 *
 * Reads "testFFT.in" from the current directory. Terminates the process with
 * exit(-1) when no test can be read or when any CUDA call fails.
 * The FFT output is not yet compared against expected values.
 */
void FFTTest(){
    const dim3 block(256,1,1);
    const dim3 grid(512,1,1);
    const size_t sharedmem = 73728; //72 KiB
    clock_t elapsedTime;

    // Read data from testFFT.in using parseFFTTest
    const char inputFile[] = "testFFT.in";
    ffttest_t testInputs = parseFFTTest(inputFile);

    if (testInputs.nTest == 0){
        fprintf(stderr, "<%s> No tests could be read from %s\n", __func__, inputFile);
        exit(-1);
    }
    fprintf(stderr, "<%s> Test inputs read: %d tests.\n", __func__, testInputs.nTest);

    // Convert testcase into g1p format
    unpackffttest(testInputs, 0, g1p_input);

    // Allocate memory
    const size_t fftsize = 512*sizeof(g1p_t);
    const size_t memsize = grid.x*fftsize;
    g1p_t *in = NULL, *out = NULL;

    bool ok = FFTTest_cudaOk(cudaMallocManaged(&in, memsize), "cudaMallocManaged(in)")
           && FFTTest_cudaOk(cudaMallocManaged(&out, memsize), "cudaMallocManaged(out)")
           // More than 48 KiB of dynamic shared memory per block needs an explicit opt-in,
           // otherwise the launch is rejected.
           && FFTTest_cudaOk(cudaFuncSetAttribute(g1p_fft_wrapper,
                                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
                                                  (int)sharedmem),
                             "cudaFuncSetAttribute(MaxDynamicSharedMemorySize)");

    if (ok){
        // Replicate the input once per block (managed memory, written from the host)
        for (unsigned int i=0; i<grid.x; i++) memcpy(in+i*512, g1p_input, fftsize);

        // Run multi-fft
        // NOTE: clock() reports processor time of the host process, so this is only a
        // rough figure for the kernel's duration.
        elapsedTime = -clock();
        g1p_fft_wrapper<<<grid, block, sharedmem>>>(out, in);
        const cudaError_t launchStatus = cudaGetLastError();    // bad launch configuration
        const cudaError_t syncStatus   = cudaDeviceSynchronize(); // faults during execution
        elapsedTime += clock();

        ok = FFTTest_cudaOk(launchStatus, "g1p_fft_wrapper launch");
        ok = FFTTest_cudaOk(syncStatus, "cudaDeviceSynchronize") && ok;

        if (ok){
            fprintf(stderr, "Kernel executed in %.5fs\n", elapsedTime * (1.0 / CLOCKS_PER_SEC) );

            // Check for correctness, report errors
            fprintf(stderr, "Hello, I still don't do error checking, duuude\n");
        }
    }

    // Deallocate (cudaFree accepts NULL, so partially failed setups are fine)
    cudaFree(in);
    cudaFree(out);
    freeffttest_t(&testInputs);

    if (!ok){
        exit(-1);
    }
}
/**
 * One-time setup hook called by main() before the tests run.
 * Currently there is nothing to set up.
 */
void init(){
    /* intentionally empty */
}
/**
 * Program entry point: runs the setup hook, prints a marker line on stdout,
 * then executes the FFT test.
 *
 * @return 0 once FFTTest() has returned
 */
int main(void){
    init();

    fputs("Debug\n", stdout);

    FFTTest();

    return 0;
}
// vim: ts=4 et sw=4 si