Skip to content

Commit dd3238b

Browse files
committed
Documentation improvements
1 parent dfb2b11 commit dd3238b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+402
-405
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ FK20Py/datar
1313
FK20Py/scratchpad.py
1414
FK20Py/toeplitz_tmp.py
1515
fk20benchmark
16+
fk20test_poly2toeplitz_coefficients
1617
fk20test_poly2toeplitz_coefficients_fft
1718
gdb.in
1819
poly.txt

fftTest.cu

+23-24
Original file line numberDiff line numberDiff line change
@@ -30,82 +30,79 @@ void g1p_fromG1aHost(g1p_t &p, const g1a_t &a) {
3030

3131
void unpackffttest(ffttest_t testInputs, int testIDX, g1p_t g1p_input[512]){
3232
g1a_t tmp;
33-
//first, read the 256 fft input elements
33+
// First, read the 256 fft input elements
3434
for(int argidx=0; argidx<256; argidx++){
35-
/* because of limitation in the API of BLST, the test-case generator only
36-
* has access to the affine representation of G1 elements -- where each ealement is represented as
37-
* two elements of fp. The g1p_fft uses the other representation, where an extra element is used.
38-
* Notice that FFTTestCase.fftInputp is
35+
/* Because of limitation in the API of BLST, the test-case generator only has access to
36+
* the affine representation of G1 elements -- where each element is represented as two Fp
37+
* elements. The g1p_fft uses projective representation, where an extra Fp element is used.
38+
* Note that FFTTestCase.fftInputp is TODO
3939
*/
4040

4141
for(int j=0; j<6; j++){
4242
tmp.x[j] = testInputs.testCase[testIDX].fftInput[argidx].word[j];
4343
tmp.y[j] = testInputs.testCase[testIDX].fftInput[argidx].word[j+6];
4444
}
45-
//Convert these g1a to g1p
45+
// Convert these g1a to g1p
4646
g1p_fromG1aHost(g1p_input[argidx], tmp);
4747
}
4848

49-
50-
//the last 256 elements are zero at infinity due to the design of the reference python implementation
51-
g1p_t zinf = { { 0, 0, 0, 0, 0, 0 }, { 1, 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0, 0 } };
49+
// The last 256 elements are set to infinity due to the design of the reference Python implementation
5250

5351
for(int i=256; i<512; i++)
54-
g1p_input[i] = zinf;
55-
52+
g1p_inf(g1p_input[i]);
5653
}
5754

5855
void FFTTest_random(){
59-
//generates tests from randomness
56+
// Generates tests from randomness
6057
return;
6158
}
6259

6360
void FFTTest(){
64-
//uses tests picked from actual use cases, extracted from the instrumented python implementation
61+
// Uses tests picked from actual use-cases, extracted from the instrumented Python implementation
6562
const dim3 block(256,1,1);
6663
const dim3 grid(512,1,1);
6764
const size_t sharedmem = 73728; //72 KiB
6865

6966
clock_t elapsedTime;
7067

71-
//read data from testFFT.in using partseFFTTest
68+
// Read data from testFFT.in using partseFFTTest
7269
const char inputFile[] = "testFFT.in";
73-
ffttest_t testInputs = parseFFTTest(inputFile);
70+
ffttest_t testInputs = parseFFTTest(inputFile);
7471
if (testInputs.nTest == 0){
7572
exit(-1);
76-
}
73+
}
7774
else{
7875
fprintf(stderr, "<%s> Test inputs read: %d tests.\n", __func__, testInputs.nTest);
7976
}
8077

81-
//convert testcase into g1p format
78+
// Convert testcase into g1p format
8279
unpackffttest(testInputs, 0, g1p_input);
8380

84-
//Allocate memory
81+
// Allocate memory
8582
const size_t fftsize = 512*sizeof(g1p_t);
8683
const size_t memsize = grid.x*fftsize;
8784

8885
g1p_t *in, *out;
8986

9087
cudaMallocManaged(&in, memsize);
9188
cudaMallocManaged(&out, memsize);
92-
89+
9390
// Copy input to device
9491
for (int i=0; i<grid.x; i++) memcpy(in+i*512, g1p_input, fftsize);
9592

96-
//run multi-fft
93+
// Run multi-fft
9794
elapsedTime = -clock();
9895

9996
g1p_fft_wrapper<<<grid, block, sharedmem>>>(out, in);
100-
97+
10198
cudaDeviceSynchronize();
10299
elapsedTime += clock();
103100

104101
fprintf(stderr, "Kernel executed in %.5fs\n", elapsedTime * (1.0 / CLOCKS_PER_SEC) );
105-
//check for correctness, report errors
102+
// Check for correctness, report errors
106103
fprintf(stderr, "Hello, I still don't do error checking, duuude\n");
107104

108-
//dealocate
105+
// Deallocate
109106
cudaFree(in);
110107
cudaFree(out);
111108
freeffttest_t(&testInputs);
@@ -123,4 +120,6 @@ int main(){
123120
FFTTest();
124121

125122
return 0;
126-
}
123+
}
124+
125+
// vim: ts=4 et sw=4 si

fk20.cu

+6-6
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,12 @@
1717

1818
/**
1919
* @brief setup -> xext_fft
20-
*
20+
*
2121
* Grid must be 16, 256 threads per block.
22-
*
22+
*
2323
* @param[out] xext_fft array with dimension [16*512]
2424
* @param setup array with dimension [16*512]
25-
* @return void
25+
* @return void
2626
*/
2727
__global__ void fk20_setup2xext_fft(g1p_t *xext_fft, const g1p_t *setup) {
2828
//TODO: Not passing test, probably bad block indexing
@@ -65,10 +65,10 @@ __global__ void fk20_setup2xext_fft(g1p_t *xext_fft, const g1p_t *setup) {
6565

6666
/**
6767
* @brief hext_fft -> hext
68-
*
68+
*
6969
* @param[in] hext array with 512*gridDim.x elements
7070
* @param[out] hext_fft array with 512*gridDim.x elements
71-
* @return
71+
* @return
7272
*/
7373
__global__ void fk20_hext_fft2hext(g1p_t *hext, const g1p_t *hext_fft) {
7474
g1p_ift(hext, hext_fft);
@@ -84,7 +84,7 @@ __global__ void fk20_hext_fft2hext(g1p_t *hext, const g1p_t *hext_fft) {
8484

8585
/**
8686
* @brief h -> h_fft
87-
*
87+
*
8888
* @param[out] h_fft array with 512*gridDim.x elements
8989
* @param[in] h array with 512*gridDim.x elements
9090
* @return void

fk20_512test.cu

+24-24
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ void varMangle(g1p_t *target, size_t size, unsigned step);
6767
* @brief Executes a many-row tests on FK20. Behavior is similar to fk20test.cu
6868
* but using many GPU blocks, each one executing one known-answer test. All tests
6969
* are different. KATS are statically linked in the binary.
70-
*
70+
*
7171
* @param argc Command line argument cont
7272
* @param argv Command line argument values
7373
* @return int 0
@@ -113,21 +113,21 @@ int main(int argc, char **argv) {
113113

114114
/**
115115
* NOTE ON DEPRECATED FUNCTIONS
116-
*
116+
*
117117
* In the main call, some tests are commented out, namely:
118118
* -hext_fft2h_fft_512
119119
* -fk20_poly2toeplitz_coefficients_fft_test
120120
* Those tests are regarding fk20 functions that execute more than one step in
121121
* a single kernel. They cover a unimplemented (possible) future optimization.
122-
*
122+
*
123123
*/
124124
/******************************************************************************/
125125

126126
/**
127-
* @brief Executes many FK20 computations on a single row, with a check on
127+
* @brief Executes many FK20 computations on a single row, with a check on
128128
* each step. A computation failure will not cause a cascade effect, eliminating
129129
* false-fails due to data dependencies.
130-
*
130+
*
131131
* @param rows number of blocks in the range [1,512]
132132
*/
133133
void fullTest_512(unsigned rows){
@@ -137,7 +137,7 @@ void fullTest_512(unsigned rows){
137137

138138
// Setup
139139

140-
//SET_SHAREDMEM(fr_sharedmem, fr_fft_wrapper);
140+
SET_SHAREDMEM(fr_sharedmem, fr_fft_wrapper);
141141
SET_SHAREDMEM(g1p_sharedmem, g1p_fft_wrapper);
142142
SET_SHAREDMEM(g1p_sharedmem, g1p_ift_wrapper);
143143

@@ -227,9 +227,9 @@ void fullTest_512(unsigned rows){
227227
/**
228228
* @brief Similar to fullTest, but polynomial is has changes done to it. The
229229
* function checks for false-positive in the tests.
230-
*
230+
*
231231
* polynomial is restored after execution.
232-
*
232+
*
233233
* @param rows number of blocks in the range [1,512]
234234
*/
235235
void fullTestFalseability_512(unsigned rows){
@@ -239,7 +239,7 @@ void fullTestFalseability_512(unsigned rows){
239239

240240
// Setup
241241

242-
//SET_SHAREDMEM(fr_sharedmem, fr_fft_wrapper);
242+
SET_SHAREDMEM(fr_sharedmem, fr_fft_wrapper);
243243
SET_SHAREDMEM(g1p_sharedmem, g1p_fft_wrapper);
244244
SET_SHAREDMEM(g1p_sharedmem, g1p_ift_wrapper);
245245

@@ -332,7 +332,7 @@ The testing functions follow an common template, described in ./doc/fk20test.md
332332

333333
/**
334334
* @brief Test for fr_fft: toeplitz_coefficients -> toeplitz_coefficients_fft
335-
*
335+
*
336336
* @param rows number of blocks in the range [1,512]
337337
*/
338338
void toeplitz_coefficients2toeplitz_coefficients_fft_512(unsigned rows){
@@ -370,7 +370,7 @@ void toeplitz_coefficients2toeplitz_coefficients_fft_512(unsigned rows){
370370

371371
/**
372372
* @brief Test for g1p_fft: h -> h_fft"
373-
*
373+
*
374374
* @param rows number of blocks in the range [1,512]
375375
*/
376376
void h2h_fft_512(unsigned rows){
@@ -410,7 +410,7 @@ void h2h_fft_512(unsigned rows){
410410

411411
/**
412412
* @brief Test for g1p_ift: h_fft -> h
413-
*
413+
*
414414
* @param rows number of blocks in the range [1,512]
415415
*/
416416
void h_fft2h_512(unsigned rows){
@@ -451,7 +451,7 @@ void h_fft2h_512(unsigned rows){
451451

452452
/**
453453
* @brief Test for g1p_ift: hext_fft -> h
454-
*
454+
*
455455
* @param rows number of blocks in the range [1,512]
456456
*/
457457
void hext_fft2h_512(unsigned rows){
@@ -467,9 +467,9 @@ void hext_fft2h_512(unsigned rows){
467467

468468
CLOCKSTART;
469469
g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(g1p_tmp, hext_fft);
470-
CUDASYNC("g1p_ift_wrapper");
470+
CUDASYNC("g1p_ift_wrapper");
471471
fk20_hext2h<<<rows, 256>>>(g1p_tmp);
472-
CUDASYNC("fk20_hext2h");
472+
CUDASYNC("fk20_hext2h");
473473
CLOCKEND;
474474

475475
clearRes;
@@ -491,7 +491,7 @@ void hext_fft2h_512(unsigned rows){
491491

492492
/**
493493
* @brief Test for fk20_poly2toeplitz_coefficients: polynomial -> toeplitz_coefficients
494-
*
494+
*
495495
* @param rows number of blocks in the range [1,512]
496496
*/
497497
void fk20_poly2toeplitz_coefficients_512(unsigned rows) {
@@ -529,7 +529,7 @@ void fk20_poly2toeplitz_coefficients_512(unsigned rows) {
529529

530530
/**
531531
* @brief Test for fk20_poly2hext_fft: polynomial -> hext_fft
532-
*
532+
*
533533
* @param rows number of blocks in the range [1,512]
534534
*/
535535
void fk20_poly2hext_fft_512(unsigned rows){
@@ -540,7 +540,7 @@ void fk20_poly2hext_fft_512(unsigned rows){
540540

541541
pass = true;
542542

543-
//SET_SHAREDMEM(g1p_sharedmem, fk20_poly2hext_fft);
543+
SET_SHAREDMEM(g1p_sharedmem, fk20_poly2hext_fft);
544544

545545
printf("=== RUN %s\n", "fk20_poly2hext_fft: polynomial -> hext_fft");
546546
for(int testIDX=0; testIDX<=1; testIDX++){
@@ -569,7 +569,7 @@ void fk20_poly2hext_fft_512(unsigned rows){
569569

570570
/**
571571
* @brief Test for fk20_poly2h_fft: polynomial -> h_fft
572-
*
572+
*
573573
* @param rows number of blocks in the range [1,512]
574574
*/
575575
void fk20_poly2h_fft_512(unsigned rows){
@@ -606,7 +606,7 @@ void fk20_poly2h_fft_512(unsigned rows){
606606

607607
/**
608608
* @brief Test for hext_fft2h_fft_512: hext_fft -> h_fft
609-
*
609+
*
610610
* @param rows number of blocks in the range [1,512]
611611
*/
612612
void hext_fft2h_fft_512(unsigned rows){
@@ -644,7 +644,7 @@ void hext_fft2h_fft_512(unsigned rows){
644644

645645
/**
646646
* @brief Test for fk20_msm: Toeplitz_coefficients+xext_fft -> hext_fft
647-
*
647+
*
648648
* @param rows number of blocks in the range [1,512]
649649
*/
650650
void fk20_msmloop_512(unsigned rows){
@@ -693,7 +693,7 @@ void fk20_msmloop_512(unsigned rows){
693693
CLOCKSTART;
694694
fk20_poly2toeplitz_coefficients_fft<<<rows, 256>>>(fr_tmp_, polynomial);
695695
err = cudaDeviceSynchronize();
696-
CUDASYNC("fk20_poly2toeplitz_coefficients_fft");
696+
CUDASYNC("fk20_poly2toeplitz_coefficients_fft");
697697
CLOCKEND;
698698
clearRes;
699699
fr_eq_wrapper<<<16, 256>>>(cmp, rows*16*512, fr_tmp_, (fr_t *)toeplitz_coefficients_fft);
@@ -722,7 +722,7 @@ void fk20_msmloop_512(unsigned rows){
722722
/**
723723
* @brief swap elements at positions multiple of step. Nondestructive, call
724724
* a second time to undo the changes
725-
*
725+
*
726726
* @param[out] target Pointer to array
727727
* @param[in] size length of the array
728728
* @param[in] step distance between elements swapped.
@@ -744,7 +744,7 @@ void varMangle(fr_t *target, size_t size, unsigned step){
744744
/**
745745
* @brief swap elements at positions multiple of step. Nondestructive, call
746746
* a second time to undo the changes
747-
*
747+
*
748748
* @param[out] target Pointer to array
749749
* @param[in] size length of the array
750750
* @param[in] step distance between elements swapped.

fk20_hext_fft2h_fft.cu

+4-4
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,13 @@
1010

1111
/**
1212
* @brief hext_fft -> h_fft
13-
*
13+
*
1414
* Grid must be 1-D, 256 threads per block.
1515
* Dynamic shared memory: g1p_sharedmem(73728 Bytes)
16-
*
16+
*
1717
* @param[out] h_fft array with dimensions [gridDim.x * 512]
1818
* @param[in] hext_fft array with dimensions [gridDim.x * 512]
19-
* @return void
19+
* @return void
2020
*/
2121
__global__ void fk20_hext_fft2h_fft(g1p_t *h_fft, const g1p_t *hext_fft){
2222
if (gridDim.y != 1) return;
@@ -37,7 +37,7 @@ __global__ void fk20_hext_fft2h_fft(g1p_t *h_fft, const g1p_t *hext_fft){
3737
g1p_ift(h_fft, hext_fft);
3838
__syncthreads();
3939

40-
// zero second half of h
40+
// Zero second half of h
4141
g1p_inf(h_fft[256+tid]);
4242
__syncthreads();
4343

0 commit comments

Comments
 (0)