lCardosoSantos
diff --git a/‎.gitignore
+1 b/‎.gitignore
+1
diff --git a/‎fftTest.cu
+23-24 b/‎fftTest.cu
+23-24
diff --git a/‎fk20.cu
+6-6 b/‎fk20.cu
+6-6
diff --git a/‎fk20_512test.cu
+24-24 b/‎fk20_512test.cu
+24-24
diff --git a/‎fk20_hext_fft2h_fft.cu
+4-4 b/‎fk20_hext_fft2h_fft.cu
+4-4
@@ -13,6 +13,7 @@ FK20Py/datar
 FK20Py/scratchpad.py
 FK20Py/toeplitz_tmp.py
 fk20benchmark
+fk20test_poly2toeplitz_coefficients
 fk20test_poly2toeplitz_coefficients_fft
 gdb.in
 poly.txt
 
@@ -30,82 +30,79 @@ void g1p_fromG1aHost(g1p_t &p, const g1a_t &a) {
 
 void unpackffttest(ffttest_t testInputs, int testIDX, g1p_t g1p_input[512]){
     g1a_t tmp;
-    //first, read the 256 fft input elements
+    // First, read the 256 fft input elements
     for(int argidx=0; argidx<256; argidx++){
-        /* because of limitation in the API of BLST, the test-case generator only
-         * has access to the affine representation of G1 elements -- where each ealement is represented as
-         * two elements of fp. The g1p_fft uses the other representation, where an extra element is used. 
-         * Notice that FFTTestCase.fftInputp is 
+        /* Because of a limitation in the BLST API, the test-case generator only has access to
+         * the affine representation of G1 elements -- where each element is represented as two Fp
+         * elements. g1p_fft uses the projective representation, where an extra Fp element is used.
+         * TODO: complete this note -- "FFTTestCase.fftInputp is ..." was left unfinished.
          */
 
         for(int j=0; j<6; j++){
         tmp.x[j] = testInputs.testCase[testIDX].fftInput[argidx].word[j];
         tmp.y[j] = testInputs.testCase[testIDX].fftInput[argidx].word[j+6];
         }
-        //Convert these g1a to g1p
+        // Convert these g1a to g1p
         g1p_fromG1aHost(g1p_input[argidx], tmp);
     }
 
-
-    //the last 256 elements are zero at infinity due to the design of the reference python implementation
-    g1p_t zinf = { { 0, 0, 0, 0, 0, 0 }, { 1, 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0, 0 } };
+    // The last 256 elements are set to infinity due to the design of the reference Python implementation
 
     for(int i=256; i<512; i++)
-        g1p_input[i] = zinf;
-
+        g1p_inf(g1p_input[i]);
 }
 
 void FFTTest_random(){
-    //generates tests from randomness
+    // Generates tests from randomness
     return;
 }
 
 void FFTTest(){
-    //uses tests picked from actual use cases, extracted from the instrumented python implementation
+    // Uses tests picked from actual use-cases, extracted from the instrumented Python implementation
     const dim3 block(256,1,1);
     const dim3 grid(512,1,1);
     const size_t sharedmem = 73728; //72 KiB
 
     clock_t elapsedTime;
 
-    //read data from testFFT.in using partseFFTTest
+    // Read data from testFFT.in using parseFFTTest
     const char inputFile[] = "testFFT.in";
-    ffttest_t testInputs = parseFFTTest(inputFile); 
+    ffttest_t testInputs = parseFFTTest(inputFile);
     if (testInputs.nTest == 0){
         exit(-1);
-    } 
+    }
     else{
         fprintf(stderr, "<%s> Test inputs read: %d tests.\n", __func__, testInputs.nTest);
     }
 
-    //convert testcase into g1p format
+    // Convert testcase into g1p format
     unpackffttest(testInputs, 0, g1p_input);
 
-    //Allocate memory
+    // Allocate memory
     const size_t fftsize = 512*sizeof(g1p_t);
     const size_t memsize = grid.x*fftsize;
 
     g1p_t *in, *out;
 
     cudaMallocManaged(&in,  memsize);
     cudaMallocManaged(&out, memsize);
-    
+
     // Copy input to device
     for (int i=0; i<grid.x; i++) memcpy(in+i*512, g1p_input, fftsize);
 
-    //run multi-fft
+    // Run multi-fft
     elapsedTime = -clock();
 
     g1p_fft_wrapper<<<grid, block, sharedmem>>>(out, in);
-    
+
     cudaDeviceSynchronize();
     elapsedTime += clock();
 
     fprintf(stderr, "Kernel executed in %.5fs\n", elapsedTime * (1.0 / CLOCKS_PER_SEC) );
-    //check for correctness, report errors
+    // Check for correctness, report errors
     fprintf(stderr, "Hello, I still don't do error checking, duuude\n");
 
-    //dealocate
+    // Deallocate
     cudaFree(in);
     cudaFree(out);
     freeffttest_t(&testInputs);
@@ -123,4 +120,6 @@ int main(){
     FFTTest();
 
     return 0;
-}
+}
+
+// vim: ts=4 et sw=4 si
@@ -17,12 +17,12 @@
 
 /**
  * @brief setup -> xext_fft
- * 
+ *
  * Grid must be 16, 256 threads per block.
- * 
+ *
  * @param[out] xext_fft array with dimension [16*512]
  * @param setup array with dimension [16*512]
- * @return void 
+ * @return void
  */
 __global__ void fk20_setup2xext_fft(g1p_t *xext_fft, const g1p_t *setup) {
     //TODO: Not passing test, probably bad block indexing
@@ -65,10 +65,10 @@ __global__ void fk20_setup2xext_fft(g1p_t *xext_fft, const g1p_t *setup) {
 
 /**
  * @brief hext_fft -> hext
- * 
+ *
  * @param[out] hext array with 512*gridDim.x elements
  * @param[in] hext_fft array with 512*gridDim.x elements
- * @return 
+ * @return
  */
 __global__ void fk20_hext_fft2hext(g1p_t *hext, const g1p_t *hext_fft) {
     g1p_ift(hext, hext_fft);
@@ -84,7 +84,7 @@ __global__ void fk20_hext_fft2hext(g1p_t *hext, const g1p_t *hext_fft) {
 
 /**
  * @brief h -> h_fft
- * 
+ *
  * @param[out] h_fft array with 512*gridDim.x elements
  * @param[in] h array with 512*gridDim.x elements
  * @return void
 
@@ -67,7 +67,7 @@ void varMangle(g1p_t *target, size_t size, unsigned step);
  * @brief Executes many-row tests on FK20. Behavior is similar to fk20test.cu,
  * but using many GPU blocks, each one executing one known-answer test (KAT). All tests
  * are different. KATs are statically linked into the binary.
- * 
+ *
  * @param argc Command line argument count
  * @param argv Command line argument values
  * @return int 0
@@ -113,21 +113,21 @@ int main(int argc, char **argv) {
 
 /**
  * NOTE ON DEPRECATED FUNCTIONS
- * 
+ *
  * In the main call, some tests are commented out, namely:
  * -hext_fft2h_fft_512
  * -fk20_poly2toeplitz_coefficients_fft_test
  * Those tests target fk20 functions that execute more than one step in
  * a single kernel. They cover an unimplemented (possible) future optimization.
- * 
+ *
  */
 /******************************************************************************/
 
 /**
- * @brief Executes many FK20 computations on a single row, with a check on 
+ * @brief Executes many FK20 computations on a single row, with a check on
  * each step. A computation failure will not cause a cascade effect, eliminating
  * false-fails due to data dependencies.
- * 
+ *
  * @param rows number of blocks in the range [1,512]
  */
 void fullTest_512(unsigned rows){
@@ -137,7 +137,7 @@ void fullTest_512(unsigned rows){
 
     // Setup
 
-    //SET_SHAREDMEM(fr_sharedmem,  fr_fft_wrapper);
+    SET_SHAREDMEM(fr_sharedmem,  fr_fft_wrapper);
     SET_SHAREDMEM(g1p_sharedmem, g1p_fft_wrapper);
     SET_SHAREDMEM(g1p_sharedmem, g1p_ift_wrapper);
 
@@ -227,9 +227,9 @@ void fullTest_512(unsigned rows){
 /**
  * @brief Similar to fullTest, but the polynomial has changes made to it. The
  * function checks for false positives in the tests.
- * 
+ *
  * polynomial is restored after execution.
- * 
+ *
  * @param rows number of blocks in the range [1,512]
  */
 void fullTestFalseability_512(unsigned rows){
@@ -239,7 +239,7 @@ void fullTestFalseability_512(unsigned rows){
 
     // Setup
 
-    //SET_SHAREDMEM(fr_sharedmem,  fr_fft_wrapper);
+    SET_SHAREDMEM(fr_sharedmem,  fr_fft_wrapper);
     SET_SHAREDMEM(g1p_sharedmem, g1p_fft_wrapper);
     SET_SHAREDMEM(g1p_sharedmem, g1p_ift_wrapper);
 
@@ -332,7 +332,7 @@ The testing functions follow an common template, described in ./doc/fk20test.md
 
 /**
  * @brief Test for fr_fft: toeplitz_coefficients -> toeplitz_coefficients_fft
- * 
+ *
  * @param rows number of blocks in the range [1,512]
  */
 void toeplitz_coefficients2toeplitz_coefficients_fft_512(unsigned rows){
@@ -370,7 +370,7 @@ void toeplitz_coefficients2toeplitz_coefficients_fft_512(unsigned rows){
 
 /**
  * @brief Test for g1p_fft: h -> h_fft
- * 
+ *
  * @param rows number of blocks in the range [1,512]
  */
 void h2h_fft_512(unsigned rows){
@@ -410,7 +410,7 @@ void h2h_fft_512(unsigned rows){
 
 /**
  * @brief Test for g1p_ift: h_fft -> h
- * 
+ *
  * @param rows number of blocks in the range [1,512]
  */
 void h_fft2h_512(unsigned rows){
@@ -451,7 +451,7 @@ void h_fft2h_512(unsigned rows){
 
 /**
  * @brief Test for g1p_ift: hext_fft -> h
- * 
+ *
  * @param rows number of blocks in the range [1,512]
  */
 void hext_fft2h_512(unsigned rows){
@@ -467,9 +467,9 @@ void hext_fft2h_512(unsigned rows){
 
         CLOCKSTART;
         g1p_ift_wrapper<<<rows, 256, g1p_sharedmem>>>(g1p_tmp, hext_fft);
-        CUDASYNC("g1p_ift_wrapper"); 
+        CUDASYNC("g1p_ift_wrapper");
         fk20_hext2h<<<rows, 256>>>(g1p_tmp);
-        CUDASYNC("fk20_hext2h"); 
+        CUDASYNC("fk20_hext2h");
         CLOCKEND;
 
         clearRes;
@@ -491,7 +491,7 @@ void hext_fft2h_512(unsigned rows){
 
 /**
  * @brief Test for fk20_poly2toeplitz_coefficients: polynomial -> toeplitz_coefficients
- * 
+ *
  * @param rows number of blocks in the range [1,512]
  */
 void fk20_poly2toeplitz_coefficients_512(unsigned rows) {
@@ -529,7 +529,7 @@ void fk20_poly2toeplitz_coefficients_512(unsigned rows) {
 
 /**
  * @brief Test for fk20_poly2hext_fft: polynomial -> hext_fft
- * 
+ *
  * @param rows number of blocks in the range [1,512]
  */
 void fk20_poly2hext_fft_512(unsigned rows){
@@ -540,7 +540,7 @@ void fk20_poly2hext_fft_512(unsigned rows){
 
     pass = true;
 
-    //SET_SHAREDMEM(g1p_sharedmem, fk20_poly2hext_fft);
+    SET_SHAREDMEM(g1p_sharedmem, fk20_poly2hext_fft);
 
     printf("=== RUN   %s\n", "fk20_poly2hext_fft: polynomial -> hext_fft");
     for(int testIDX=0; testIDX<=1; testIDX++){
@@ -569,7 +569,7 @@ void fk20_poly2hext_fft_512(unsigned rows){
 
 /**
  * @brief Test for fk20_poly2h_fft: polynomial -> h_fft
- * 
+ *
  * @param rows number of blocks in the range [1,512]
  */
 void fk20_poly2h_fft_512(unsigned rows){
@@ -606,7 +606,7 @@ void fk20_poly2h_fft_512(unsigned rows){
 
 /**
  * @brief Test for hext_fft2h_fft_512: hext_fft -> h_fft
- * 
+ *
  * @param rows number of blocks in the range [1,512]
  */
 void hext_fft2h_fft_512(unsigned rows){
@@ -644,7 +644,7 @@ void hext_fft2h_fft_512(unsigned rows){
 
 /**
  * @brief Test for fk20_msm: Toeplitz_coefficients+xext_fft -> hext_fft
- * 
+ *
  * @param rows number of blocks in the range [1,512]
  */
 void fk20_msmloop_512(unsigned rows){
@@ -693,7 +693,7 @@ void fk20_msmloop_512(unsigned rows){
         CLOCKSTART;
         fk20_poly2toeplitz_coefficients_fft<<<rows, 256>>>(fr_tmp_, polynomial);
         err = cudaDeviceSynchronize();
-        CUDASYNC("fk20_poly2toeplitz_coefficients_fft"); 
+        CUDASYNC("fk20_poly2toeplitz_coefficients_fft");
         CLOCKEND;
         clearRes;
         fr_eq_wrapper<<<16, 256>>>(cmp, rows*16*512, fr_tmp_, (fr_t *)toeplitz_coefficients_fft);
@@ -722,7 +722,7 @@ void fk20_msmloop_512(unsigned rows){
 /**
  * @brief Swap elements at positions that are multiples of step. Nondestructive: call
  * a second time to undo the changes.
- * 
+ *
  * @param[out] target Pointer to array
  * @param[in] size length of the array
  * @param[in] step distance between elements swapped.
@@ -744,7 +744,7 @@ void varMangle(fr_t *target, size_t size, unsigned step){
 /**
  * @brief Swap elements at positions that are multiples of step. Nondestructive: call
  * a second time to undo the changes.
- * 
+ *
  * @param[out] target Pointer to array
  * @param[in] size length of the array
  * @param[in] step distance between elements swapped.
 
@@ -10,13 +10,13 @@
 
 /**
  * @brief hext_fft -> h_fft
- * 
+ *
  * Grid must be 1-D, 256 threads per block.
  * Dynamic shared memory: g1p_sharedmem(73728 Bytes)
- * 
+ *
  * @param[out] h_fft array with dimensions [gridDim.x * 512]
  * @param[in] hext_fft array with dimensions [gridDim.x * 512]
- * @return void 
+ * @return void
  */
 __global__ void fk20_hext_fft2h_fft(g1p_t *h_fft, const g1p_t *hext_fft){
     if (gridDim.y  !=   1) return;
@@ -37,7 +37,7 @@ __global__ void fk20_hext_fft2h_fft(g1p_t *h_fft, const g1p_t *hext_fft){
     g1p_ift(h_fft, hext_fft);
     __syncthreads();
 
-    // zero second half of h
+    // Zero second half of h
     g1p_inf(h_fft[256+tid]);
     __syncthreads();