diff --git a/benchmarks/bench_gt_multiexp_bls12_381.nim b/benchmarks/bench_gt_multiexp_bls12_381.nim index 630f2a1d..01a8e699 100644 --- a/benchmarks/bench_gt_multiexp_bls12_381.nim +++ b/benchmarks/bench_gt_multiexp_bls12_381.nim @@ -45,6 +45,7 @@ proc main() = for numPoints in testNumPoints: let batchIters = max(1, Iters div numPoints) ctx12o4.multiExpParallelBench(numPoints, batchIters) + echo "----" ctx12o6.multiExpParallelBench(numPoints, batchIters) separator() separator() diff --git a/benchmarks/bench_gt_parallel_template.nim b/benchmarks/bench_gt_parallel_template.nim index bef3d7d7..f5d81dc9 100644 --- a/benchmarks/bench_gt_parallel_template.nim +++ b/benchmarks/bench_gt_parallel_template.nim @@ -128,6 +128,7 @@ proc multiExpParallelBench*[GT](ctx: var BenchMultiExpContext[GT], numInputs: in var startNaive, stopNaive, startMultiExpBaseline, stopMultiExpBaseline: MonoTime var startMultiExpOpt, stopMultiExpOpt: MonoTime var startMultiExpPara, stopMultiExpPara: MonoTime + var startMultiExpParaTorus, stopMultiExpParaTorus: MonoTime when GT is QuadraticExt: var startMultiExpBaselineTorus: MonoTime @@ -186,11 +187,22 @@ proc multiExpParallelBench*[GT](ctx: var BenchMultiExpContext[GT], numInputs: in startMultiExpPara = getMonotime() bench("𝔾ₜ multi-exp " & align($ctx.tp.numThreads & " threads", 11) & align($numInputs, 10) & " (" & $bits & "-bit exponents)", GT, iters): - ctx.tp.multiExp_vartime_parallel(r, elems, exponents) + ctx.tp.multiExp_vartime_parallel(r, elems, exponents, useTorus = false) stopMultiExpPara = getMonotime() ctx.tp.shutdown() + when GT is QuadraticExt: + block: + ctx.tp = Threadpool.new() + + startMultiExpParaTorus = getMonotime() + bench("𝔾ₜ multi-exp torus" & align($ctx.tp.numThreads & " threads", 11) & align($numInputs, 10) & " (" & $bits & "-bit exponents)", GT, iters): + ctx.tp.multiExp_vartime_parallel(r, elems, exponents, useTorus = true) + stopMultiExpParaTorus = getMonotime() + + ctx.tp.shutdown() + let perfNaive = inNanoseconds((stopNaive-startNaive) div iters) let perfMultiExpBaseline = inNanoseconds((stopMultiExpBaseline-startMultiExpBaseline) div iters) let perfMultiExpOpt = inNanoseconds((stopMultiExpOpt-startMultiExpOpt) div iters) @@ -198,6 +210,7 @@ proc multiExpParallelBench*[GT](ctx: var BenchMultiExpContext[GT], numInputs: in when GT is QuadraticExt: let perfMultiExpBaselineTorus = inNanoseconds((stopMultiExpBaselineTorus-startMultiExpBaselineTorus) div iters) let perfMultiExpOptTorus = inNanoseconds((stopMultiExpOptTorus-startMultiExpOptTorus) div iters) + let perfMultiExpParaTorus = inNanoSeconds((stopMultiExpParaTorus-startMultiExpParaTorus) div iters) if numInputs <= 100000: let speedupBaseline = float(perfNaive) / float(perfMultiExpBaseline) @@ -215,3 +228,10 @@ proc multiExpParallelBench*[GT](ctx: var BenchMultiExpContext[GT], numInputs: in let speedupParaOpt = float(perfMultiExpOpt) / float(perfMultiExpPara) echo &"Speedup ratio parallel over serial optimized linear combination: {speedupParaOpt:>6.3f}x" + + when GT is QuadraticExt: + let speedupParaTorus = float(perfMultiExpOptTorus) / float(perfMultiExpParaTorus) + echo &"Speedup ratio parallel over serial for Torus-based multiexp: {speedupParaTorus:>6.3f}x" + + let speedupParaTorusOpt = float(perfMultiExpPara) / float(perfMultiExpParaTorus) + echo &"Speedup ratio parallel over parallel Torus-based multiexp: {speedupParaTorusOpt:>6.3f}x" \ No newline at end of file diff --git a/constantine/math/pairings/gt_multiexp_parallel.nim b/constantine/math/pairings/gt_multiexp_parallel.nim index c4e7a9a5..a7aad4fd 100644 --- a/constantine/math/pairings/gt_multiexp_parallel.nim +++ b/constantine/math/pairings/gt_multiexp_parallel.nim @@ -239,7 +239,7 @@ proc applyEndoTorus_parallel[bits: static int, GT]( # but we could parallel batch convert over the whole range endoTorusBasis[i].batchFromGT_vartime(endoBasis[i]) - let endoTorusElems = cast[ptr UncheckedArray[GT]](endoTorusBasis) + let endoTorusElems = cast[ptr UncheckedArray[T2Aff[F]]](endoTorusBasis) let endoExpos = cast[ptr UncheckedArray[BigInt[L]]](splitExpos) freeHeapAligned(endoBasis) @@ -258,7 +258,10 @@ template withEndoTorus[exponentsBits: static int, GT]( let (endoTorusElems, endoExpos, endoN) = applyEndoTorus_parallel(tp, elems, expos, N) # Given that bits and N changed, we are able to use a bigger `c` # TODO: bench - multiExpProc(tp, r, endoTorusElems, endoExpos, endoN, c) + type F = typeof(elems[0].c0) + var r_torus {.noInit.}: T2Prj[F] + multiExpProc(tp, r_torus.addr, endoTorusElems, endoExpos, endoN, c) + r[].fromTorus2_vartime(r_torus) freeHeap(endoTorusElems) freeHeap(endoExpos) else: @@ -283,18 +286,18 @@ proc multiexp_dispatch_vartime_parallel[bits: static int, GT]( when useTorus: case c - of 2: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 2) - of 3: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 3) - of 4: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 4) - of 5: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 5) - of 6: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 6) - of 7: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 7) - of 8: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 8) - of 9: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 9) - of 10: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 10) - of 11: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 11) - of 12: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 12) - of 13: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 13) + of 2: withEndoTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 2) + of 3: withEndoTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 3) + of 4: withEndoTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 4) + of 5: withEndoTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 5) + of 6: withEndoTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 6) + of 7: withEndoTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 7) + of 8: withEndoTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 8) + of 9: withEndoTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 9) + of 10: withEndoTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 10) + of 11: withEndoTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 11) + of 12: withEndoTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 12) + of 13: withEndoTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 13) of 14: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 14) of 15: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 15)