Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add optional DUCC FFT support #463

Merged
merged 33 commits into from
Jul 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
6b17007
downloading ducc0 in cmake
DiamonDinoia Jun 21, 2024
6479f47
most things are in place; need CMake machinery to compile ducc's .cc …
mreineck Jun 22, 2024
bc6f789
change names of unused variables, to spot inadvertent usage
mreineck Jun 22, 2024
0ff245a
almost working; some tests don't compile yet
mreineck Jun 22, 2024
2a8b39a
seems to run
mreineck Jun 22, 2024
127e015
more fixes
mreineck Jun 22, 2024
910d656
fix typo
mreineck Jun 22, 2024
83c7ecb
require C++17
mreineck Jun 22, 2024
2de0e50
more fixes
mreineck Jun 22, 2024
34013ab
one more
mreineck Jun 22, 2024
4fabaef
try to fix Windows
mreineck Jun 22, 2024
ff0944e
resolved conflicts
DiamonDinoia Jun 22, 2024
2142092
Added missing files
DiamonDinoia Jun 22, 2024
a252fff
merge Marco's updates
mreineck Jun 23, 2024
32bc8a1
cleanup
mreineck Jun 23, 2024
4975637
adjust indentation style
mreineck Jun 23, 2024
be575fc
fix
mreineck Jun 23, 2024
09874c2
pedantic cleanups
mreineck Jun 23, 2024
b7d10fd
added fft wrapper files and fixed cmake
DiamonDinoia Jun 24, 2024
601468a
be more careful about the number of threads to use
mreineck Jun 25, 2024
9436792
cleanup
mreineck Jun 25, 2024
9f86e46
move central FFT part to separate file
mreineck Jul 3, 2024
c8cc746
merge master
mreineck Jul 3, 2024
aa957c4
add fft.o to makefile
mreineck Jul 4, 2024
fe42592
add fft.o to makefile; run formatter on changed C++ files
mreineck Jul 4, 2024
caae2fc
more band-aid fixes
mreineck Jul 4, 2024
132a755
move fft.o to OBJS
mreineck Jul 8, 2024
5206e14
more fixes
mreineck Jul 8, 2024
5a5acb4
try to fix CI
mreineck Jul 8, 2024
632b335
shorten the partial FFT part
mreineck Jul 10, 2024
8b4fadf
allow switching off ducc FFT optimization via #define
mreineck Jul 11, 2024
17e8e04
allocate big arrays during plan generation unconditionally
mreineck Jul 11, 2024
6f8e4c8
add a comment on FFT savings
mreineck Jul 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 46 additions & 13 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ option(FINUFFT_USE_CUDA "Whether to build CUDA accelerated FINUFFT library (libc
option(FINUFFT_USE_CPU "Whether to build the ordinary FINUFFT library (libfinufft)." ON)
option(FINUFFT_STATIC_LINKING "Whether to link the static FINUFFT library (libfinufft_static)." ON)
option(FINUFFT_BUILD_DEVEL "Whether to build development executables" OFF)
option(FINUFFT_USE_DUCC0 "Whether to use DUCC0 for CPU fft library" ON)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, this has to be OFF by default, since users will need to be eased into this over time, if it turns out to be useful for them..

# sphinx tag (don't remove): @cmake_opts_end

if (FINUFFT_USE_CPU)
Expand All @@ -53,9 +54,21 @@ if (FINUFFT_USE_CPU)
set(FFTW_VERSION 3.3.10)
set(XTL_VERSION 0.7.7)
set(XSIMD_VERSION 13.0.0)
set(DUCC0_VERSION ducc0_0_34_0)
set(FINUFFT_FFTW_LIBRARIES)
include(cmake/setupCPM.cmake)
include(cmake/setupFFTW.cmake)
include(cmake/setupXSIMD.cmake)
if (FINUFFT_USE_DUCC0)
include(cmake/setupDUCC.cmake)
else ()
include(cmake/setupFFTW.cmake)
endif ()
endif ()

if (FINUFFT_USE_DUCC0)
set (FINUFFT_FFTLIBS ducc0)
else ()
set (FINUFFT_FFTLIBS ${FINUFFT_FFTW_LIBRARIES})
endif ()

if (FINUFFT_BUILD_MATLAB)
Expand Down Expand Up @@ -84,7 +97,7 @@ endif ()

# This set of sources is compiled twice, once in single precision and once in double precision
# The single precision compilation is done with -DSINGLE
set(FINUFFT_PRECISION_DEPENDENT_SOURCES src/finufft.cpp src/simpleinterfaces.cpp src/spreadinterp.cpp src/utils.cpp fortran/finufftfort.cpp)
set(FINUFFT_PRECISION_DEPENDENT_SOURCES src/finufft.cpp src/fft.cpp src/simpleinterfaces.cpp src/spreadinterp.cpp src/utils.cpp fortran/finufftfort.cpp)

# Set of compilers which behave like gcc
set(FINUFFT_GNU_LIKE_COMPILERS AppleClang Clang GNU)
Expand All @@ -104,6 +117,11 @@ endfunction()

# Utility function to link static/dynamic lib
function(finufft_link_test target)

if (FINUFFT_USE_DUCC0)
target_compile_definitions(${target} PRIVATE FINUFFT_USE_DUCC0)
endif ()

if (FINUFFT_STATIC_LINKING)
target_link_libraries(${target} PRIVATE finufft_static)
if (FINUFFT_USE_OPENMP)
Expand Down Expand Up @@ -151,37 +169,52 @@ function(set_finufft_options target)
endif ()
endif ()

# FFTW CMAKE file includes the APIs only as an install target, so we need to manually
# include them since we need them for build not for install
# trying to include them directly into the fftw and fftwf targets causes issues with
# the latest version of cmake, so we do it here instead.
if ((NOT FFTW_FOUND) OR (FINUFFT_FFTW_LIBRARIES STREQUAL DOWNLOAD))
list(GET FINUFFT_FFTW_LIBRARIES 0 element)
get_property(FFTW_SOURCE_DIR TARGET ${element} PROPERTY SOURCE_DIR)
set(FFTW_INCLUDE_DIR ${FFTW_SOURCE_DIR}/api)
target_include_directories(${target} PUBLIC ${FFTW_INCLUDE_DIR})
if (FINUFFT_USE_DUCC0)
target_compile_definitions(${target} PRIVATE FINUFFT_USE_DUCC0)
else ()
# FFTW CMAKE file includes the APIs only as an install target, so we need to manually
# include them since we need them for build not for install
# trying to include them directly into the fftw and fftwf targets causes issues with
# the latest version of cmake, so we do it here instead.
if ((NOT FFTW_FOUND) OR (FINUFFT_FFTW_LIBRARIES STREQUAL DOWNLOAD))
list(GET FINUFFT_FFTW_LIBRARIES 0 element)
get_property(FFTW_SOURCE_DIR TARGET ${element} PROPERTY SOURCE_DIR)
set(FFTW_INCLUDE_DIR ${FFTW_SOURCE_DIR}/api)
target_include_directories(${target} PUBLIC ${FFTW_INCLUDE_DIR})
endif ()
endif ()
target_link_libraries(${target} PRIVATE xsimd)
target_link_libraries(${target} PUBLIC ${FINUFFT_FFTW_LIBRARIES})
# target_link_libraries(${target} PRIVATE xsimd)
# target_link_libraries(${target} PUBLIC ${FINUFFT_FFTW_LIBRARIES})
endfunction()

if (FINUFFT_USE_CPU)
# Main finufft libraries
if (FINUFFT_USE_DUCC0)
# okay for now, to be cleaned up
set_finufft_options(ducc0)
endif ()

add_library(finufft_f32 OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES})
target_compile_definitions(finufft_f32 PRIVATE SINGLE)
set_finufft_options(finufft_f32)
target_link_libraries(finufft_f32 PUBLIC ${FINUFFT_FFTLIBS})
target_link_libraries(finufft_f32 PRIVATE xsimd)

add_library(finufft_f64 OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES})
target_compile_definitions(finufft_f64 PRIVATE)
set_finufft_options(finufft_f64)
target_link_libraries(finufft_f64 PUBLIC ${FINUFFT_FFTLIBS})
target_link_libraries(finufft_f64 PRIVATE xsimd)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@DiamonDinoia Marco, the reason windows can not find xsimd seems to be here, this line is moved out of set_finufft_options function. the below if(WIN32) does not add link xsimd.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great thanks for fixing it!

if (WIN32)
add_library(finufft_f32_dll OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES})
target_compile_definitions(finufft_f32_dll PRIVATE SINGLE dll_EXPORTS FINUFFT_DLL)
set_finufft_options(finufft_f32_dll)
target_link_libraries(finufft_f32_dll PUBLIC ${FINUFFT_FFTLIBS})

add_library(finufft_f64_dll OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES})
target_compile_definitions(finufft_f64_dll PRIVATE dll_EXPORTS FINUFFT_DLL)
set_finufft_options(finufft_f64_dll)
target_link_libraries(finufft_f64_dll PUBLIC ${FINUFFT_FFTLIBS})
endif ()

add_library(finufft SHARED src/utils_precindep.cpp contrib/legendre_rule_fast.cpp)
Expand Down
25 changes: 25 additions & 0 deletions cmake/setupDUCC.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
CPMAddPackage(
NAME ducc0
GIT_REPOSITORY https://gitlab.mpcdf.mpg.de/mtr/ducc.git
GIT_TAG ${DUCC0_VERSION}
DOWNLOAD_ONLY YES
)


if(ducc0_ADDED)
add_library(ducc0 STATIC
${ducc0_SOURCE_DIR}/src/ducc0/infra/string_utils.cc
${ducc0_SOURCE_DIR}/src/ducc0/infra/threading.cc
${ducc0_SOURCE_DIR}/src/ducc0/infra/mav.cc
${ducc0_SOURCE_DIR}/src/ducc0/math/gridding_kernel.cc
${ducc0_SOURCE_DIR}/src/ducc0/math/gl_integrator.cc
)
target_include_directories(ducc0 PUBLIC ${ducc0_SOURCE_DIR}/src/)
target_compile_options(ducc0 PRIVATE -ffast-math)
target_compile_features(ducc0 PRIVATE cxx_std_17) # private because we do not want to propagate this requirement

if (NOT OpenMP_CXX_FOUND)
find_package(Threads REQUIRED)
target_link_libraries(ducc0 PRIVATE Threads::Threads)
endif ()
endif ()
1 change: 0 additions & 1 deletion cmake/setupFFTW.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ if (FINUFFT_FFTW_LIBRARIES STREQUAL DEFAULT OR FINUFFT_FFTW_LIBRARIES STREQUAL D
EXCLUDE_FROM_ALL YES
GIT_SHALLOW YES
)

set(FINUFFT_FFTW_LIBRARIES fftw3 fftw3f)
if (FINUFFT_USE_THREADS)
list(APPEND FINUFFT_FFTW_LIBRARIES fftw3_threads fftw3f_threads)
Expand Down
2 changes: 1 addition & 1 deletion devel/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,4 @@ add_executable(foldrescale foldrescale.cpp)
target_link_libraries(foldrescale finufft benchmark xsimd)
add_executable(padding padding.cpp)
target_link_libraries(padding finufft xsimd)
target_compile_options(padding PRIVATE -march=native)
target_compile_options(padding PRIVATE -march=native)
8 changes: 4 additions & 4 deletions include/finufft/defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,9 +186,7 @@
// -------- FINUFFT's plan object, prec-switching version ------------------
// NB: now private (the public C++ or C etc user sees an opaque pointer to it)

// FFTW is needed since we include a FFTW plan in the FINUFFT plan...
#include <finufft/fftw_defs.h> // (must come after complex.h)
// (other FFT lib headers eg MKL could be here...)
#include <finufft/fft.h> // (must come after complex.h)

// group together a bunch of type 3 rescaling/centering/phasing parameters:
#define TYPE3PARAMS FINUFFTIFY(_type3Params)
Expand Down Expand Up @@ -225,7 +223,7 @@ typedef struct FINUFFT_PLAN_S { // the main plan object, fully C++
FLT *phiHat2; // " y-axis.
FLT *phiHat3; // " z-axis.

FFTW_CPX *fwBatch; // (batches of) fine grid(s) for FFTW to plan
CPX *fwBatch; // (batches of) fine grid(s) for FFTW to plan
// & act on. Usually the largest working array

BIGINT *sortIndices; // precomputed NU pt permutation, speeds spread/interp
Expand All @@ -244,7 +242,9 @@ typedef struct FINUFFT_PLAN_S { // the main plan object, fully C++
FINUFFT_PLAN innerT2plan; // ptr used for type 2 in step 2 of type 3

// other internal structs; each is C-compatible of course
#ifndef FINUFFT_USE_DUCC0
FFTW_PLAN fftwPlan;
#endif
finufft_opts opts; // this and spopts could be made ptrs
finufft_spread_opts spopts;

Expand Down
18 changes: 18 additions & 0 deletions include/finufft/fft.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#ifndef FINUFFT_INCLUDE_FINUFFT_FFT_H
#define FINUFFT_INCLUDE_FINUFFT_FFT_H

#ifdef FINUFFT_USE_DUCC0
#include "ducc0/fft/fftnd_impl.h"
#define FFTW_FORGET_WISDOM() // temporary hack since some tests call this unconditionally
#define FFTW_CLEANUP() // temporary hack since some tests call this unconditionally
#define FFTW_CLEANUP_THREADS() // temporary hack since some tests call this
// unconditionally
#else
#include "fftw_defs.h"
#endif
#include <finufft/defs.h>

int *gridsize_for_fft(FINUFFT_PLAN p);
void do_fft(FINUFFT_PLAN p);

#endif // FINUFFT_INCLUDE_FINUFFT_FFT_H
3 changes: 0 additions & 3 deletions include/finufft/test_defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,6 @@
#include <finufft/utils_precindep.h>
// prec-switching (via SINGLE) to set up FLT, CPX, BIGINT, FINUFFT1D1, etc...
#include <finufft/defs.h>
// since "many" (vector) tests need direct access to FFTW commands...
// (although this now happens to be included in defs.h too)
#include <finufft/fftw_defs.h>

// std stuff for tester src
#include <cstdio>
Expand Down
2 changes: 0 additions & 2 deletions include/finufft_eitherprec.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,5 +184,3 @@ FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d3many)(
#undef FINUFFT_PLAN
#undef FINUFFT_PLAN_S
#undef FINUFFT_TYPE3PARAMS
#undef FINUFFT_FFTW_CPX
#undef FINUFFT_FFTW_PLAN
2 changes: 1 addition & 1 deletion makefile
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ SOBJS_PI = src/utils_precindep.o
SOBJSD = $(SOBJS) $(SOBJSF) $(SOBJS_PI)

# double-prec library object files that also need single precision...
OBJS = $(SOBJS) src/finufft.o src/simpleinterfaces.o fortran/finufftfort.o
OBJS = $(SOBJS) src/finufft.o src/simpleinterfaces.o fortran/finufftfort.o src/fft.o
# their single-prec versions
OBJSF = $(OBJS:%.o=%_32.o)
# precision-dependent library object files (compiled & linked only once)...
Expand Down
6 changes: 6 additions & 0 deletions perftest/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,15 @@ set(PERFTESTS guru_timing_test manysmallprobs spreadtestnd spreadtestndall)

foreach(TEST ${PERFTESTS})
add_executable(${TEST} ${TEST}.cpp)
if (FINUFFT_USE_DUCC0)
target_compile_definitions(${TEST} PRIVATE -DFINUFFT_USE_DUCC0)
endif ()
finufft_link_test(${TEST})

add_executable(${TEST}f ${TEST}.cpp)
target_compile_definitions(${TEST}f PRIVATE -DSINGLE)
if (FINUFFT_USE_DUCC0)
target_compile_definitions(${TEST}f PRIVATE -DFINUFFT_USE_DUCC0)
endif ()
finufft_link_test(${TEST}f)
endforeach()
115 changes: 115 additions & 0 deletions src/fft.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#include <algorithm>
#include <finufft/fft.h>

using namespace std;

int *gridsize_for_fft(FINUFFT_PLAN p) {
// local helper func returns a new int array of length dim, extracted from
// the finufft plan, that fftw_plan_many_dft needs as its 2nd argument.
int *nf;
if (p->dim == 1) {
nf = new int[1];
nf[0] = (int)p->nf1;
} else if (p->dim == 2) {
nf = new int[2];
nf[0] = (int)p->nf2;
nf[1] = (int)p->nf1;
} // fftw enforced row major ordering, ie dims are backwards ordered
else {
nf = new int[3];
nf[0] = (int)p->nf3;
nf[1] = (int)p->nf2;
nf[2] = (int)p->nf1;
}
return nf;
}

void do_fft(FINUFFT_PLAN p) {
#ifdef FINUFFT_USE_DUCC0
size_t nthreads = min<size_t>(MY_OMP_GET_MAX_THREADS(), p->opts.nthreads);
int *ns = gridsize_for_fft(p);
vector<size_t> arrdims, axes;
arrdims.push_back(size_t(p->batchSize));
arrdims.push_back(size_t(ns[0]));
axes.push_back(1);
if (p->dim >= 2) {
arrdims.push_back(size_t(ns[1]));
axes.push_back(2);
}
if (p->dim >= 3) {
arrdims.push_back(size_t(ns[2]));
axes.push_back(3);
}
ducc0::vfmav<CPX> data(p->fwBatch, arrdims);
#ifdef FINUFFT_NO_DUCC0_TWEAKS
ducc0::c2c(data, data, axes, p->fftSign < 0, FLT(1), nthreads);
#else
/* For type 1 NUFFTs, only the low-frequency parts of the output fine grid are
going to be used, and for type 2 NUFFTs, the high frequency parts of the
input fine grid are zero by definition. This can be used to reduce the
total FFT work for 2D and 3D NUFFTs. One of the FFT axes always has to be
transformed fully (that's why there is no savings for 1D NUFFTs), for the
second axis we need to do (roughly) a fraction of 1/oversampling_factor
of all 1D FFTs, and for the last remaining axis the factor is
1/oversampling_factor^2. */
if (p->dim == 1) // 1D: no chance for FFT shortcuts
ducc0::c2c(data, data, axes, p->fftSign < 0, FLT(1), nthreads);
else if (p->dim == 2) { // 2D: do partial FFTs
if (p->ms < 2) // something is weird, do standard FFT
ducc0::c2c(data, data, axes, p->fftSign < 0, FLT(1), nthreads);
else {
size_t y_lo = size_t((p->ms + 1) / 2);
size_t y_hi = size_t(ns[1] - p->ms / 2);
auto sub1 = ducc0::subarray(data, {{}, {}, {0, y_lo}});
auto sub2 = ducc0::subarray(data, {{}, {}, {y_hi, ducc0::MAXIDX}});
if (p->type == 1) // spreading, not all parts of the output array are needed
// do axis 2 in full
ducc0::c2c(data, data, {2}, p->fftSign < 0, FLT(1), nthreads);
// do only parts of axis 1
ducc0::c2c(sub1, sub1, {1}, p->fftSign < 0, FLT(1), nthreads);
ducc0::c2c(sub2, sub2, {1}, p->fftSign < 0, FLT(1), nthreads);
if (p->type == 2) // interpolation, parts of the input array are zero
// do axis 2 in full
ducc0::c2c(data, data, {2}, p->fftSign < 0, FLT(1), nthreads);
}
} else { // 3D
if ((p->ms < 2) || (p->mt < 2)) // something is weird, do standard FFT
ducc0::c2c(data, data, axes, p->fftSign < 0, FLT(1), nthreads);
else {
size_t z_lo = size_t((p->ms + 1) / 2);
size_t z_hi = size_t(ns[2] - p->ms / 2);
size_t y_lo = size_t((p->mt + 1) / 2);
size_t y_hi = size_t(ns[1] - p->mt / 2);
auto sub1 = ducc0::subarray(data, {{}, {}, {}, {0, z_lo}});
auto sub2 = ducc0::subarray(data, {{}, {}, {}, {z_hi, ducc0::MAXIDX}});
auto sub3 = ducc0::subarray(sub1, {{}, {}, {0, y_lo}, {}});
auto sub4 = ducc0::subarray(sub1, {{}, {}, {y_hi, ducc0::MAXIDX}, {}});
auto sub5 = ducc0::subarray(sub2, {{}, {}, {0, y_lo}, {}});
auto sub6 = ducc0::subarray(sub2, {{}, {}, {y_hi, ducc0::MAXIDX}, {}});
if (p->type == 1) { // spreading, not all parts of the output array are needed
// do axis 3 in full
ducc0::c2c(data, data, {3}, p->fftSign < 0, FLT(1), nthreads);
// do only parts of axis 2
ducc0::c2c(sub1, sub1, {2}, p->fftSign < 0, FLT(1), nthreads);
ducc0::c2c(sub2, sub2, {2}, p->fftSign < 0, FLT(1), nthreads);
}
// do even smaller parts of axis 1
ducc0::c2c(sub3, sub3, {1}, p->fftSign < 0, FLT(1), nthreads);
ducc0::c2c(sub4, sub4, {1}, p->fftSign < 0, FLT(1), nthreads);
ducc0::c2c(sub5, sub5, {1}, p->fftSign < 0, FLT(1), nthreads);
ducc0::c2c(sub6, sub6, {1}, p->fftSign < 0, FLT(1), nthreads);
if (p->type == 2) { // interpolation, parts of the input array are zero
// do only parts of axis 2
ducc0::c2c(sub1, sub1, {2}, p->fftSign < 0, FLT(1), nthreads);
ducc0::c2c(sub2, sub2, {2}, p->fftSign < 0, FLT(1), nthreads);
// do axis 3 in full
ducc0::c2c(data, data, {3}, p->fftSign < 0, FLT(1), nthreads);
}
}
}
#endif
delete[] ns;
#else
FFTW_EX(p->fftwPlan); // if thisBatchSize<batchSize it wastes some flops
#endif
}
Loading
Loading