add anderson acceleration

works great !!
samayala22 · Feb 25, 2025 · 449171f · 449171f
1 parent 5079de1
commit 449171f
Show file tree

Hide file tree

Showing 6 changed files with 256 additions and 26 deletions.
diff --git a/data/anderson.py b/data/anderson.py
@@ -1,5 +1,6 @@
 import numpy as np
 from scipy.optimize import newton_krylov
+import scipy.linalg.lapack as lapack
 import time
 import plotly.express as px
 
@@ -41,7 +42,7 @@ def anderson_acceleration_fast(f, x0, k_max=100, tol_res=1e-6, m=3):
     x_new = np.zeros(n)
     g_curr = np.zeros(n)
     g_new = np.zeros(n)
-    gamma = np.zeros(m)
+    gamma = np.zeros(n)
     residual_history = []
 
     # Initialization
@@ -63,12 +64,12 @@ def anderson_acceleration_fast(f, x0, k_max=100, tol_res=1e-6, m=3):
         m_k = min(m, k)
         # Solve the least–squares problem:
         #     gamma = argmin || g_curr - Gbuf[:, :m_k]*gamma ||
-        gamma, _, _, _ = np.linalg.lstsq(Gbuf[:, :m_k], g_curr, rcond=None)
+        # gamma, _, _, _ = np.linalg.lstsq(Gbuf[:, :m_k], g_curr, rcond=None)
+        _, gamma, _ = lapack.dgels(Gbuf[:, :m_k], g_curr)
         # Compute the update correction:
         # The acceleration step uses all stored differences (from both x and g):
         #    x_new = x_curr + g_curr - (Xbuf + Gbuf) * gamma.
         x_new = x_curr + g_curr - (Xbuf[:, :m_k] + Gbuf[:, :m_k]) @ gamma[:m_k]
-
         # Shift
         if (m_k == m):
             Xbuf[:, :-1] = Xbuf[:, 1:]
@@ -88,7 +89,7 @@ def anderson_acceleration_fast(f, x0, k_max=100, tol_res=1e-6, m=3):
         tickformat='.2e',  # Format as exponential with 2 decimal places
         exponentformat='e'  # Use 'e' notation (alternatives: 'E', 'power', 'SI', 'B')
     )
-    fig.show()
+    # fig.show()
 
     return x_curr, k
 
@@ -108,16 +109,9 @@ def picard(f, x0, k_max=100, tol_res=1e-6):
             print("Warning: Picard iteration did not converge.")
     return x_curr, k
 
-# def f(x):
-#     """Fixed-point function mapping R^2 to R^2.
-#     In this example, we define:
-#     f₁(x,y) = sin(x) + arctan(y)
-#     f₂(x,y) = sin(y) + arctan(x)
-#     so that a fixed point solves:
-#     x = sin(x) + arctan(y),  y = sin(y) + arctan(x)
-#     """
-#     return np.array([2.0 * np.sin(x[0]) + np.arctan(x[1]),
-#                      np.sin(x[1]) + 2.0 * np.arctan(x[0])])
+def mapping_2d(x):
+    return np.array([2.0 * np.sin(x[0]) + np.arctan(x[1]),
+                     np.sin(x[1]) + 2.0 * np.arctan(x[0])])
 
 # def f(x):
 #     return np.array([np.sin(x[0]) + np.arctan(x[0])])
@@ -136,7 +130,7 @@ def h(x):
     def f(x):
         return x + delta * (h(x) - x)
 
-    return f
+    return h
 
 def bench(f: callable, func: callable, warmup=10, tries=50):
     for i in range(warmup):
@@ -166,23 +160,30 @@ def main():
     # x0 = np.array([1.0])
     np.random.seed(42)  # For reproducibility
 
-    n = 50
-    x0 = np.random.rand(n)
-    func = nonlinear_fixed_point(n)
+    # n = 5
+    # x0 = np.random.rand(n)
+    # func = nonlinear_fixed_point(n)
+
+    x0 = np.array([1.0, 1.0])
+    func = mapping_2d
 
     k_max = 1000
     tol_res = 1e-6
-    m = 3
+    m = 2
 
     def F(x): return func(x)-x
     ff = FunctionCounter(F)
     fff = FunctionCounter(func)
     fixed_point, iterations = anderson_acceleration_fast(fff, x0, k_max, tol_res, m)
     print("Anderson Function evaluations:", fff.eval_count)
+    print(fixed_point)
     # print(f"Anderson computed fixed point after {iterations} iterations")
-    # assert np.allclose(func(fixed_point), fixed_point, atol=tol_res)
+    assert np.allclose(func(fixed_point), fixed_point, atol=tol_res)
+
+    fff.reset()
+    fixed_point, iterations = picard(fff, x0, k_max, tol_res)
+    print("Picard Function evaluations:", fff.eval_count)
 
-    # fixed_point, iterations = picard(func, x0, k_max, tol_res)
     # print(f"Picard computed fixed point after {iterations} iterations")
     # assert np.allclose(func(fixed_point), fixed_point, atol=tol_res)
 

diff --git a/tests/hbvlm.cpp b/tests/hbvlm.cpp
@@ -13,6 +13,8 @@
 #include "vlm_kinematics.hpp"
 #include "vlm_utils.hpp"
 
+// #define PICARD 1
+
 using namespace vlm;
 using namespace linalg::ostream_overloads;
 
@@ -50,6 +52,7 @@ class HBVLM final: public Simulation {
         std::vector<i32> condition0;
         Assembly m_assembly;
         i32 m_harmonics;
+
     private:
         void alloc_buffers();
 };
@@ -143,6 +146,102 @@ void gamma_wake_from_coeffs(
     }
 }
 
+void anderson_acceleration(
+    Backend* backend,
+    const TensorView1D<Location::Device>& x0,
+    std::function<void(const TensorView1D<Location::Device>& x, const TensorView1D<Location::Device>& y)> f,
+    i32 max_iter = 100,
+    f32 tol_res = 1e-6,
+    i32 m = 3
+)
+{
+    i64 n = x0.shape(0);
+    Tensor2D<Location::Device> _X_buf{backend->memory.get()};
+    Tensor2D<Location::Device> _G_buf{backend->memory.get()};
+    Tensor2D<Location::Device> _G_buf_k{backend->memory.get()}; // copy because lsq overwrites
+    Tensor1D<Location::Device> _x_curr{backend->memory.get()};
+    Tensor1D<Location::Device> _x_new{backend->memory.get()};
+    Tensor1D<Location::Device> _g_curr{backend->memory.get()};
+    Tensor1D<Location::Device> _g_new{backend->memory.get()};
+    Tensor1D<Location::Device> _gamma{backend->memory.get()};
+    auto lsq_solver = backend->create_lsq();
+
+    _X_buf.init({n, m});
+    _G_buf.init({n, m});
+    _G_buf_k.init({n, m});
+    _x_curr.init({n});
+    _x_new.init({n});
+    _g_curr.init({n});
+    _g_new.init({n});
+    _gamma.init({n});
+
+    auto& X_buf = _X_buf.view();
+    auto& G_buf = _G_buf.view();
+    auto& G_buf_k = _G_buf_k.view();
+    auto& x_curr = _x_curr.view();
+    auto& x_new = _x_new.view();
+    auto& g_curr = _g_curr.view();
+    auto& g_new = _g_new.view();
+    auto& gamma = _gamma.view();
+
+    X_buf.fill(0.f); // not necessary
+    G_buf.fill(0.f); // not necessary
+    G_buf_k.fill(0.f); // not necessary
+
+    x0.to(x_curr);
+    f(x_curr, x_new);
+    x_new.to(g_curr);
+    backend->blas->axpy(-1.0f, x_curr, g_curr);
+    g_curr.to(X_buf.slice(All, 0));
+
+    x_new.to(x_curr);
+    f(x_curr, x_new);
+    x_new.to(g_new);
+    backend->blas->axpy(-1.0f, x_curr, g_new);
+    g_new.to(G_buf.slice(All, 0));
+    backend->blas->axpy(-1.0f, g_curr, G_buf.slice(All, 0));
+    g_new.to(g_curr);
+
+    i32 k = 1;
+    while (k < max_iter && backend->blas->norm(g_curr) > tol_res) {
+        i32 m_k = std::min(m, k);
+        i32 m_bk = std::min(m_k, m-1);
+        g_curr.to(gamma);
+        auto G_bufk = G_buf.slice(All, Range{0, m_k});
+        auto G_bufk_k = G_buf_k.slice(All, Range{0, m_k});
+        auto X_bufk = X_buf.slice(All, Range{0, m_k});
+        auto gammak = gamma.slice(Range{0, m_k});
+
+        G_bufk.to(G_bufk_k);
+        lsq_solver->solve(G_bufk_k, gamma.reshape(gamma.shape(0), 1));
+        x_curr.to(x_new); // i think not necessary
+        backend->blas->axpy(1.0f, g_curr, x_new);
+        backend->blas->gemv(-1.0f, X_bufk, gammak, 1.0f, x_new);
+        backend->blas->gemv(-1.0f, G_bufk, gammak, 1.0f, x_new);
+
+        if (m_k == m) {
+            for (i32 i = 0; i < m-1; i++) {
+                X_bufk.slice(All, i+1).to(X_bufk.slice(All, i));
+                G_bufk.slice(All, i+1).to(G_bufk.slice(All, i));
+            }
+        }
+
+        x_new.to(X_buf.slice(All, m_bk));
+        backend->blas->axpy(-1.0f, x_curr, X_buf.slice(All, m_bk));
+        x_new.to(x_curr);
+        f(x_curr, x_new);
+        x_new.to(g_new);
+        backend->blas->axpy(-1.0f, x_curr, g_new);
+        g_new.to(G_buf.slice(All, m_bk));
+        backend->blas->axpy(-1.0f, g_curr, G_buf.slice(All, m_bk));
+        g_new.to(g_curr);
+        k += 1;
+    }
+    x_curr.to(x0);
+
+    std::printf("Anderson fixed point converged in %d iterations\n", k);
+}
+
 void HBVLM::run(f32 t_start, f32 omega) {
     const tiny::ScopedTimer timer("HBVLM::run");
     const f32 period = 2.0f * PI_f / omega;
@@ -219,10 +318,90 @@ void HBVLM::run(f32 t_start, f32 omega) {
         dft_hv.to(dft_dv);
     }
 
+    auto hb_vlm_iter = [&](
+        const TensorView1D<Location::Device>& _gamma_in, 
+        const TensorView1D<Location::Device>& _gamma_out
+    ){
+        auto gamma_in  = _gamma_in.reshape(lhs.view().shape(0), 2*m_harmonics+1);
+        auto gamma_out = _gamma_out.reshape(lhs.view().shape(0), 2*m_harmonics+1);
+
+        rhs.view().fill(0.f);
+        // Note this only work on a single wing
+        auto te_gamma = gamma_in
+            .reshape(colloc_d.views()[0].shape(0), colloc_d.views()[0].shape(1), gamma_in.shape(1))
+            .slice(All, -1, All);
+
+        // For each unknown we fill their respective rhs column in a matrix free fashion
+        for (i32 s = 0; s < 2*m_harmonics+1; s++) {
+            const f32 t_final = t_start + period * (f32)s / (f32)(2*m_harmonics+1);
+            const i64 t_steps = static_cast<i64>(std::round(t_final / dt));
+            const f32 t = (f32)t_steps * dt; // t_final rounded to nearest dt
+
+            for (i64 m = 0; m < assembly_wings.size(); m++) {
+                auto transform = m_assembly.surface_kinematics()[m]->transform(t);
+                transform.store(transforms_h.views()[m].ptr(), transforms_h.views()[m].stride(1));
+                transforms_h.views()[m].to(transforms.views()[m]);
+            }
+
+            backend->displace_wing(transforms.views(), verts_wing.views(), verts_wing_init.views());
+            backend->mesh_metrics(0.0f, verts_wing.views(), colloc_d.views(), normals_d.views(), areas_d.views());
+
+            // parallel for
+            for (i64 m = 0; m < assembly_wings.size(); m++) {
+                const auto transform_node = m_assembly.surface_kinematics()[m];
+                const auto& colloc_h_m = colloc_h.views()[m];
+                const auto& velocities_h_m = velocities_h.views()[m];
+                const auto& velocities_m = velocities.views()[m];
+                const auto mat_transform_dual = transform_node->transform_dual(t);
+
+                for (i64 j = 0; j < colloc_h_m.shape(1); j++) {
+                    for (i64 i = 0; i < colloc_h_m.shape(0); i++) {
+                        auto local_velocity = -transform_node->linear_velocity(mat_transform_dual, {colloc_h_m(i, j, 0), colloc_h_m(i, j, 1), colloc_h_m(i, j, 2)});
+                        velocities_h_m(i, j, 0) = local_velocity.x;
+                        velocities_h_m(i, j, 1) = local_velocity.y;
+                        velocities_h_m(i, j, 2) = local_velocity.z;
+                    }
+                }
+                velocities_h_m.to(velocities_m);
+            }
+
+            auto rhs_s = rhs.view().slice(All, s);
+            backend->rhs_assemble_velocities(rhs_s, normals_d.views(), velocities.views());
+            gamma_wake_from_coeffs(gamma_wake[s].views()[0], te_gamma, m_harmonics, t, omega, dt, t_steps);
+            backend->rhs_assemble_wake_influence(rhs_s, gamma_wake[s].views(), colloc_d.views(), normals_d.views(), verts_wake.views(), m_assembly.lifting(), (i32)t_steps);
+        } // unknowns for loop
+
+        // Apply the the dft matrix to the rhs
+        // A @ G @ D^T = R <=> A @ G = R @ D (since D^-1 = D^T)
+        backend->blas->gemm(1.0f, rhs.view(), dft_d.view(), 0.0f, gamma_out);
+        // Solve to obtain the fourier coefficients for each panel
+        solver->solve(lhs.view(), gamma_out);
+    };
+
+    auto& gamma_coeffs_v = gamma_coeffs.view();
+    gamma_coeffs_v.fill(0.f);
+
+    #ifndef PICARD
+    anderson_acceleration(backend.get(), gamma_coeffs_v.reshape(gamma_coeffs_v.shape(0)*gamma_coeffs_v.shape(1)), hb_vlm_iter, 100, 1e-3, 3);
+    #endif
+
+    // Tensor1D<Location::Device> test_tensor{backend->memory.get()};
+    // test_tensor.init({2});
+    // test_tensor.view()(0) = 1.f;
+    // test_tensor.view()(1) = 1.f;
+
+    // auto mapping_2d = [](const TensorView1D<Location::Device>& x_in, const TensorView1D<Location::Device>& x_out) {
+    //     x_out(0) = 2.f * std::sin(x_in(0)) + std::atan(x_in(1));
+    //     x_out(1) = std::sin(x_in(1)) + 2.f * std::atan(x_in(0));
+    // };
+
+    // anderson_acceleration(backend.get(), test_tensor.view(), mapping_2d, 10, 1e-6, 2);
+    // std::cout << test_tensor.view()(0) << " " << test_tensor.view()(1) << "\n";
+
     // Iterative process for solving the blocked harmonic balance equation
+    #ifdef PICARD
     auto& residual_v = residual.view();
     residual_v.fill(1.f);
-    gamma_coeffs.view().fill(0.f);
     const f32 tol = 1e-3f;
     const i32 max_iter = 1000;
     i32 iter = 0;
@@ -290,7 +469,8 @@ void HBVLM::run(f32 t_start, f32 omega) {
         }
     } // while loop
 
-    std::printf("Fixed iteration process converged in %d iterations\n", iter);
+    std::printf("Picard iterative process converged in %d iterations\n", iter);
+    #endif
 
     // Compute time domain solution from the obtained fourier coeffs
     {
@@ -314,7 +494,7 @@ void HBVLM::run(f32 t_start, f32 omega) {
 }
 
 int main() {
-    const std::vector<std::string> meshes = {"../../../../mesh/infinite_rectangular_40x10.x"};
+    const std::vector<std::string> meshes = {"../../../../mesh/infinite_rectangular_20x5.x"};
     const std::vector<std::string> backends = {"cpu"};
 
     auto solvers = tiny::make_combination(meshes, backends);

diff --git a/vlm/backends/cpu/include/vlm_backend_cpu.hpp b/vlm/backends/cpu/include/vlm_backend_cpu.hpp
@@ -56,6 +56,7 @@ class BackendCPU final : public Backend {
         // std::unique_ptr<Kernels> create_kernels() override;
         std::unique_ptr<LU> create_lu_solver() override;
         std::unique_ptr<BLAS> create_blas() override;
+        std::unique_ptr<LSQ> create_lsq() override;
 };
 
 } // namespace vlm
diff --git a/vlm/backends/cpu/src/vlm_backend_cpu.cpp b/vlm/backends/cpu/src/vlm_backend_cpu.cpp
@@ -216,6 +216,39 @@ void CPU_BLAS::axpy(const f32 alpha, const TensorView<f32, 2, Location::Device>&
     }
 }
 
+class CPU_LSQ final : public LSQ {
+    public:
+        explicit CPU_LSQ(std::unique_ptr<Memory> memory) : LSQ(std::move(memory)) {}
+        virtual ~CPU_LSQ() = default;
+
+        void solve(
+            const TensorView<f32, 2, Location::Device>& A,
+            const TensorView<f32, 2, Location::Device>& B
+        ) override;
+
+    protected:
+        std::unique_ptr<Memory> m_memory;
+};
+
+void CPU_LSQ::solve(
+    const TensorView<f32, 2, Location::Device>& A,
+    const TensorView<f32, 2, Location::Device>& B
+) {
+    CHECK_LAPACK(
+        LAPACKE_sgels(
+            LAPACK_COL_MAJOR,
+            'N',
+            A.shape(0),
+            A.shape(1),
+            B.shape(1),
+            A.ptr(),
+            A.stride(1),
+            B.ptr(),
+            B.stride(1)
+        )
+    );
+}
+
 BackendCPU::BackendCPU() : Backend(create_memory_manager(), create_blas()) {
     name = "CPU";
     print_cpu_info();
@@ -227,7 +260,7 @@ std::unique_ptr<Memory> BackendCPU::create_memory_manager() { return std::make_u
 // std::unique_ptr<Kernels> create_kernels() { return std::make_unique<CPU_Kernels>(); }
 std::unique_ptr<LU> BackendCPU::create_lu_solver() { return std::make_unique<CPU_LU>(create_memory_manager()); }
 std::unique_ptr<BLAS> BackendCPU::create_blas() { return std::make_unique<CPU_BLAS>(); }
-
+std::unique_ptr<LSQ> BackendCPU::create_lsq() { return std::make_unique<CPU_LSQ>(create_memory_manager()); }
 
 /// @brief Assemble the left hand side matrix
 /// @details