Skip to content

Commit

Permalink
add anderson acceleration
Browse files Browse the repository at this point in the history
works great !!
  • Loading branch information
samayala22 committed Feb 25, 2025
1 parent 5079de1 commit 449171f
Show file tree
Hide file tree
Showing 6 changed files with 256 additions and 26 deletions.
43 changes: 22 additions & 21 deletions data/anderson.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np
from scipy.optimize import newton_krylov
import scipy.linalg.lapack as lapack
import time
import plotly.express as px

Expand Down Expand Up @@ -41,7 +42,7 @@ def anderson_acceleration_fast(f, x0, k_max=100, tol_res=1e-6, m=3):
x_new = np.zeros(n)
g_curr = np.zeros(n)
g_new = np.zeros(n)
gamma = np.zeros(m)
gamma = np.zeros(n)
residual_history = []

# Initialization
Expand All @@ -63,12 +64,12 @@ def anderson_acceleration_fast(f, x0, k_max=100, tol_res=1e-6, m=3):
m_k = min(m, k)
# Solve the least–squares problem:
# gamma = argmin || g_curr - Gbuf[:, :m_k]*gamma ||
gamma, _, _, _ = np.linalg.lstsq(Gbuf[:, :m_k], g_curr, rcond=None)
# gamma, _, _, _ = np.linalg.lstsq(Gbuf[:, :m_k], g_curr, rcond=None)
_, gamma, _ = lapack.dgels(Gbuf[:, :m_k], g_curr)
# Compute the update correction:
# The acceleration step uses all stored differences (from both x and g):
# x_new = x_curr + g_curr - (Xbuf + Gbuf) * gamma.
x_new = x_curr + g_curr - (Xbuf[:, :m_k] + Gbuf[:, :m_k]) @ gamma[:m_k]

# Shift
if (m_k == m):
Xbuf[:, :-1] = Xbuf[:, 1:]
Expand All @@ -88,7 +89,7 @@ def anderson_acceleration_fast(f, x0, k_max=100, tol_res=1e-6, m=3):
tickformat='.2e', # Format as exponential with 2 decimal places
exponentformat='e' # Use 'e' notation (alternatives: 'E', 'power', 'SI', 'B')
)
fig.show()
# fig.show()

return x_curr, k

Expand All @@ -108,16 +109,9 @@ def picard(f, x0, k_max=100, tol_res=1e-6):
print("Warning: Picard iteration did not converge.")
return x_curr, k

# def f(x):
# """Fixed-point function mapping R^2 to R^2.
# In this example, we define:
# f₁(x,y) = sin(x) + arctan(y)
# f₂(x,y) = sin(y) + arctan(x)
# so that a fixed point solves:
# x = sin(x) + arctan(y), y = sin(y) + arctan(x)
# """
# return np.array([2.0 * np.sin(x[0]) + np.arctan(x[1]),
# np.sin(x[1]) + 2.0 * np.arctan(x[0])])
def mapping_2d(x):
return np.array([2.0 * np.sin(x[0]) + np.arctan(x[1]),
np.sin(x[1]) + 2.0 * np.arctan(x[0])])

# def f(x):
# return np.array([np.sin(x[0]) + np.arctan(x[0])])
Expand All @@ -136,7 +130,7 @@ def h(x):
def f(x):
return x + delta * (h(x) - x)

return f
return h

def bench(f: callable, func: callable, warmup=10, tries=50):
for i in range(warmup):
Expand Down Expand Up @@ -166,23 +160,30 @@ def main():
# x0 = np.array([1.0])
np.random.seed(42) # For reproducibility

n = 50
x0 = np.random.rand(n)
func = nonlinear_fixed_point(n)
# n = 5
# x0 = np.random.rand(n)
# func = nonlinear_fixed_point(n)

x0 = np.array([1.0, 1.0])
func = mapping_2d

k_max = 1000
tol_res = 1e-6
m = 3
m = 2

def F(x): return func(x)-x
ff = FunctionCounter(F)
fff = FunctionCounter(func)
fixed_point, iterations = anderson_acceleration_fast(fff, x0, k_max, tol_res, m)
print("Anderson Function evaluations:", fff.eval_count)
print(fixed_point)
# print(f"Anderson computed fixed point after {iterations} iterations")
# assert np.allclose(func(fixed_point), fixed_point, atol=tol_res)
assert np.allclose(func(fixed_point), fixed_point, atol=tol_res)

fff.reset()
fixed_point, iterations = picard(fff, x0, k_max, tol_res)
print("Picard Function evaluations:", fff.eval_count)

# fixed_point, iterations = picard(func, x0, k_max, tol_res)
# print(f"Picard computed fixed point after {iterations} iterations")
# assert np.allclose(func(fixed_point), fixed_point, atol=tol_res)

Expand Down
186 changes: 183 additions & 3 deletions tests/hbvlm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
#include "vlm_kinematics.hpp"
#include "vlm_utils.hpp"

// #define PICARD 1

using namespace vlm;
using namespace linalg::ostream_overloads;

Expand Down Expand Up @@ -50,6 +52,7 @@ class HBVLM final: public Simulation {
std::vector<i32> condition0;
Assembly m_assembly;
i32 m_harmonics;

private:
void alloc_buffers();
};
Expand Down Expand Up @@ -143,6 +146,102 @@ void gamma_wake_from_coeffs(
}
}

void anderson_acceleration(
Backend* backend,
const TensorView1D<Location::Device>& x0,
std::function<void(const TensorView1D<Location::Device>& x, const TensorView1D<Location::Device>& y)> f,
i32 max_iter = 100,
f32 tol_res = 1e-6,
i32 m = 3
)
{
i64 n = x0.shape(0);
Tensor2D<Location::Device> _X_buf{backend->memory.get()};
Tensor2D<Location::Device> _G_buf{backend->memory.get()};
Tensor2D<Location::Device> _G_buf_k{backend->memory.get()}; // copy because lsq overwrites
Tensor1D<Location::Device> _x_curr{backend->memory.get()};
Tensor1D<Location::Device> _x_new{backend->memory.get()};
Tensor1D<Location::Device> _g_curr{backend->memory.get()};
Tensor1D<Location::Device> _g_new{backend->memory.get()};
Tensor1D<Location::Device> _gamma{backend->memory.get()};
auto lsq_solver = backend->create_lsq();

_X_buf.init({n, m});
_G_buf.init({n, m});
_G_buf_k.init({n, m});
_x_curr.init({n});
_x_new.init({n});
_g_curr.init({n});
_g_new.init({n});
_gamma.init({n});

auto& X_buf = _X_buf.view();
auto& G_buf = _G_buf.view();
auto& G_buf_k = _G_buf_k.view();
auto& x_curr = _x_curr.view();
auto& x_new = _x_new.view();
auto& g_curr = _g_curr.view();
auto& g_new = _g_new.view();
auto& gamma = _gamma.view();

X_buf.fill(0.f); // not necessary
G_buf.fill(0.f); // not necessary
G_buf_k.fill(0.f); // not necessary

x0.to(x_curr);
f(x_curr, x_new);
x_new.to(g_curr);
backend->blas->axpy(-1.0f, x_curr, g_curr);
g_curr.to(X_buf.slice(All, 0));

x_new.to(x_curr);
f(x_curr, x_new);
x_new.to(g_new);
backend->blas->axpy(-1.0f, x_curr, g_new);
g_new.to(G_buf.slice(All, 0));
backend->blas->axpy(-1.0f, g_curr, G_buf.slice(All, 0));
g_new.to(g_curr);

i32 k = 1;
while (k < max_iter && backend->blas->norm(g_curr) > tol_res) {
i32 m_k = std::min(m, k);
i32 m_bk = std::min(m_k, m-1);
g_curr.to(gamma);
auto G_bufk = G_buf.slice(All, Range{0, m_k});
auto G_bufk_k = G_buf_k.slice(All, Range{0, m_k});
auto X_bufk = X_buf.slice(All, Range{0, m_k});
auto gammak = gamma.slice(Range{0, m_k});

G_bufk.to(G_bufk_k);
lsq_solver->solve(G_bufk_k, gamma.reshape(gamma.shape(0), 1));
x_curr.to(x_new); // i think not necessary
backend->blas->axpy(1.0f, g_curr, x_new);
backend->blas->gemv(-1.0f, X_bufk, gammak, 1.0f, x_new);
backend->blas->gemv(-1.0f, G_bufk, gammak, 1.0f, x_new);

if (m_k == m) {
for (i32 i = 0; i < m-1; i++) {
X_bufk.slice(All, i+1).to(X_bufk.slice(All, i));
G_bufk.slice(All, i+1).to(G_bufk.slice(All, i));
}
}

x_new.to(X_buf.slice(All, m_bk));
backend->blas->axpy(-1.0f, x_curr, X_buf.slice(All, m_bk));
x_new.to(x_curr);
f(x_curr, x_new);
x_new.to(g_new);
backend->blas->axpy(-1.0f, x_curr, g_new);
g_new.to(G_buf.slice(All, m_bk));
backend->blas->axpy(-1.0f, g_curr, G_buf.slice(All, m_bk));
g_new.to(g_curr);
k += 1;
}
x_curr.to(x0);

std::printf("Anderson fixed point converged in %d iterations\n", k);
}

void HBVLM::run(f32 t_start, f32 omega) {
const tiny::ScopedTimer timer("HBVLM::run");
const f32 period = 2.0f * PI_f / omega;
Expand Down Expand Up @@ -219,10 +318,90 @@ void HBVLM::run(f32 t_start, f32 omega) {
dft_hv.to(dft_dv);
}

auto hb_vlm_iter = [&](
const TensorView1D<Location::Device>& _gamma_in,
const TensorView1D<Location::Device>& _gamma_out
){
auto gamma_in = _gamma_in.reshape(lhs.view().shape(0), 2*m_harmonics+1);
auto gamma_out = _gamma_out.reshape(lhs.view().shape(0), 2*m_harmonics+1);

rhs.view().fill(0.f);
// Note this only work on a single wing
auto te_gamma = gamma_in
.reshape(colloc_d.views()[0].shape(0), colloc_d.views()[0].shape(1), gamma_in.shape(1))
.slice(All, -1, All);

// For each unknown we fill their respective rhs column in a matrix free fashion
for (i32 s = 0; s < 2*m_harmonics+1; s++) {
const f32 t_final = t_start + period * (f32)s / (f32)(2*m_harmonics+1);
const i64 t_steps = static_cast<i64>(std::round(t_final / dt));
const f32 t = (f32)t_steps * dt; // t_final rounded to nearest dt

for (i64 m = 0; m < assembly_wings.size(); m++) {
auto transform = m_assembly.surface_kinematics()[m]->transform(t);
transform.store(transforms_h.views()[m].ptr(), transforms_h.views()[m].stride(1));
transforms_h.views()[m].to(transforms.views()[m]);
}

backend->displace_wing(transforms.views(), verts_wing.views(), verts_wing_init.views());
backend->mesh_metrics(0.0f, verts_wing.views(), colloc_d.views(), normals_d.views(), areas_d.views());

// parallel for
for (i64 m = 0; m < assembly_wings.size(); m++) {
const auto transform_node = m_assembly.surface_kinematics()[m];
const auto& colloc_h_m = colloc_h.views()[m];
const auto& velocities_h_m = velocities_h.views()[m];
const auto& velocities_m = velocities.views()[m];
const auto mat_transform_dual = transform_node->transform_dual(t);

for (i64 j = 0; j < colloc_h_m.shape(1); j++) {
for (i64 i = 0; i < colloc_h_m.shape(0); i++) {
auto local_velocity = -transform_node->linear_velocity(mat_transform_dual, {colloc_h_m(i, j, 0), colloc_h_m(i, j, 1), colloc_h_m(i, j, 2)});
velocities_h_m(i, j, 0) = local_velocity.x;
velocities_h_m(i, j, 1) = local_velocity.y;
velocities_h_m(i, j, 2) = local_velocity.z;
}
}
velocities_h_m.to(velocities_m);
}

auto rhs_s = rhs.view().slice(All, s);
backend->rhs_assemble_velocities(rhs_s, normals_d.views(), velocities.views());
gamma_wake_from_coeffs(gamma_wake[s].views()[0], te_gamma, m_harmonics, t, omega, dt, t_steps);
backend->rhs_assemble_wake_influence(rhs_s, gamma_wake[s].views(), colloc_d.views(), normals_d.views(), verts_wake.views(), m_assembly.lifting(), (i32)t_steps);
} // unknowns for loop

// Apply the the dft matrix to the rhs
// A @ G @ D^T = R <=> A @ G = R @ D (since D^-1 = D^T)
backend->blas->gemm(1.0f, rhs.view(), dft_d.view(), 0.0f, gamma_out);
// Solve to obtain the fourier coefficients for each panel
solver->solve(lhs.view(), gamma_out);
};

auto& gamma_coeffs_v = gamma_coeffs.view();
gamma_coeffs_v.fill(0.f);

#ifndef PICARD
anderson_acceleration(backend.get(), gamma_coeffs_v.reshape(gamma_coeffs_v.shape(0)*gamma_coeffs_v.shape(1)), hb_vlm_iter, 100, 1e-3, 3);
#endif

// Tensor1D<Location::Device> test_tensor{backend->memory.get()};
// test_tensor.init({2});
// test_tensor.view()(0) = 1.f;
// test_tensor.view()(1) = 1.f;

// auto mapping_2d = [](const TensorView1D<Location::Device>& x_in, const TensorView1D<Location::Device>& x_out) {
// x_out(0) = 2.f * std::sin(x_in(0)) + std::atan(x_in(1));
// x_out(1) = std::sin(x_in(1)) + 2.f * std::atan(x_in(0));
// };

// anderson_acceleration(backend.get(), test_tensor.view(), mapping_2d, 10, 1e-6, 2);
// std::cout << test_tensor.view()(0) << " " << test_tensor.view()(1) << "\n";

// Iterative process for solving the blocked harmonic balance equation
#ifdef PICARD
auto& residual_v = residual.view();
residual_v.fill(1.f);
gamma_coeffs.view().fill(0.f);
const f32 tol = 1e-3f;
const i32 max_iter = 1000;
i32 iter = 0;
Expand Down Expand Up @@ -290,7 +469,8 @@ void HBVLM::run(f32 t_start, f32 omega) {
}
} // while loop

std::printf("Fixed iteration process converged in %d iterations\n", iter);
std::printf("Picard iterative process converged in %d iterations\n", iter);
#endif

// Compute time domain solution from the obtained fourier coeffs
{
Expand All @@ -314,7 +494,7 @@ void HBVLM::run(f32 t_start, f32 omega) {
}

int main() {
const std::vector<std::string> meshes = {"../../../../mesh/infinite_rectangular_40x10.x"};
const std::vector<std::string> meshes = {"../../../../mesh/infinite_rectangular_20x5.x"};
const std::vector<std::string> backends = {"cpu"};

auto solvers = tiny::make_combination(meshes, backends);
Expand Down
1 change: 1 addition & 0 deletions vlm/backends/cpu/include/vlm_backend_cpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ class BackendCPU final : public Backend {
// std::unique_ptr<Kernels> create_kernels() override;
std::unique_ptr<LU> create_lu_solver() override;
std::unique_ptr<BLAS> create_blas() override;
std::unique_ptr<LSQ> create_lsq() override;
};

} // namespace vlm
35 changes: 34 additions & 1 deletion vlm/backends/cpu/src/vlm_backend_cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,39 @@ void CPU_BLAS::axpy(const f32 alpha, const TensorView<f32, 2, Location::Device>&
}
}

class CPU_LSQ final : public LSQ {
public:
explicit CPU_LSQ(std::unique_ptr<Memory> memory) : LSQ(std::move(memory)) {}
virtual ~CPU_LSQ() = default;

void solve(
const TensorView<f32, 2, Location::Device>& A,
const TensorView<f32, 2, Location::Device>& B
) override;

protected:
std::unique_ptr<Memory> m_memory;
};

void CPU_LSQ::solve(
const TensorView<f32, 2, Location::Device>& A,
const TensorView<f32, 2, Location::Device>& B
) {
CHECK_LAPACK(
LAPACKE_sgels(
LAPACK_COL_MAJOR,
'N',
A.shape(0),
A.shape(1),
B.shape(1),
A.ptr(),
A.stride(1),
B.ptr(),
B.stride(1)
)
);
}

BackendCPU::BackendCPU() : Backend(create_memory_manager(), create_blas()) {
name = "CPU";
print_cpu_info();
Expand All @@ -227,7 +260,7 @@ std::unique_ptr<Memory> BackendCPU::create_memory_manager() { return std::make_u
// std::unique_ptr<Kernels> create_kernels() { return std::make_unique<CPU_Kernels>(); }
std::unique_ptr<LU> BackendCPU::create_lu_solver() { return std::make_unique<CPU_LU>(create_memory_manager()); }
std::unique_ptr<BLAS> BackendCPU::create_blas() { return std::make_unique<CPU_BLAS>(); }

std::unique_ptr<LSQ> BackendCPU::create_lsq() { return std::make_unique<CPU_LSQ>(create_memory_manager()); }

/// @brief Assemble the left hand side matrix
/// @details
Expand Down
Loading

0 comments on commit 449171f

Please sign in to comment.