# %%
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import sys
import time

import numpy as np
import torch

from SbSGenerateSpike2DManyIP import SbSGenerateSpike2D
from SbSDynamicManyIP1 import SbSDynamic1
from MyDataset_MNIST import MyDataset
from learning_helper import low_pass_filter, update_weight_forward, calc_target_h
from NetworkMNISTManyIP import NetworkManyIP

from torch.utils.tensorboard import SummaryWriter

tb = SummaryWriter()


def initial_random_weights(wf_shape, noise_amplitude=0.01):
    # Uniform weights plus a small positive random perturbation, normalized so
    # that the weights converging onto each output neuron sum to 1.
    my_rng = np.random.default_rng()
    weights = np.ones(wf_shape, dtype=np.float32)
    weights += noise_amplitude * my_rng.random(wf_shape, dtype=np.float32)
    weights /= weights.sum(axis=(0, 1, 2), keepdims=True)
    weights = weights.astype(dtype=np.float32)
    return weights
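
# A quick sanity check of the normalization above (illustrative only; the 4D
# shape is a made-up example, not one of the shapes provided by NetworkManyIP):
#
#   w = initial_random_weights((5, 5, 1, 32))
#   assert np.allclose(w.sum(axis=(0, 1, 2)), 1.0, atol=1e-6)
#
# In other words, the weights converging onto each output neuron form a
# probability distribution.
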
# %%
# Parameters:
batch_size = 1000
learning_step = 0
train_layer = int(sys.argv[1])
learning_step_max = 1000

# If -1, use the number of available CPU HT cores
number_of_cpu_processes = -1
if number_of_cpu_processes < 1:
    number_of_cpu_processes = os.cpu_count()

# Random number generator seed
the_seed = 666

number_of_spikes = 1200
cooldown_after_number_of_spikes = 1000

learning_active = True

# Paths for the initial weights, the EpsXY maps, and the other data
weight_path = "./Weights/"
os.makedirs(weight_path, exist_ok=True)
eps_xy_path = "./EpsXY/"
os.makedirs(eps_xy_path, exist_ok=True)
other_path = "./Temp/"
os.makedirs(other_path, exist_ok=True)
data_path = "./Data/"
export_h_path = "./ExportH/"

weight_noise_amplitude = 0.01
eps_xy_initial = 0.1

# Learning parameters:
loss_coeffs_mse = 0.0
loss_coeffs_kldiv = 1.0
learning_rate_threshold_w = 0.0001
learning_rate_gamma_w = 0.1
learning_rate_threshold_eps_xy = 0.00001
learning_rate_gamma_eps_xy = 0.000025
learning_rate_tau = 10

print("Batch-Size requested: " + str(batch_size))
batch_size = (batch_size // number_of_cpu_processes) * number_of_cpu_processes
batch_size = np.max((batch_size, number_of_cpu_processes))
batch_size = int(batch_size)
print("Batch-Size granted: " + str(batch_size))

update_after_x_batches = 1
update_after_x_pattern = batch_size * update_after_x_batches

# We crop the image borders because, during learning,
# a random crop window was moved around.
crop_width_in_pixel = int(2)

# For the H penalty during learning
max_value_init = -1

# %%
# Network parameters:
reduction_cooldown = 25.0
epsilon_t = np.ones((number_of_spikes), dtype=np.float32)
epsilon_t[cooldown_after_number_of_spikes:number_of_spikes] /= reduction_cooldown
epsilon_t = torch.tensor(epsilon_t)
epsilon_0 = 1.0

# %%
# Load the input data
if train_layer == 0:
    the_dataset_train = MyDataset(train=True, path=data_path, path_label=data_path)
else:
    the_dataset_train = MyDataset(
        train=True, path=export_h_path, layer=train_layer - 1, path_label=data_path
    )

my_loader_train = torch.utils.data.DataLoader(
    the_dataset_train, batch_size=batch_size, shuffle=True
)

# %%
# Get information about the network structure:
x_shape = (28 - 2 * crop_width_in_pixel, 28 - 2 * crop_width_in_pixel)
my_network = NetworkManyIP(x_shape)
assert train_layer == len(my_network.wf_shape) - 1

# %%
# Load the forward weights
if learning_step == 0:
    weights = initial_random_weights(
        my_network.wf_shape[train_layer], noise_amplitude=weight_noise_amplitude
    )
    np.save(weight_path + "/Weight_L" + str(train_layer) + "_S0.npy", weights)

wf = np.load(
    weight_path + "/Weight_L" + str(train_layer) + "_S" + str(learning_step) + ".npy"
)

# %%
# Epsilon XY maps
if learning_step == 0:
    fill_value = eps_xy_initial / (
        my_network.wf_shape[train_layer][0] * my_network.wf_shape[train_layer][1]
    )
    eps_xy_temp = np.full(
        my_network.eps_xy_shape[train_layer], fill_value, dtype=np.float32
    )
    np.save(eps_xy_path + "/EpsXY_L" + str(train_layer) + "_S0.npy", eps_xy_temp)

eps_xy = np.load(
    eps_xy_path + "/EpsXY_L" + str(train_layer) + "_S" + str(learning_step) + ".npy"
)

# %%
my_spike_generator = SbSGenerateSpike2D(
    number_of_spikes, seed=the_seed, number_of_cpu_processes=number_of_cpu_processes
)

my_layer = SbSDynamic1(
    input_dimensions=my_network.input_dimensions[train_layer],
    weight_size=my_network.wf_shape[train_layer],
    weights=wf,
    epsilon_0=epsilon_0,
    epsilon_t=epsilon_t,
    epsilon_xy=eps_xy,
    dilation=my_network.dilation[train_layer],
    padding=my_network.padding[train_layer],
    stride=my_network.strides[train_layer],
    number_of_cpu_processes=number_of_cpu_processes,
)

my_layer.reset_h_init_to_uniform()

# %%
if learning_step == 0:
    wf_m = np.zeros_like(wf, np.float64)
else:
    wf_m = np.load(
        weight_path
        + "/Weight_L"
        + str(train_layer)
        + "_S"
        + str(learning_step)
        + "_M.npy"
    )

if learning_step == 0:
    loss_f_lp_m = np.zeros((1), dtype=np.float32)
else:
    loss_f_lp_m = np.load(
        other_path
        + "/LossF_L"
        + str(train_layer)
        + "_S"
        + str(learning_step)
        + "_M.npy"
    )

if learning_step == 0:
    loss_f_lp_max = np.zeros((1), dtype=np.float32)
else:
    loss_f_lp_max = np.load(
        other_path
        + "/LossF_L"
        + str(train_layer)
        + "_S"
        + str(learning_step)
        + "_Max.npy"
    )

d_w = np.zeros_like(wf, np.float64)

# Normalize the loss coefficients so that they sum to 1
sum_scale = 1.0 / (loss_coeffs_kldiv + loss_coeffs_mse)
loss_coeffs_kldiv *= sum_scale
loss_coeffs_mse *= sum_scale

train_loss = 0
train_number_of_processed_pattern = 0

mask = np.ones((my_layer.h_dim), dtype=np.float32)
h_temp = np.zeros((my_layer.h_dim), dtype=np.float32)

with torch.no_grad():
    while True:
        for h_x, h_x_labels in my_loader_train:
            # Run section
            t_start_sub0 = time.perf_counter()

            input_pattern = h_x
            output_h = calc_target_h(
                correct_label=h_x_labels,
                number_of_neurons=my_network.number_of_output_neurons,
            )

            h_temp += torch.sum(output_h, dim=(0, 2, 3)).detach().numpy()
            train_number_of_processed_pattern += output_h.shape[0]

            t_start_sub1 = time.perf_counter()
            timywimy_l = t_start_sub1 - t_start_sub0
            print(
                f"Processed {train_number_of_processed_pattern} of {update_after_x_pattern} Time used: {timywimy_l:.2f}sec"
            )

            # Train section
            # We need a reconstruction of the input from the output
            reconstruction = (
                np.expand_dims(output_h, axis=(1, 2, 3))
                * np.expand_dims(wf, axis=(0, 5, 6))
            ).sum(axis=4)
            reconstruction = np.ascontiguousarray(np.moveaxis(reconstruction, 3, 1))

            input_pattern_fold = np.empty_like(reconstruction, dtype=np.float64)

            # Fold the input
            for X in range(0, my_layer.coordinates_0.shape[1]):
                for Y in range(0, my_layer.coordinates_1.shape[1]):
                    idx_0, idx_1 = np.meshgrid(
                        my_layer.coordinates_0.detach().numpy()[:, X],
                        my_layer.coordinates_1.detach().numpy()[:, Y],
                    )
                    input_pattern_fold[:, :, :, :, X, Y] = input_pattern[
                        :, :, idx_0, idx_1
                    ]
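
            # The branches below compare the folded input p = input_pattern_fold
            # with its reconstruction r and compute both the scalar loss and the
            # error signal d_loss that feeds the weight update:
            #   MSE term: loss = mean((p - r) ** 2),    d_loss = p - r
            #   KL term:  loss = mean(p * log(p / r)),  d_loss = p / r
            # When both coefficients are positive, the two terms are combined
            # with their normalized weights. Non-finite entries (caused by zeros
            # in p or r) are replaced by 0.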
            if (loss_coeffs_mse > 0) and (loss_coeffs_kldiv > 0):
                d_loss = (input_pattern_fold - reconstruction) * loss_coeffs_mse
                loss = (
                    (input_pattern_fold - reconstruction) ** 2
                ).mean() * loss_coeffs_mse

                temp = input_pattern_fold / reconstruction
                temp = np.where(np.isfinite(temp), temp, 0)
                d_loss += temp * loss_coeffs_kldiv

                temp = np.log(temp)
                temp = np.where(np.isfinite(temp), temp, 0)
                loss += np.mean(temp * input_pattern_fold) * loss_coeffs_kldiv

            elif loss_coeffs_mse > 0:
                d_loss = input_pattern_fold - reconstruction
                loss = (
                    (input_pattern_fold - reconstruction) ** 2
                ).mean() * loss_coeffs_mse

            elif loss_coeffs_kldiv > 0:
                temp = input_pattern_fold / reconstruction
                temp = np.where(np.isfinite(temp), temp, 0)
                d_loss = temp

                temp = np.log(temp)
                temp = np.where(np.isfinite(temp), temp, 0)
                loss = np.mean(temp * input_pattern_fold)

            train_loss += loss * output_h.shape[0]

            d_w_temp = (
                (
                    np.expand_dims(d_loss, axis=(4))
                    * np.expand_dims(output_h, axis=(1, 2, 3))
                )
                .sum(axis=-1)
                .sum(axis=-1)
                .sum(axis=0)
            )
            d_w += np.ascontiguousarray(np.moveaxis(d_w_temp, 0, 2))

            t_start_sub2 = time.perf_counter()
            timywimy_l = t_start_sub2 - t_start_sub1
            print(
                f"Learning {train_number_of_processed_pattern} of {update_after_x_pattern} Time used: {timywimy_l:.2f}sec"
            )

            # Enough patterns for an update?
            if train_number_of_processed_pattern >= update_after_x_pattern:
                if max_value_init != -1:
                    h_temp_l = 1.0 / h_temp
                    h_temp_l = np.where(np.isfinite(h_temp_l), h_temp_l, max_value_init)
                    h_temp_l = np.where(
                        h_temp_l < max_value_init, h_temp_l, max_value_init
                    )
                    mask = h_temp_l / h_temp_l.sum()
                    tb.add_histogram(
                        "H Penalty " + str(train_layer), mask, learning_step
                    )
                    h_temp = np.zeros_like(h_temp_l, dtype=np.float32)

                # Low-pass filter the loss value
                train_loss /= train_number_of_processed_pattern
                print(f"({learning_step}) Train Loss: {train_loss}")
                loss_f_lp, loss_f_lp_m = low_pass_filter(
                    loss_f_lp_m, train_loss, learning_rate_tau, learning_step
                )

                # For checking NaNs
                bad_elements = 0

                # Update the weights (with the low-pass-filtered gradient)
                d_w_temp, wf_m = low_pass_filter(
                    wf_m, d_w, learning_rate_tau, learning_step
                )
                wf, bad_temp = update_weight_forward(
                    wf,
                    d_w_temp,
                    loss_f_lp_max,
                    loss_f_lp,
                    learning_rate_gamma_w,
                    learning_rate_threshold_w,
                )
                # Alternative: update with the raw (non-low-pass-filtered) gradient
                # wf, bad_temp = update_weight_forward(
                #     wf,
                #     d_w,
                #     loss_f_lp_max,
                #     loss_f_lp,
                #     learning_rate_gamma_w,
                #     learning_rate_threshold_w,
                # )
                bad_elements += bad_temp

                # Update the maximal loss (used as a scale)
                if loss_f_lp > loss_f_lp_max:
                    loss_f_lp_max = loss_f_lp

                if bad_elements > 0:
                    print("NaN was found. This is bad!!!")
                    exit(1)

                tb.add_scalar("Train Loss", train_loss, learning_step)
                tb.add_scalar("Train LP Loss", loss_f_lp, learning_step)

                train_number_of_processed_pattern = 0
                d_w = np.zeros_like(d_w, np.float64)
                train_loss = 0
                learning_step += 1

                # Save the new values (and the _M data)
                np.save(
                    weight_path
                    + "/Weight_L"
                    + str(train_layer)
                    + "_S"
                    + str(learning_step)
                    + ".npy",
                    wf,
                )
                tb.add_histogram("WF " + str(train_layer), wf, learning_step)

                np.save(
                    weight_path
                    + "/Weight_L"
                    + str(train_layer)
                    + "_S"
                    + str(learning_step)
                    + "_M.npy",
                    wf_m,
                )
                tb.add_histogram("WF_M " + str(train_layer), wf_m, learning_step)

                np.save(
                    other_path
                    + "/LossF_L"
                    + str(train_layer)
                    + "_S"
                    + str(learning_step)
                    + "_M.npy",
                    loss_f_lp_m,
                )
                tb.add_histogram(
                    "LossF_LP_M" + str(train_layer), loss_f_lp_m, learning_step
                )

                np.save(
                    other_path
                    + "/LossF_L"
                    + str(train_layer)
                    + "_S"
                    + str(learning_step)
                    + "_Max.npy",
                    loss_f_lp_max,
                )
                tb.add_histogram(
                    "LossF_LP_Max" + str(train_layer), loss_f_lp_max, learning_step
                )

                np.save(
                    other_path
                    + "/LossF_L"
                    + str(train_layer)
                    + "_S"
                    + str(learning_step)
                    + ".npy",
                    loss_f_lp,
                )
                tb.add_histogram(
                    "LossF_LP" + str(train_layer), loss_f_lp, learning_step
                )

                tb.flush()

                my_layer.set_weights(wf)

                if learning_step == learning_step_max:
                    tb.close()
                    exit(1)

tb.close()
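
# Notes:
# - The TensorBoard SummaryWriter above logs to ./runs/ by default; the scalars
#   and histograms can be inspected with: tensorboard --logdir runs
# - Invocation (script name assumed, adapt to the actual file name):
#     python <this_script>.py <train_layer>
#   where <train_layer> must be the index of the last layer in
#   NetworkManyIP's wf_shape (see the assert above).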