From 153d55e4af0537e698ed26a84dede6963726fc5a Mon Sep 17 00:00:00 2001
From: Bruno Stegani <bruno.stega8@gmail.com>
Date: Fri, 29 Nov 2024 01:18:44 +0100
Subject: [PATCH 1/5] new make mat: --mode as cmdata

---
 tools/make_mat/make_mat.py | 882 ++++++++++++++-----------------------
 1 file changed, 342 insertions(+), 540 deletions(-)

diff --git a/tools/make_mat/make_mat.py b/tools/make_mat/make_mat.py
index 4c1c7056..e9c38fdd 100644
--- a/tools/make_mat/make_mat.py
+++ b/tools/make_mat/make_mat.py
@@ -51,81 +51,19 @@ def read_mat(name, protein_ref_indices, args, cumulative=False):
 
     return ref_df
 
-
-def run_intra_(arguments):
+def zero_probability_decorator(flag):
     """
-    Preforms the main routine of the histogram analysis to obtain the intra- and intermat files.
-    Is used in combination with multiprocessing to speed up the calculations.
-
-    Parameters
-    ----------
-    arguments : dict
-        Contains all the command-line parsed arguments
-
-    Returns
-    -------
-    out_path : str
-        Path to the temporary file which contains a partial pd.DataFrame with the analyzed data
+    Decorator of function to return 0 if flag is rased
     """
-    (
-        args,
-        protein_ref_indices_i,
-        protein_ref_indices_j,
-        original_size_j,
-        c12_cutoff,
-        mi,
-        mj,
-        frac_target_list,
-    ) = arguments
-    process = multiprocessing.current_process()
-    df = pd.DataFrame()
-    # We do not consider old histograms
-    frac_target_list = [x for x in frac_target_list if x[0] != "#" and x[-1] != "#"]
-    for i, ref_f in enumerate(frac_target_list):
-        results_df = pd.DataFrame()
-        ai = ref_f.split(".")[-2].split("_")[-1]
-
-        if True:
-            all_ai = [ai for _ in range(1, original_size_j + 1)]
-            range_list = [str(x) for x in range(1, original_size_j + 1)]
+    def decorator(func):
+        def wrapper(*args, **kwargs):
+            if flag:
+                return 0  # Return 0 if the flag is set
+            return func(*args, **kwargs)  # Otherwise, execute the original function
+        return wrapper
+    return decorator
 
-            results_df["ai"] = np.array(all_ai).astype(int)
-            results_df["aj"] = np.array(range_list).astype(int)
-
-            results_df["mi"] = mi
-            results_df["mj"] = mj
-            results_df["c12dist"] = 0.0
-            results_df["p"] = 0.0
-            results_df["cutoff"] = 0.0
-
-        if np.isin(int(ai), protein_ref_indices_i):
-            cut_i = np.where(protein_ref_indices_i == int(ai))[0][0]
-
-            # column mapping
-            ref_df = read_mat(ref_f, protein_ref_indices_j, args)
-            ref_df.loc[len(ref_df)] = c12_cutoff[cut_i]
-
-            # calculate data
-            c12dist = ref_df.apply(lambda x: c12_avg(ref_df.index.to_numpy(), weights=x.to_numpy()), axis=0).values
-            p = ref_df.apply(
-                lambda x: calculate_probability(ref_df.index.to_numpy(), weights=x.to_numpy()),
-                axis=0,
-            ).values
-            results_df.loc[results_df["aj"].isin(protein_ref_indices_j), "c12dist"] = c12dist
-            results_df.loc[results_df["aj"].isin(protein_ref_indices_j), "p"] = p
-            results_df.loc[results_df["aj"].isin(protein_ref_indices_j), "cutoff"] = c12_cutoff[cut_i].astype(float)
-
-        df = pd.concat([df, results_df])
-        df = df.sort_values(by=["p", "c12dist"], ascending=True)
-
-    df.fillna(0)
-    out_path = f"mat_{process.pid}_t{time.time()}.part"
-    df.to_csv(out_path, index=False)
-
-    return out_path
-
-
-def run_inter_(arguments):
+def run_mat_(arguments):
     """
     Preforms the main routine of the histogram analysis to obtain the intra- and intermat files.
     Is used in combination with multiprocessing to speed up the calculations.
@@ -149,7 +87,9 @@ def run_inter_(arguments):
         mi,
         mj,
         frac_target_list,
+        mat_type,
     ) = arguments
+
     process = multiprocessing.current_process()
     df = pd.DataFrame()
     # We do not consider old histograms
@@ -178,24 +118,33 @@ def run_inter_(arguments):
             ref_df = read_mat(ref_f, protein_ref_indices_j, args)
             ref_df.loc[len(ref_df)] = c12_cutoff[cut_i]
 
-            # repeat for cumulative
-            c_ref_f = ref_f.replace("inter_mol_", "inter_mol_c_")
-            c_ref_df = read_mat(c_ref_f, protein_ref_indices_j, args, True)
-            c_ref_df.loc[len(c_ref_df)] = c12_cutoff[cut_i]
-
             # calculate data
-            c12dist = ref_df.apply(lambda x: c12_avg(ref_df.index.to_numpy(), weights=x.to_numpy()), axis=0).values
-            p = c_ref_df.apply(
-                lambda x: get_cumulative_probability(c_ref_df.index.to_numpy(), weights=x.to_numpy()),
-                axis=0,
-            ).values
+            c12_avg_ = zero_probability_decorator( args.zero)(c12_avg)
+            calculate_probability_ = zero_probability_decorator( args.zero)(calculate_probability)
+            get_cumulative_probability_ = zero_probability_decorator( args.zero)(get_cumulative_probability)
+           
+            c12dist = ref_df.apply(lambda x: c12_avg_(ref_df.index.to_numpy(), weights=x.to_numpy()), axis=0).values
+            if mat_type == "intra":
+                p = ref_df.apply(
+                    lambda x: calculate_probability_(ref_df.index.to_numpy(), weights=x.to_numpy()),
+                    axis=0,
+                ).values
+
+            if mat_type == "inter":
+                # repeat for cumulative
+                c_ref_f = ref_f.replace("inter_mol_", "inter_mol_c_")
+                c_ref_df = read_mat(c_ref_f, protein_ref_indices_j, args, True)
+                c_ref_df.loc[len(c_ref_df)] = c12_cutoff[cut_i]
+                p = c_ref_df.apply(
+                    lambda x: get_cumulative_probability_(c_ref_df.index.to_numpy(), weights=x.to_numpy()),
+                    axis=0,
+                ).values
 
             results_df.loc[results_df["aj"].isin(protein_ref_indices_j), "c12dist"] = c12dist
             results_df.loc[results_df["aj"].isin(protein_ref_indices_j), "p"] = p
             results_df.loc[results_df["aj"].isin(protein_ref_indices_j), "cutoff"] = c12_cutoff[cut_i].astype(float)
 
         df = pd.concat([df, results_df])
-
         df = df.sort_values(by=["p", "c12dist"], ascending=True)
 
     df.fillna(0)
@@ -205,6 +154,7 @@ def run_inter_(arguments):
     return out_path
 
 
+# TODO add intra or remove this and use resdata?
 def run_residue_inter_(arguments):
     """
     Preforms the main routine of the histogram analysis to obtain the intra- and intermat files.
@@ -372,7 +322,7 @@ def map_if_exists(atom_name):
     else:
         return atom_name
 
-
+# TODO is it needed anymore?
 def hallfunction(values, weights):
     """
     A substitute to the allfunction without considering the cutoff.
@@ -469,6 +419,27 @@ def zero_callback(values, weights):
     """
     return None, None, np.sum(weights), values, weights
 
+def calculate_probability(values, weights, callback=allfunction):
+    """
+    Calculates a plain probability accoring to \sum_x x * dx
+
+    Parameters
+    ----------
+    values : np.array
+        The array of the histograms x values
+    weights : np.array
+        The array with the respective weights
+    callback : function
+        Preprocesses the data before going in to the analysis
+
+    Returns
+    -------
+    The probability of the histogram
+    """
+    dx = values[1] - values[0]
+    cutoff, i, norm, v, w = callback(values, weights)
+    return np.minimum(np.sum(w * dx), 1)
+
 
 def get_cumulative_probability(values, weights, callback=allfunction):
     cutoff, i, norm, v, w = callback(values, weights)
@@ -564,194 +535,7 @@ def generate_c12_values(df, types, combinations, molecule_type):
     return c12_map
 
 
-def calculate_intra_probabilities(args):
-    """
-    Starts the main routine for calculating the intramat by:
-     - reading the topologies
-     - calculating the cutoffs
-     - and caclulating the probabilities
-    The operation is finalized by writing out a csv with the name pattern intramat<_name>_{mol_i}_{mol_j}.ndx
-
-    Parameters
-    ----------
-    args : dict
-        The command-line parsed parameters
-    """
-    (
-        topology_mego,
-        topology_ref,
-        N_molecules,
-        molecules_name,
-        mol_list,
-    ) = read_topologies(args.mego_top, args.target_top)
-
-    print(
-        f"""
-    Topology contains {N_molecules} molecules species. Namely {molecules_name}.
-    Calculating intramat for all species
-    """
-    )
-    for i in range(N_molecules):
-        print(f"\n Calculating intramat for molecule {mol_list[i]}: {molecules_name[i]}")
-        df = pd.DataFrame()
-        topology_df = pd.DataFrame()
-
-        prefix = f"intra_mol_{mol_list[i]}_{mol_list[i]}"
-        if args.tar:
-            with tarfile.open(args.histo, "r:*") as tar:
-                target_list = [x.name for x in tar.getmembers() if prefix in x.name]
-        else:
-            target_list = [x for x in os.listdir(args.histo) if prefix in x]
-
-        protein_mego = topology_mego.molecules[list(topology_mego.molecules.keys())[i]][0]
-        protein_ref = topology_ref.molecules[list(topology_ref.molecules.keys())[i]][0]
-        original_size = len(protein_ref.atoms)
-        protein_ref_indices = np.array([i + 1 for i in range(len(protein_ref.atoms)) if protein_ref[i].element_name != "H"])
-        protein_ref = [a for a in protein_ref.atoms if a.element_name != "H"]
-
-        sorter = [str(x.residue.number) + map_if_exists(x.name) for x in protein_ref]
-        sorter_mego = [str(x.residue.number) + x.name for x in protein_mego]
-
-        topology_df["ref_ai"] = protein_ref_indices
-        topology_df["ref_type"] = [a.name for a in protein_ref]
-        topology_df["resname"] = [a.residue.name for a in protein_ref]
-        topology_df["resnum"] = [a.residue.idx for a in protein_ref]
-        topology_df["sorter"] = sorter
-        topology_df["ref_ri"] = topology_df["sorter"].str.replace("[a-zA-Z]+[0-9]*", "", regex=True).astype(int)
-        topology_df.sort_values(by="sorter", inplace=True)
-        topology_df["mego_ai"] = [a[0].idx for a in sorted(zip(protein_mego, sorter_mego), key=lambda x: x[1])]
-        topology_df["mego_type"] = [a[0].type for a in sorted(zip(protein_mego, sorter_mego), key=lambda x: x[1])]
-        topology_df["mego_name"] = [a[0].name for a in sorted(zip(protein_mego, sorter_mego), key=lambda x: x[1])]
-        topology_df["name"] = topology_df["mego_name"]
-        topology_df["type"] = topology_df["mego_type"]
-        # need to sort back otherwise c12_cutoff are all wrong
-        topology_df.sort_values(by="ref_ai", inplace=True)
-        if args.custom_c12 is not None:
-            custom_c12_dict = io.read_custom_c12_parameters(args.custom_c12)
-            d_appo = {key: val for key, val in zip(custom_c12_dict.name, custom_c12_dict.c12)}
-            d.update(d_appo)
-
-        topology_df["c12"] = topology_df["mego_type"].map(d)
-        first_aminoacid = topology_mego.residues[0].name
-        if first_aminoacid in type_definitions.aminoacids_list:
-            molecule_type = "protein"
-        elif first_aminoacid in type_definitions.nucleic_acid_list:
-            molecule_type = "nucleic_acid"
-        else:
-            molecule_type = "other"
-
-        types = type_definitions.lj14_generator(topology_df)
-
-        if molecule_type == "other":
-            # read user pairs
-            molecule_keys = list(topology_mego.molecules.keys())
-            user_pairs = [
-                (pair.atom1.idx, pair.atom2.idx, pair.type.epsilon * 4.184)
-                for pair in topology_mego.molecules[molecule_keys[i]][0].adjusts
-            ]
-            user_pairs = [
-                (topology_df[topology_df["mego_ai"] == ai].index[0], topology_df[topology_df["mego_ai"] == aj].index[0], c12)
-                for ai, aj, c12 in user_pairs
-            ]
-
-        # create Datarame with the pairs and the c12 values
-        c12_values = generate_c12_values(topology_df, types, type_definitions.atom_type_combinations, molecule_type)
-
-        # consider special cases
-        oxygen_mask = masking.create_matrix_mask(
-            topology_df["mego_type"].to_numpy(),
-            topology_df["mego_type"].to_numpy(),
-            [("OM", "OM"), ("O", "O"), ("OM", "O")],
-            symmetrize=True,
-        )
-
-        # define all cutoff
-        c12_cutoff = CUTOFF_FACTOR * np.power(np.where(oxygen_mask, 11.4 * c12_values, c12_values), 1.0 / 12.0)
-
-        # apply the user pairs (overwrite all other rules)
-        if molecule_type == "other":
-            for ai, aj, c12 in user_pairs:
-                ai = int(ai)
-                aj = int(aj)
-                if c12 > 0.0:
-                    c12_cutoff[ai][aj] = CUTOFF_FACTOR * np.power(c12, 1.0 / 12.0)
-                    c12_cutoff[aj][ai] = CUTOFF_FACTOR * np.power(c12, 1.0 / 12.0)
-
-        mismatched = topology_df.loc[topology_df["ref_type"].str[0] != topology_df["mego_name"].str[0]]
-        if not mismatched.empty:
-            raise ValueError(f"Mismatch found:\n{mismatched}, target and mego topology are not compatible")
-
-        if np.any(c12_cutoff > args.cutoff):
-            warning_cutoff_histo(args.cutoff, np.max(c12_cutoff))
-        if np.isnan(c12_cutoff.astype(float)).any():
-            warning_cutoff_histo(args.cutoff, np.max(c12_cutoff))
-
-        ########################
-        # PARALLEL PROCESS START
-        ########################
-
-        chunks = np.array_split(target_list, args.proc)
-        pool = multiprocessing.Pool(args.proc)
-        results = pool.map(
-            run_intra_,
-            [
-                (
-                    args,
-                    protein_ref_indices,
-                    protein_ref_indices,
-                    original_size,
-                    c12_cutoff,
-                    mol_list[i],
-                    mol_list[i],
-                    x,
-                )
-                for x in chunks
-            ],
-        )
-        pool.close()
-        pool.join()
-
-        ########################
-        # PARALLEL PROCESS END
-        ########################
-
-        # concatenate and remove partial dataframes
-        for name in results:
-            try:
-                part_df = pd.read_csv(name)
-                df = pd.concat([df, part_df])
-            except pd.errors.EmptyDataError:
-                print(f"Ignoring partial dataframe in {name} as csv is empty")
-        [os.remove(name) for name in results]
-
-        df = df.astype(
-            {
-                "mi": "int32",
-                "mj": "int32",
-                "ai": "int32",
-                "aj": "int32",
-            }
-        )
-
-        df = df.sort_values(by=["mi", "mj", "ai", "aj"])
-        df.drop_duplicates(subset=["mi", "ai", "mj", "aj"], inplace=True)
-
-        df["mi"] = df["mi"].map("{:}".format)
-        df["mj"] = df["mj"].map("{:}".format)
-        df["ai"] = df["ai"].map("{:}".format)
-        df["aj"] = df["aj"].map("{:}".format)
-        df["c12dist"] = df["c12dist"].map("{:,.6f}".format)
-        df["p"] = df["p"].map("{:,.6e}".format)
-        df["cutoff"] = df["cutoff"].map("{:,.6f}".format)
-
-        df.index = range(len(df.index))
-        out_name = args.out_name + "_" if args.out_name else ""
-        output_file = f"{args.out}/intramat_{out_name}{mol_list[i]}_{mol_list[i]}.ndx.gz"
-        print(f"Saving output for molecule {mol_list[i]} in {output_file}")
-        write_mat(df, output_file)
-
-
-def calculate_inter_probabilities(args):
+def calculate_matrices(args):
     """
     Starts the main routine for calculating the intermat by:
      - reading the topologies
@@ -792,126 +576,148 @@ def calculate_inter_probabilities(args):
     Calculating intermat for all species\n\n
     """
     )
-    columns = ["mi", "ai", "mj", "aj", "c12dist", "p", "cutoff"]
-    for pair in pairs:
-        df = pd.DataFrame()
+    for mol_i in mol_list:
+        if args.intra:
+            prefix = f"intra_mol_{mol_i}_{mol_i}"
+            run_stuff(mol_i, mol_i, topology_mego, topology_ref, molecules_name, prefix)
+        for mol_j in mol_list[mol_i-1:]:           
+            if mol_i==mol_j and not args.same:  continue
+            if mol_i!=mol_j and not args.cross: continue
 
-        topology_df_i = pd.DataFrame()
-        topology_df_j = pd.DataFrame()
+            prefix = f"inter_mol_{mol_i}_{mol_j}"
+            run_stuff(mol_i, mol_j, topology_mego, topology_ref, molecules_name, prefix)
 
-        mol_i = pair[0]
-        mol_j = pair[1]
 
-        print(
-            f"\nCalculating intermat between molecule {mol_i} and {mol_j}: {molecules_name[mol_i-1]} and {molecules_name[mol_j-1]}"
-        )
-        prefix = f"inter_mol_{mol_i}_{mol_j}"
-        if args.tar:
-            with tarfile.open(args.histo, "r:*") as tar:
-                target_list = [x.name for x in tar.getmembers() if prefix in x.name]
-        else:
-            target_list = [x for x in os.listdir(args.histo) if prefix in x]
+def run_stuff(mol_i, mol_j, topology_mego, topology_ref, molecules_name, prefix):
+
+    df = pd.DataFrame()
 
-        protein_mego_i = topology_mego.molecules[list(topology_mego.molecules.keys())[mol_i - 1]][0]
-        protein_mego_j = topology_mego.molecules[list(topology_mego.molecules.keys())[mol_j - 1]][0]
+    topology_df_i = pd.DataFrame()
+    topology_df_j = pd.DataFrame()
 
-        protein_ref_i = topology_ref.molecules[list(topology_ref.molecules.keys())[mol_i - 1]][0]
-        protein_ref_j = topology_ref.molecules[list(topology_ref.molecules.keys())[mol_j - 1]][0]
+    # define matrix type (intra o inter)
+    mat_type = prefix.split("_")[0]
+    print(
+        f"\nCalculating {mat_type} between molecule {mol_i} and {mol_j}: {molecules_name[mol_i-1]} and {molecules_name[mol_j-1]}"
+    )
+    if args.tar:
+        with tarfile.open(args.histo, "r:*") as tar:
+            target_list = [x.name for x in tar.getmembers() if prefix in x.name]
+    else:
+        target_list = [x for x in os.listdir(args.histo) if prefix in x]
 
-        original_size_i = len(protein_ref_i.atoms)
-        original_size_j = len(protein_ref_j.atoms)
+    protein_mego_i = topology_mego.molecules[list(topology_mego.molecules.keys())[mol_i - 1]][0]
+    protein_mego_j = topology_mego.molecules[list(topology_mego.molecules.keys())[mol_j - 1]][0]
 
-        if mol_i == mol_j:
-            if N_mols[mol_i - 1] == 0:
-                print(
-                    f"Skipping intermolecular calculation between {mol_i} and {mol_j} cause the number of molecules of this species is only {N_mols[mol_i-1]}"
-                )
-                matrix_index = pd.MultiIndex.from_product(
-                    [range(1, original_size_i + 1), range(1, original_size_j + 1)],
-                    names=["ai", "aj"],
-                )
-                indeces_ai = np.array(list(matrix_index)).T[0]
-                indeces_aj = np.array(list(matrix_index)).T[1]
-                df = pd.DataFrame(columns=columns)
-                df["mi"] = [mol_i for x in range(1, original_size_i * original_size_j + 1)]
-                df["mj"] = [mol_j for x in range(1, original_size_i * original_size_j + 1)]
-                df["ai"] = indeces_ai
-                df["aj"] = indeces_aj
-                df["c12dist"] = 0.0
-                df["p"] = 0.0
-                df["cutoff"] = 0.0
-                df["mi"] = df["mi"].map("{:}".format)
-                df["mj"] = df["mj"].map("{:}".format)
-                df["ai"] = df["ai"].map("{:}".format)
-                df["aj"] = df["aj"].map("{:}".format)
-                df["c12dist"] = df["c12dist"].map("{:,.6f}".format)
-                df["p"] = df["p"].map("{:,.6e}".format)
-                df["cutoff"] = df["cutoff"].map("{:,.6f}".format)
-
-                df.index = range(len(df.index))
-                out_name = args.out_name + "_" if args.out_name else ""
-                output_file = f"{args.out}/intermat_{out_name}{mol_i}_{mol_j}.ndx"
-
-                df.to_csv(output_file, index=False, sep=" ", header=False, columns=columns)
-                continue
-
-        protein_ref_indices_i = np.array(
-            [i + 1 for i in range(len(protein_ref_i.atoms)) if protein_ref_i[i].element_name != "H"]
-        )
-        protein_ref_indices_j = np.array(
-            [i + 1 for i in range(len(protein_ref_j.atoms)) if protein_ref_j[i].element_name != "H"]
-        )
+    protein_ref_i = topology_ref.molecules[list(topology_ref.molecules.keys())[mol_i - 1]][0]
+    protein_ref_j = topology_ref.molecules[list(topology_ref.molecules.keys())[mol_j - 1]][0]
 
-        protein_ref_i = [a for a in protein_ref_i.atoms if a.element_name != "H"]
-        protein_ref_j = [a for a in protein_ref_j.atoms if a.element_name != "H"]
-
-        sorter_i = [str(x.residue.number) + map_if_exists(x.name) for x in protein_ref_i]
-        sorter_mego_i = [str(x.residue.number) + x.name for x in protein_mego_i]
-
-        sorter_j = [str(x.residue.number) + map_if_exists(x.name) for x in protein_ref_j]
-        sorter_mego_j = [str(x.residue.number) + x.name for x in protein_mego_j]
-
-        # preparing topology of molecule i
-        topology_df_i["ref_ai"] = protein_ref_indices_i
-        topology_df_i["ref_type"] = [a.name for a in protein_ref_i]
-        topology_df_i["sorter"] = sorter_i
-        topology_df_i["ref_ri"] = topology_df_i["sorter"].str.replace("[a-zA-Z]+[0-9]*", "", regex=True).astype(int)
-        topology_df_i.sort_values(by="sorter", inplace=True)
-        topology_df_i["mego_type"] = [a[0].type for a in sorted(zip(protein_mego_i, sorter_mego_i), key=lambda x: x[1])]
-        topology_df_i["mego_name"] = [a[0].name for a in sorted(zip(protein_mego_i, sorter_mego_i), key=lambda x: x[1])]
-        # need to sort back otherwise c12_cutoff are all wrong
-        topology_df_i.sort_values(by="ref_ai", inplace=True)
-        if args.custom_c12 is not None:
-            custom_c12_dict = io.read_custom_c12_parameters(args.custom_c12)
-            d_appo = {key: val for key, val in zip(custom_c12_dict.name, custom_c12_dict.c12)}
-            d.update(d_appo)
-
-        topology_df_i["c12"] = topology_df_i["mego_type"].map(d)
-
-        # preparing topology of molecule j
-        topology_df_j["ref_ai"] = protein_ref_indices_j
-        topology_df_j["ref_type"] = [a.name for a in protein_ref_j]
-        topology_df_j["sorter"] = sorter_j
-        topology_df_j["ref_ri"] = topology_df_j["sorter"].str.replace("[a-zA-Z]+[0-9]*", "", regex=True).astype(int)
-        topology_df_j.sort_values(by="sorter", inplace=True)
-        topology_df_j["mego_type"] = [a[0].type for a in sorted(zip(protein_mego_j, sorter_mego_j), key=lambda x: x[1])]
-        topology_df_j["mego_name"] = [a[0].name for a in sorted(zip(protein_mego_j, sorter_mego_j), key=lambda x: x[1])]
-        # need to sort back otherwise c12_cutoff are all wrong
-        topology_df_j.sort_values(by="ref_ai", inplace=True)
-        if args.custom_c12 is not None:
-            custom_c12_dict = io.read_custom_c12_parameters(args.custom_c12)
-            d_appo = {key: val for key, val in zip(custom_c12_dict.name, custom_c12_dict.c12)}
-            d.update(d_appo)
-
-        topology_df_j["c12"] = topology_df_j["mego_type"].map(d)
-
-        oxygen_mask = masking.create_matrix_mask(
-            topology_df_i["mego_type"].to_numpy(),
-            topology_df_j["mego_type"].to_numpy(),
-            [("OM", "OM"), ("O", "O"), ("OM", "O")],
-            symmetrize=True,
-        )
+    original_size_i = len(protein_ref_i.atoms)
+    original_size_j = len(protein_ref_j.atoms)
+
+    protein_ref_indices_i = np.array(
+        [i + 1 for i in range(len(protein_ref_i.atoms)) if protein_ref_i[i].element_name != "H"]
+    )
+    protein_ref_indices_j = np.array(
+        [i + 1 for i in range(len(protein_ref_j.atoms)) if protein_ref_j[i].element_name != "H"]
+    )
+
+    protein_ref_i = [a for a in protein_ref_i.atoms if a.element_name != "H"]
+    protein_ref_j = [a for a in protein_ref_j.atoms if a.element_name != "H"]
+
+    sorter_i = [str(x.residue.number) + map_if_exists(x.name) for x in protein_ref_i]
+    sorter_mego_i = [str(x.residue.number) + x.name for x in protein_mego_i]
+
+    sorter_j = [str(x.residue.number) + map_if_exists(x.name) for x in protein_ref_j]
+    sorter_mego_j = [str(x.residue.number) + x.name for x in protein_mego_j]
+
+    # preparing topology of molecule i
+    topology_df_i["ref_ai"] = protein_ref_indices_i
+    topology_df_i["ref_type"] = [a.name for a in protein_ref_i]
+    topology_df_i["resname"] = [a.residue.name for a in protein_ref_i]
+    topology_df_i["resnum"] = [a.residue.idx for a in protein_ref_i]
+    topology_df_i["sorter"] = sorter_i
+    topology_df_i["ref_ri"] = topology_df_i["sorter"].str.replace("[a-zA-Z]+[0-9]*", "", regex=True).astype(int)
+    topology_df_i.sort_values(by="sorter", inplace=True)
+    topology_df_i["mego_ai"] = [a[0].idx for a in sorted(zip(protein_mego_i, sorter_mego_i), key=lambda x: x[1])]
+    topology_df_i["mego_type"] = [a[0].type for a in sorted(zip(protein_mego_i, sorter_mego_i), key=lambda x: x[1])]
+    topology_df_i["mego_name"] = [a[0].name for a in sorted(zip(protein_mego_i, sorter_mego_i), key=lambda x: x[1])]
+    topology_df_i["name"] = topology_df_i["mego_name"]
+    topology_df_i["type"] = topology_df_i["mego_type"]
+    # need to sort back otherwise c12_cutoff are all wrong
+    topology_df_i.sort_values(by="ref_ai", inplace=True)
+    if args.custom_c12 is not None:
+        custom_c12_dict = io.read_custom_c12_parameters(args.custom_c12)
+        d_appo = {key: val for key, val in zip(custom_c12_dict.name, custom_c12_dict.c12)}
+        d.update(d_appo)
+
+    topology_df_i["c12"] = topology_df_i["mego_type"].map(d)
+
+    # preparing topology of molecule j
+    topology_df_j["ref_ai"] = protein_ref_indices_j
+    topology_df_j["ref_type"] = [a.name for a in protein_ref_j]
+    topology_df_j["sorter"] = sorter_j
+    topology_df_j["resname"] = [a.residue.name for a in protein_ref_j]
+    topology_df_j["resnum"] = [a.residue.idx for a in protein_ref_j]
+    topology_df_j["ref_ri"] = topology_df_j["sorter"].str.replace("[a-zA-Z]+[0-9]*", "", regex=True).astype(int)
+    topology_df_j.sort_values(by="sorter", inplace=True)
+    topology_df_j["mego_type"] = [a[0].type for a in sorted(zip(protein_mego_j, sorter_mego_j), key=lambda x: x[1])]
+    topology_df_j["mego_name"] = [a[0].name for a in sorted(zip(protein_mego_j, sorter_mego_j), key=lambda x: x[1])]
+    topology_df_j["name"] = topology_df_j["mego_name"]
+    topology_df_j["type"] = topology_df_j["mego_type"]
+    # need to sort back otherwise c12_cutoff are all wrong
+    topology_df_j.sort_values(by="ref_ai", inplace=True)
+    if args.custom_c12 is not None:
+        custom_c12_dict = io.read_custom_c12_parameters(args.custom_c12)
+        d_appo = {key: val for key, val in zip(custom_c12_dict.name, custom_c12_dict.c12)}
+        d.update(d_appo)
+
+    topology_df_j["c12"] = topology_df_j["mego_type"].map(d)
+    oxygen_mask = masking.create_matrix_mask(
+        topology_df_i["mego_type"].to_numpy(),
+        topology_df_j["mego_type"].to_numpy(),
+        [("OM", "OM"), ("O", "O"), ("OM", "O")],
+        symmetrize=True,
+    )
+
+    if mat_type == "intra":
+        first_aminoacid = topology_mego.residues[0].name
+        if first_aminoacid in type_definitions.aminoacids_list:
+            molecule_type = "protein"
+        elif first_aminoacid in type_definitions.nucleic_acid_list:
+            molecule_type = "nucleic_acid"
+        else:
+            molecule_type = "other"
+
+        types = type_definitions.lj14_generator(topology_df_i)
+
+        if molecule_type == "other":
+            # read user pairs
+            molecule_keys = list(topology_mego.molecules.keys())
+            user_pairs = [
+                (pair.atom1.idx, pair.atom2.idx, pair.type.epsilon * 4.184)
+                for pair in topology_mego.molecules[molecule_keys[mol_i-1]][0].adjusts
+            ]
+            user_pairs = [
+                (topology_df_i[topology_df_i["mego_ai"] == ai].index[0], topology_df_i[topology_df_i["mego_ai"] == aj].index[0], c12)
+                for ai, aj, c12 in user_pairs
+            ]
+        
+        c12_values = generate_c12_values(topology_df_i, types, type_definitions.atom_type_combinations, molecule_type)
 
+        # define all cutoff
+        c12_cutoff = CUTOFF_FACTOR * np.power(np.where(oxygen_mask, 11.4 * c12_values, c12_values), 1.0 / 12.0)
+
+        # apply the user pairs (overwrite all other rules)
+        if molecule_type == "other":
+            for ai, aj, c12 in user_pairs:
+                ai = int(ai)
+                aj = int(aj)
+                if c12 > 0.0:
+                    c12_cutoff[ai][aj] = CUTOFF_FACTOR * np.power(c12, 1.0 / 12.0)
+                    c12_cutoff[aj][ai] = CUTOFF_FACTOR * np.power(c12, 1.0 / 12.0)
+    
+    if mat_type == "inter":
         # define all cutoff
         c12_cutoff = CUTOFF_FACTOR * np.where(
             oxygen_mask,
@@ -925,157 +731,139 @@ def calculate_inter_probabilities(args):
             ),
         )
 
-        mismatched = topology_df_i.loc[topology_df_i["ref_type"].str[0] != topology_df_i["mego_name"].str[0]]
-        if not mismatched.empty:
-            raise ValueError(f"Mismatch found:\n{mismatched}, target and mego topology are not compatible")
-        mismatched = topology_df_j.loc[topology_df_j["ref_type"].str[0] != topology_df_j["mego_name"].str[0]]
-        if not mismatched.empty:
-            raise ValueError(f"Mismatch found:\n{mismatched}, target and mego topology are not compatible")
-
-        if args.residue:
-            c12_cutoff = args.cutoff * np.ones(c12_cutoff.shape)
-        if np.any(c12_cutoff > args.cutoff):
-            warning_cutoff_histo(args.cutoff, np.max(c12_cutoff))
-        if np.isnan(c12_cutoff.astype(float)).any():
-            warning_cutoff_histo(args.cutoff, np.max(c12_cutoff))
-
-        # create dictionary with ref_ai to ri
-        ref_ai_to_ri_i = dict(zip(topology_df_i["ref_ai"], topology_df_i["ref_ri"]))
-        ref_ai_to_ri_j = dict(zip(topology_df_j["ref_ai"], topology_df_j["ref_ri"]))
-        # index_ai_to_ri_i = {k: v for k, v in enumerate(topology_df_i["ref_ri"])}
-        index_ai_to_ri_j = {k: v for k, v in enumerate(topology_df_j["ref_ri"])}
-        # create a dictionary with ref_ri to ai as a list of ai
-        ref_ri_to_ai_i = {f"{mol_i}_{ri}": [] for ri in topology_df_i["ref_ri"]}
-        ref_ri_to_ai_j = {f"{mol_j}_{ri}": [] for ri in topology_df_j["ref_ri"]}
-        for ai, ri in ref_ai_to_ri_i.items():
-            ref_ri_to_ai_i[f"{mol_i}_{ri}"].append(ai)
-        for ai, ri in ref_ai_to_ri_j.items():
-            ref_ri_to_ai_j[f"{mol_j}_{ri}"].append(ai)
-
-        dict_m_m_r = {}
-        for target in target_list:
-            target_fields = target.replace(".dat", "").split("_")
-            mi = int(target_fields[-4])
-            mj = int(target_fields[-3])
-            ai = int(target_fields[-1])
-            if ai not in protein_ref_indices_i:
-                continue
-            ri = ref_ai_to_ri_i[ai]
-            if (mi, mj, ri) in dict_m_m_r:
-                dict_m_m_r[(mi, mj, ri)].append(target)
-            else:
-                dict_m_m_r[(mi, mj, ri)] = [target]
-
-        ########################
-        # PARALLEL PROCESS START
-        ########################
-
-        if not args.residue:
-            chunks = np.array_split(target_list, args.proc)
-        else:
-            chunks = []
-            n_threshold = sum([len(v) for v in dict_m_m_r.values()]) // args.proc
-            chunk = []
-            n = 0
-            for k, v in dict_m_m_r.items():
-                chunk.append(v)
-                n += len(v)
-                if n > n_threshold:
-                    chunks.append(chunk)
-                    chunk = []
-                    n = 0
-            chunks.append(chunk)
-        pool = multiprocessing.Pool(args.proc)
-        if args.residue:
-            results = pool.map(
-                run_residue_inter_,
-                [
-                    (
-                        args,
-                        protein_ref_indices_i,
-                        protein_ref_indices_j,
-                        len(ref_ri_to_ai_j),
-                        c12_cutoff,
-                        mol_i,
-                        mol_j,
-                        (ref_ai_to_ri_i, index_ai_to_ri_j),
-                        x,
-                    )
-                    for x in chunks
-                ],
-            )
+    mismatched = topology_df_i.loc[topology_df_i["ref_type"].str[0] != topology_df_i["mego_name"].str[0]]
+    if not mismatched.empty:
+        raise ValueError(f"Mismatch found:\n{mismatched}, target and mego topology are not compatible")
+    mismatched = topology_df_j.loc[topology_df_j["ref_type"].str[0] != topology_df_j["mego_name"].str[0]]
+    if not mismatched.empty:
+        raise ValueError(f"Mismatch found:\n{mismatched}, target and mego topology are not compatible")
+
+    if args.residue:
+        c12_cutoff = args.cutoff * np.ones(c12_cutoff.shape)
+    if np.any(c12_cutoff > args.cutoff):
+        warning_cutoff_histo(args.cutoff, np.max(c12_cutoff))
+    if np.isnan(c12_cutoff.astype(float)).any():
+        warning_cutoff_histo(args.cutoff, np.max(c12_cutoff))
+
+    # create dictionary with ref_ai to ri
+    ref_ai_to_ri_i = dict(zip(topology_df_i["ref_ai"], topology_df_i["ref_ri"]))
+    ref_ai_to_ri_j = dict(zip(topology_df_j["ref_ai"], topology_df_j["ref_ri"]))
+    # index_ai_to_ri_i = {k: v for k, v in enumerate(topology_df_i["ref_ri"])}
+    index_ai_to_ri_j = {k: v for k, v in enumerate(topology_df_j["ref_ri"])}
+    # create a dictionary with ref_ri to ai as a list of ai
+    ref_ri_to_ai_i = {f"{mol_i}_{ri}": [] for ri in topology_df_i["ref_ri"]}
+    ref_ri_to_ai_j = {f"{mol_j}_{ri}": [] for ri in topology_df_j["ref_ri"]}
+    for ai, ri in ref_ai_to_ri_i.items():
+        ref_ri_to_ai_i[f"{mol_i}_{ri}"].append(ai)
+    for ai, ri in ref_ai_to_ri_j.items():
+        ref_ri_to_ai_j[f"{mol_j}_{ri}"].append(ai)
+
+    dict_m_m_r = {}
+    for target in target_list:
+        target_fields = target.replace(".dat", "").split("_")
+        mi = int(target_fields[-4])
+        mj = int(target_fields[-3])
+        ai = int(target_fields[-1])
+        if ai not in protein_ref_indices_i:
+            continue
+        ri = ref_ai_to_ri_i[ai]
+        if (mi, mj, ri) in dict_m_m_r:
+            dict_m_m_r[(mi, mj, ri)].append(target)
         else:
-            results = pool.map(
-                run_inter_,
-                [
-                    (
-                        args,
-                        protein_ref_indices_i,
-                        protein_ref_indices_j,
-                        original_size_j,
-                        c12_cutoff,
-                        mol_i,
-                        mol_j,
-                        x,
-                    )
-                    for x in chunks
-                ],
-            )
-        pool.close()
-        pool.join()
-
-        ########################
-        # PARALLEL PROCESS END
-        ########################
-
-        # concatenate and remove partial dataframes
-        for name in results:
-            try:
-                part_df = pd.read_csv(name)
-                df = pd.concat([df, part_df])
-            except pd.errors.EmptyDataError:
-                print(f"Ignoring partial dataframe in {name} as csv is empty")
-        [os.remove(name) for name in results]
-        df = df.astype({"mi": "int32", "mj": "int32", "ai": "int32", "aj": "int32"})
-
-        df = df.sort_values(by=["mi", "mj", "ai", "aj"])
-        df.drop_duplicates(subset=["mi", "ai", "mj", "aj"], inplace=True)
-
-        df["mi"] = df["mi"].map("{:}".format)
-        df["mj"] = df["mj"].map("{:}".format)
-        df["ai"] = df["ai"].map("{:}".format)
-        df["aj"] = df["aj"].map("{:}".format)
-        df["c12dist"] = df["c12dist"].map("{:,.6f}".format)
-        df["p"] = df["p"].map("{:,.6e}".format)
-        df["cutoff"] = df["cutoff"].map("{:,.6f}".format)
-
-        df.index = range(len(df.index))
-        out_name = args.out_name + "_" if args.out_name else ""
-        output_file = f"{args.out}/intermat_{out_name}{mol_i}_{mol_j}.ndx.gz"
-        print(f"Saving output for molecule {mol_i} and {mol_j} in {output_file}")
-        write_mat(df, output_file)
+            dict_m_m_r[(mi, mj, ri)] = [target]
 
+    ########################
+    # PARALLEL PROCESS START
+    ########################
 
-def calculate_probability(values, weights, callback=allfunction):
-    """
-    Calculates a plain probability accoring to \sum_x x * dx
+    if not args.residue:
+        chunks = np.array_split(target_list, args.num_threads)
+    else:
+        chunks = []
+        n_threshold = sum([len(v) for v in dict_m_m_r.values()]) // args.num_threads
+        chunk = []
+        n = 0
+        for k, v in dict_m_m_r.items():
+            chunk.append(v)
+            n += len(v)
+            if n > n_threshold:
+                chunks.append(chunk)
+                chunk = []
+                n = 0
+        chunks.append(chunk)
+    pool = multiprocessing.Pool(args.num_threads)
+    if args.residue and not args.intra:
+        results = pool.map(
+            run_residue_inter_,
+            [
+                (
+                    args,
+                    protein_ref_indices_i,
+                    protein_ref_indices_j,
+                    len(ref_ri_to_ai_j),
+                    c12_cutoff,
+                    mol_i,
+                    mol_j,
+                    (ref_ai_to_ri_i, index_ai_to_ri_j),
+                    x,
+                )
+                for x in chunks
+            ],
+        )
+    else:
 
-    Parameters
-    ----------
-    values : np.array
-        The array of the histograms x values
-    weights : np.array
-        The array with the respective weights
-    callback : function
-        Preprocesses the data before going in to the analysis
+        results = pool.map(
+            run_mat_,
+            [
+                (
+                    args,
+                    protein_ref_indices_i,
+                    protein_ref_indices_j,
+                    original_size_j,
+                    c12_cutoff,
+                    mol_i,
+                    mol_j,
+                    x,
+                    mat_type,
+                )
+                for x in chunks
+            ],
+        )
+       
+    pool.close()
+    pool.join()
 
-    Returns
-    -------
-    The probability of the histogram
-    """
-    dx = values[1] - values[0]
-    cutoff, i, norm, v, w = callback(values, weights)
-    return np.minimum(np.sum(w * dx), 1)
+    ########################
+    # PARALLEL PROCESS END
+    ########################
+
+    # concatenate and remove partial dataframes
+    for name in results:
+        try:
+            part_df = pd.read_csv(name)
+            df = pd.concat([df, part_df])
+        except pd.errors.EmptyDataError:
+            print(f"Ignoring partial dataframe in {name} as csv is empty")
+    [os.remove(name) for name in results]
+    df = df.astype({"mi": "int32", "mj": "int32", "ai": "int32", "aj": "int32"})
+
+    df = df.sort_values(by=["mi", "mj", "ai", "aj"])
+    df.drop_duplicates(subset=["mi", "ai", "mj", "aj"], inplace=True)
+
+    df["mi"] = df["mi"].map("{:}".format)
+    df["mj"] = df["mj"].map("{:}".format)
+    df["ai"] = df["ai"].map("{:}".format)
+    df["aj"] = df["aj"].map("{:}".format)
+    df["c12dist"] = df["c12dist"].map("{:,.6f}".format)
+    df["p"] = df["p"].map("{:,.6e}".format)
+    df["cutoff"] = df["cutoff"].map("{:,.6f}".format)
+
+    df.index = range(len(df.index))
+    out_name = args.out_name + "_" if args.out_name else ""
+    output_file = f"{args.out}/{mat_type}mat_{out_name}{mol_i}_{mol_j}.ndx.gz"
+    if args.residue: output_file = f"{args.out}/{mat_type}mat_res_{out_name}{mol_i}_{mol_j}.ndx.gz"
+    print(f"Saving output for molecule {mol_i} and {mol_j} in {output_file}")
+    write_mat(df, output_file)
 
 
 if __name__ == "__main__":
@@ -1097,9 +885,9 @@ def calculate_probability(values, weights, callback=allfunction):
         help="""Path to the standard multi-eGO topology of the system generated by pdb2gmx""",
     )
     parser.add_argument(
-        "--inter",
-        action="store_true",
-        help="Sets the caculation to be adapted to intermolecular calculations of histograms",
+        "--mode",
+        help="Sets the caculation to be intra/same/cross for histograms processing",
+        default="intra+same+cross"
     )
     parser.add_argument("--out", default="./", help="""Sets the output path""")
     parser.add_argument(
@@ -1107,7 +895,7 @@ def calculate_probability(values, weights, callback=allfunction):
         help="""Sets the output name of files to be added to the default one: intermat_<out_name>_mi_mj.ndx or intramat_<out_name>_mi_mj.ndx""",
     )
     parser.add_argument(
-        "--proc",
+        "--num_threads",
         default=1,
         type=int,
         help="Sets the number of processes to perform the calculation",
@@ -1132,6 +920,11 @@ def calculate_probability(values, weights, callback=allfunction):
         "--residue",
         action="store_true",
     )
+    parser.add_argument(
+        "--zero",
+        action="store_true",
+        default=False,
+    )
     args = parser.parse_args()
 
     # check if output file exists
@@ -1147,24 +940,33 @@ def calculate_probability(values, weights, callback=allfunction):
         print(f"The path '{args.histo}' is not a tar file.")
         sys.exit()
 
-    if args.residue and not args.inter:
+    # Sets mode
+    modes = np.array(args.mode.split("+"),dtype=str)
+    modes_possible =  np.array(["intra", "same", "cross"])
+    args.intra = False
+    args.same  = False
+    args.cross = False
+    if np.any(np.isin(modes, modes_possible) == False):
+        raise ValueError(f"inserted mode {args.mode} is not correct and got evaluated to {modes}. Choose intra,same and or cross separated by '+', e.g.: intra+same or same+cross")
+
+    if "intra" in modes: args.intra = True
+    if "same"  in modes: args.same  = True
+    if "cross" in modes: args.cross = True
+
+    if args.residue and args.intra:
         print("Residue calculation is only possible for intermolecular calculations (not implemented yet for intramolecular).")
         sys.exit()
 
     N_BINS = args.cutoff / (0.01 / 4)
     DX = args.cutoff / N_BINS
-    PREFIX = "inter_mol_" if args.inter else "intra_mol_"
     CUTOFF_FACTOR = 1.45
     print(
         f"""
     Starting with cutoff = {args.cutoff},
                   n_bins = {N_BINS},
                   dx     = {DX}
-                  on {args.proc} threads
+                  on {args.num_threads} threads
     """
     )
 
-    if not args.inter:
-        calculate_intra_probabilities(args)
-    else:
-        calculate_inter_probabilities(args)
+    calculate_matrices(args)

From f3d0d9ff32be00a223f6f45cb99742e075edcae6 Mon Sep 17 00:00:00 2001
From: Bruno Stegani <bruno.stega8@gmail.com>
Date: Fri, 29 Nov 2024 01:21:09 +0100
Subject: [PATCH 2/5] code cleaning and modified test command to new make_mat

---
 test/run_make_mat.sh       |  5 ++-
 tools/make_mat/make_mat.py | 71 ++++++++++++++++++++++----------------
 2 files changed, 44 insertions(+), 32 deletions(-)

diff --git a/test/run_make_mat.sh b/test/run_make_mat.sh
index 76f888a5..6f6ea545 100755
--- a/test/run_make_mat.sh
+++ b/test/run_make_mat.sh
@@ -4,12 +4,11 @@ set -e
 set -o pipefail
 
 tar -zxf test_inputs/make_mat_ttr/hh.tgz -C test_inputs/make_mat_ttr/
-python ../tools/make_mat/make_mat.py --histo test_inputs/make_mat_ttr/hh.tgz --target_top test_inputs/make_mat_ttr/topol_md.top --mego_top test_inputs/make_mat_ttr/topol_ref.top --cutoff 0.75 --inter --out  test_inputs/make_mat_ttr/ --tar
-python ../tools/make_mat/make_mat.py --histo test_inputs/make_mat_ttr/histo --target_top test_inputs/make_mat_ttr/topol_md.top --mego_top test_inputs/make_mat_ttr/topol_ref.top --cutoff 0.75 --out  test_inputs/make_mat_ttr/
+python ../tools/make_mat/make_mat.py --histo test_inputs/make_mat_ttr/hh.tgz --target_top test_inputs/make_mat_ttr/topol_md.top --mego_top test_inputs/make_mat_ttr/topol_ref.top --cutoff 0.75 --mode intra+same --out  test_inputs/make_mat_ttr/ --tar
 diff <(gzip -dc test_inputs/make_mat_ttr/intramat_1_1.ndx.gz) <(gzip -dc test_outputs/make_mat_ttr/intramat_1_1.ndx.gz)
 diff <(gzip -dc test_inputs/make_mat_ttr/intermat_1_1.ndx.gz) <(gzip -dc test_outputs/make_mat_ttr/intermat_1_1.ndx.gz)
 
 tar -zxf test_inputs/make_mat_popc/hh.tgz -C test_inputs/make_mat_popc/
-python ../tools/make_mat/make_mat.py --histo test_inputs/make_mat_popc/histo --target_top test_inputs/make_mat_popc/topol_md.top --mego_top test_inputs/make_mat_popc/topol_ref.top --cutoff 0.75 --out  test_inputs/make_mat_popc/
+python ../tools/make_mat/make_mat.py --histo test_inputs/make_mat_popc/histo --target_top test_inputs/make_mat_popc/topol_md.top --mego_top test_inputs/make_mat_popc/topol_ref.top --cutoff 0.75 --mode intra --out  test_inputs/make_mat_popc/ 
 diff <(gzip -dc test_inputs/make_mat_popc/intramat_1_1.ndx.gz) <(gzip -dc test_outputs/make_mat_popc/intramat_1_1.ndx.gz)
 
diff --git a/tools/make_mat/make_mat.py b/tools/make_mat/make_mat.py
index e9c38fdd..cd20e706 100644
--- a/tools/make_mat/make_mat.py
+++ b/tools/make_mat/make_mat.py
@@ -51,18 +51,23 @@ def read_mat(name, protein_ref_indices, args, cumulative=False):
 
     return ref_df
 
+
 def zero_probability_decorator(flag):
     """
     Decorator of function to return 0 if flag is rased
     """
+
     def decorator(func):
         def wrapper(*args, **kwargs):
             if flag:
                 return 0  # Return 0 if the flag is set
             return func(*args, **kwargs)  # Otherwise, execute the original function
+
         return wrapper
+
     return decorator
 
+
 def run_mat_(arguments):
     """
     Preforms the main routine of the histogram analysis to obtain the intra- and intermat files.
@@ -119,10 +124,10 @@ def run_mat_(arguments):
             ref_df.loc[len(ref_df)] = c12_cutoff[cut_i]
 
             # calculate data
-            c12_avg_ = zero_probability_decorator( args.zero)(c12_avg)
-            calculate_probability_ = zero_probability_decorator( args.zero)(calculate_probability)
-            get_cumulative_probability_ = zero_probability_decorator( args.zero)(get_cumulative_probability)
-           
+            c12_avg_ = zero_probability_decorator(args.zero)(c12_avg)
+            calculate_probability_ = zero_probability_decorator(args.zero)(calculate_probability)
+            get_cumulative_probability_ = zero_probability_decorator(args.zero)(get_cumulative_probability)
+
             c12dist = ref_df.apply(lambda x: c12_avg_(ref_df.index.to_numpy(), weights=x.to_numpy()), axis=0).values
             if mat_type == "intra":
                 p = ref_df.apply(
@@ -322,6 +327,7 @@ def map_if_exists(atom_name):
     else:
         return atom_name
 
+
 # TODO is it needed anymore?
 def hallfunction(values, weights):
     """
@@ -419,6 +425,7 @@ def zero_callback(values, weights):
     """
     return None, None, np.sum(weights), values, weights
 
+
 def calculate_probability(values, weights, callback=allfunction):
     """
     Calculates a plain probability accoring to \sum_x x * dx
@@ -580,9 +587,11 @@ def calculate_matrices(args):
         if args.intra:
             prefix = f"intra_mol_{mol_i}_{mol_i}"
             run_stuff(mol_i, mol_i, topology_mego, topology_ref, molecules_name, prefix)
-        for mol_j in mol_list[mol_i-1:]:           
-            if mol_i==mol_j and not args.same:  continue
-            if mol_i!=mol_j and not args.cross: continue
+        for mol_j in mol_list[mol_i - 1 :]:
+            if mol_i == mol_j and not args.same:
+                continue
+            if mol_i != mol_j and not args.cross:
+                continue
 
             prefix = f"inter_mol_{mol_i}_{mol_j}"
             run_stuff(mol_i, mol_j, topology_mego, topology_ref, molecules_name, prefix)
@@ -615,12 +624,8 @@ def run_stuff(mol_i, mol_j, topology_mego, topology_ref, molecules_name, prefix)
     original_size_i = len(protein_ref_i.atoms)
     original_size_j = len(protein_ref_j.atoms)
 
-    protein_ref_indices_i = np.array(
-        [i + 1 for i in range(len(protein_ref_i.atoms)) if protein_ref_i[i].element_name != "H"]
-    )
-    protein_ref_indices_j = np.array(
-        [i + 1 for i in range(len(protein_ref_j.atoms)) if protein_ref_j[i].element_name != "H"]
-    )
+    protein_ref_indices_i = np.array([i + 1 for i in range(len(protein_ref_i.atoms)) if protein_ref_i[i].element_name != "H"])
+    protein_ref_indices_j = np.array([i + 1 for i in range(len(protein_ref_j.atoms)) if protein_ref_j[i].element_name != "H"])
 
     protein_ref_i = [a for a in protein_ref_i.atoms if a.element_name != "H"]
     protein_ref_j = [a for a in protein_ref_j.atoms if a.element_name != "H"]
@@ -696,13 +701,17 @@ def run_stuff(mol_i, mol_j, topology_mego, topology_ref, molecules_name, prefix)
             molecule_keys = list(topology_mego.molecules.keys())
             user_pairs = [
                 (pair.atom1.idx, pair.atom2.idx, pair.type.epsilon * 4.184)
-                for pair in topology_mego.molecules[molecule_keys[mol_i-1]][0].adjusts
+                for pair in topology_mego.molecules[molecule_keys[mol_i - 1]][0].adjusts
             ]
             user_pairs = [
-                (topology_df_i[topology_df_i["mego_ai"] == ai].index[0], topology_df_i[topology_df_i["mego_ai"] == aj].index[0], c12)
+                (
+                    topology_df_i[topology_df_i["mego_ai"] == ai].index[0],
+                    topology_df_i[topology_df_i["mego_ai"] == aj].index[0],
+                    c12,
+                )
                 for ai, aj, c12 in user_pairs
             ]
-        
+
         c12_values = generate_c12_values(topology_df_i, types, type_definitions.atom_type_combinations, molecule_type)
 
         # define all cutoff
@@ -716,7 +725,7 @@ def run_stuff(mol_i, mol_j, topology_mego, topology_ref, molecules_name, prefix)
                 if c12 > 0.0:
                     c12_cutoff[ai][aj] = CUTOFF_FACTOR * np.power(c12, 1.0 / 12.0)
                     c12_cutoff[aj][ai] = CUTOFF_FACTOR * np.power(c12, 1.0 / 12.0)
-    
+
     if mat_type == "inter":
         # define all cutoff
         c12_cutoff = CUTOFF_FACTOR * np.where(
@@ -829,7 +838,7 @@ def run_stuff(mol_i, mol_j, topology_mego, topology_ref, molecules_name, prefix)
                 for x in chunks
             ],
         )
-       
+
     pool.close()
     pool.join()
 
@@ -861,7 +870,8 @@ def run_stuff(mol_i, mol_j, topology_mego, topology_ref, molecules_name, prefix)
     df.index = range(len(df.index))
     out_name = args.out_name + "_" if args.out_name else ""
     output_file = f"{args.out}/{mat_type}mat_{out_name}{mol_i}_{mol_j}.ndx.gz"
-    if args.residue: output_file = f"{args.out}/{mat_type}mat_res_{out_name}{mol_i}_{mol_j}.ndx.gz"
+    if args.residue:
+        output_file = f"{args.out}/{mat_type}mat_res_{out_name}{mol_i}_{mol_j}.ndx.gz"
     print(f"Saving output for molecule {mol_i} and {mol_j} in {output_file}")
     write_mat(df, output_file)
 
@@ -885,9 +895,7 @@ def run_stuff(mol_i, mol_j, topology_mego, topology_ref, molecules_name, prefix)
         help="""Path to the standard multi-eGO topology of the system generated by pdb2gmx""",
     )
     parser.add_argument(
-        "--mode",
-        help="Sets the caculation to be intra/same/cross for histograms processing",
-        default="intra+same+cross"
+        "--mode", help="Sets the caculation to be intra/same/cross for histograms processing", default="intra+same+cross"
     )
     parser.add_argument("--out", default="./", help="""Sets the output path""")
     parser.add_argument(
@@ -941,17 +949,22 @@ def run_stuff(mol_i, mol_j, topology_mego, topology_ref, molecules_name, prefix)
         sys.exit()
 
     # Sets mode
-    modes = np.array(args.mode.split("+"),dtype=str)
-    modes_possible =  np.array(["intra", "same", "cross"])
+    modes = np.array(args.mode.split("+"), dtype=str)
+    modes_possible = np.array(["intra", "same", "cross"])
     args.intra = False
-    args.same  = False
+    args.same = False
     args.cross = False
     if np.any(np.isin(modes, modes_possible) == False):
-        raise ValueError(f"inserted mode {args.mode} is not correct and got evaluated to {modes}. Choose intra,same and or cross separated by '+', e.g.: intra+same or same+cross")
+        raise ValueError(
+            f"inserted mode {args.mode} is not correct and got evaluated to {modes}. Choose intra,same and or cross separated by '+', e.g.: intra+same or same+cross"
+        )
 
-    if "intra" in modes: args.intra = True
-    if "same"  in modes: args.same  = True
-    if "cross" in modes: args.cross = True
+    if "intra" in modes:
+        args.intra = True
+    if "same" in modes:
+        args.same = True
+    if "cross" in modes:
+        args.cross = True
 
     if args.residue and args.intra:
         print("Residue calculation is only possible for intermolecular calculations (not implemented yet for intramolecular).")

From 8ac6e8ff10e1f6ee49cc505335726651b28fc124 Mon Sep 17 00:00:00 2001
From: Bruno Stegani <bruno.stega8@gmail.com>
Date: Fri, 29 Nov 2024 11:25:19 +0100
Subject: [PATCH 3/5] code cleaning

---
 tools/make_mat/make_mat.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/tools/make_mat/make_mat.py b/tools/make_mat/make_mat.py
index cd20e706..24ba672c 100644
--- a/tools/make_mat/make_mat.py
+++ b/tools/make_mat/make_mat.py
@@ -52,20 +52,20 @@ def read_mat(name, protein_ref_indices, args, cumulative=False):
     return ref_df
 
 
-def zero_probability_decorator(flag):
+def zero_probability_decorator(func, flag):
     """
     Decorator of function to return 0 if flag is rased
     """
 
-    def decorator(func):
-        def wrapper(*args, **kwargs):
-            if flag:
-                return 0  # Return 0 if the flag is set
-            return func(*args, **kwargs)  # Otherwise, execute the original function
+    # def decorator(func):
+    def wrapper(*args, **kwargs):
+        if flag:
+            return 0  # Return 0 if the flag is set
+        return func(*args, **kwargs)  # Otherwise, execute the original function
 
-        return wrapper
+    return wrapper
 
-    return decorator
+    # return decorator
 
 
 def run_mat_(arguments):
@@ -124,9 +124,9 @@ def run_mat_(arguments):
             ref_df.loc[len(ref_df)] = c12_cutoff[cut_i]
 
             # calculate data
-            c12_avg_ = zero_probability_decorator(args.zero)(c12_avg)
-            calculate_probability_ = zero_probability_decorator(args.zero)(calculate_probability)
-            get_cumulative_probability_ = zero_probability_decorator(args.zero)(get_cumulative_probability)
+            c12_avg_ = zero_probability_decorator(c12_avg, args.zero)
+            calculate_probability_ = zero_probability_decorator(calculate_probability, args.zero)
+            get_cumulative_probability_ = zero_probability_decorator(get_cumulative_probability, args.zero)
 
             c12dist = ref_df.apply(lambda x: c12_avg_(ref_df.index.to_numpy(), weights=x.to_numpy()), axis=0).values
             if mat_type == "intra":

From f87eda62f0d425df45974d7c7ed1d70e62e0709a Mon Sep 17 00:00:00 2001
From: Bruno Stegani <bruno.stega8@gmail.com>
Date: Tue, 3 Dec 2024 15:41:51 +0100
Subject: [PATCH 4/5] fixed README and formatting

---
 tools/make_mat/README.md   |  6 ++++--
 tools/make_mat/make_mat.py | 19 +++++++++----------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/tools/make_mat/README.md b/tools/make_mat/README.md
index 39916bce..b61afb25 100644
--- a/tools/make_mat/README.md
+++ b/tools/make_mat/README.md
@@ -17,17 +17,19 @@ Parameters:
     
 `--mego_top`: Path to the standard multi-eGO topology of the system generated by pdb2gmx.
     
-`--inter`: Flag to indicate that the calculation is adapted to intermolecular histograms. If not provided, intramolecular calculations are assumed.
+`--mode`: modality of the histogram analysis. Can be intra,same or cross (intramolecular, itermolecular same and intermolecular cross) or any combination by "+", e.g --mode intra+cross, --mode same+cross. If not provided all of them will be calculated --mode intra+same+cross.
     
 -`-out`: Optional parameter to set the output path. Default is the current directory.
     
 `--out_name`: Optional parameter to set the output name of files. It will be added to the default one.
                 Example: intermat_<out_name>_mi_mj.ndx or intramat_<out_name>_mi_mj.ndx
     
-`--proc`: Optional parameter to set the number of processes to perform the calculation. Default is 1.
+`--num_threads`: Optional parameter to set the number of processes to perform the calculation. Default is 1.
     
 `--cutoff`: The maximum cutoff used for the accumulation of the histograms.
 
+`--zero`: Optional flag returning 0 distances and 0 probability matrices with the correct cutoff values and indeces.
+
 ## ndx2HDF5.py
 
 This scripts convert a text or a text.gz matrix in HDF5 format, this is usefull for large system because it is much faster to process.
diff --git a/tools/make_mat/make_mat.py b/tools/make_mat/make_mat.py
index 24ba672c..774b6f60 100644
--- a/tools/make_mat/make_mat.py
+++ b/tools/make_mat/make_mat.py
@@ -103,18 +103,17 @@ def run_mat_(arguments):
         results_df = pd.DataFrame()
         ai = ref_f.split(".")[-2].split("_")[-1]
 
-        if True:
-            all_ai = [ai for _ in range(1, original_size_j + 1)]
-            range_list = [str(x) for x in range(1, original_size_j + 1)]
+        all_ai = [ai for _ in range(1, original_size_j + 1)]
+        range_list = [str(x) for x in range(1, original_size_j + 1)]
 
-            results_df["ai"] = np.array(all_ai).astype(int)
-            results_df["aj"] = np.array(range_list).astype(int)
+        results_df["ai"] = np.array(all_ai).astype(int)
+        results_df["aj"] = np.array(range_list).astype(int)
 
-            results_df["mi"] = mi
-            results_df["mj"] = mj
-            results_df["c12dist"] = 0.0
-            results_df["p"] = 0.0
-            results_df["cutoff"] = 0.0
+        results_df["mi"] = mi
+        results_df["mj"] = mj
+        results_df["c12dist"] = 0.0
+        results_df["p"] = 0.0
+        results_df["cutoff"] = 0.0
 
         if np.isin(int(ai), protein_ref_indices_i):
             cut_i = np.where(protein_ref_indices_i == int(ai))[0][0]

From c1f95804afc9c5356d6e4864a5c384fe6a4af949 Mon Sep 17 00:00:00 2001
From: Bruno Stegani <bruno.stega8@gmail.com>
Date: Tue, 3 Dec 2024 18:43:23 +0100
Subject: [PATCH 5/5] last formating and cleaning

---
 tools/make_mat/make_mat.py | 83 +++++---------------------------------
 1 file changed, 10 insertions(+), 73 deletions(-)

diff --git a/tools/make_mat/make_mat.py b/tools/make_mat/make_mat.py
index 774b6f60..d1892932 100644
--- a/tools/make_mat/make_mat.py
+++ b/tools/make_mat/make_mat.py
@@ -57,7 +57,6 @@ def zero_probability_decorator(func, flag):
     Decorator of function to return 0 if flag is rased
     """
 
-    # def decorator(func):
     def wrapper(*args, **kwargs):
         if flag:
             return 0  # Return 0 if the flag is set
@@ -65,8 +64,6 @@ def wrapper(*args, **kwargs):
 
     return wrapper
 
-    # return decorator
-
 
 def run_mat_(arguments):
     """
@@ -327,35 +324,7 @@ def map_if_exists(atom_name):
         return atom_name
 
 
-# TODO is it needed anymore?
-def hallfunction(values, weights):
-    """
-    A substitute to the allfunction without considering the cutoff.
-    Does not change the data except for the removal of the cutoff from the array.
-
-    Parameters
-    ----------
-    values : np.array
-        The array of the histograms x values
-    weights : np.array
-        The array with the respective weights
-
-    Return
-    ------
-    norm : float
-        The normalization constant
-    v : np.array
-        The unchanged x values of the histogram
-    w : np.array
-        The unchanged weights of the histogram
-    """
-    v = values[:-1]
-    w = weights[:-1]
-    norm = np.sum(w)
-    return norm, v, w
-
-
-def allfunction(values, weights):
+def get_col_params(values, weights):
     """
     TODO rename pls
 
@@ -397,35 +366,7 @@ def allfunction(values, weights):
     return cutoff, i, norm, v, w
 
 
-def zero_callback(values, weights):
-    """
-    A zero callback doing nothing but returning the normalization constant, values and weights.
-    The first two return values so the function can be switched with allfunction.
-
-    Parameters
-    ----------
-    values : np.array
-        The array of the histograms x values
-    weights : np.array
-        The array with the respective weights
-
-    Returns
-    -------
-    None : None
-        None
-    None : None
-        None
-    np.sum(weights) : float
-        The normalization constant
-    values : np.array
-        The array of the histograms x values
-    weights : np.array
-        The array with the respective weights
-    """
-    return None, None, np.sum(weights), values, weights
-
-
-def calculate_probability(values, weights, callback=allfunction):
+def calculate_probability(values, weights):
     """
     Calculates a plain probability accoring to \sum_x x * dx
 
@@ -435,24 +376,22 @@ def calculate_probability(values, weights, callback=allfunction):
         The array of the histograms x values
     weights : np.array
         The array with the respective weights
-    callback : function
-        Preprocesses the data before going in to the analysis
 
     Returns
     -------
     The probability of the histogram
     """
     dx = values[1] - values[0]
-    cutoff, i, norm, v, w = callback(values, weights)
+    cutoff, i, norm, v, w = get_col_params(values, weights)
     return np.minimum(np.sum(w * dx), 1)
 
 
-def get_cumulative_probability(values, weights, callback=allfunction):
-    cutoff, i, norm, v, w = callback(values, weights)
+def get_cumulative_probability(values, weights):
+    cutoff, i, norm, v, w = get_col_params(values, weights)
     return weights[i]
 
 
-def c12_avg(values, weights, callback=allfunction):
+def c12_avg(values, weights):
     """
     Calculates the c12 averaging of a histogram as 1 / ( (\sum_i^n w[i] * (1 / x[i])^12 ) / norm )^(1/12)
 
@@ -462,14 +401,12 @@ def c12_avg(values, weights, callback=allfunction):
         The array of the histograms x values
     weights : np.array
         The array with the respective weights
-    callback : function
-        Preprocesses the data before going in to the analysis
 
     Returns
     -------
     The c12 average
     """
-    cutoff, i, norm, v, w = callback(values, weights)
+    cutoff, i, norm, v, w = get_col_params(values, weights)
     if np.sum(w) == 0:
         return 0
     r = np.where(w > 0.0)
@@ -585,7 +522,7 @@ def calculate_matrices(args):
     for mol_i in mol_list:
         if args.intra:
             prefix = f"intra_mol_{mol_i}_{mol_i}"
-            run_stuff(mol_i, mol_i, topology_mego, topology_ref, molecules_name, prefix)
+            main_routine(mol_i, mol_i, topology_mego, topology_ref, molecules_name, prefix)
         for mol_j in mol_list[mol_i - 1 :]:
             if mol_i == mol_j and not args.same:
                 continue
@@ -593,10 +530,10 @@ def calculate_matrices(args):
                 continue
 
             prefix = f"inter_mol_{mol_i}_{mol_j}"
-            run_stuff(mol_i, mol_j, topology_mego, topology_ref, molecules_name, prefix)
+            main_routine(mol_i, mol_j, topology_mego, topology_ref, molecules_name, prefix)
 
 
-def run_stuff(mol_i, mol_j, topology_mego, topology_ref, molecules_name, prefix):
+def main_routine(mol_i, mol_j, topology_mego, topology_ref, molecules_name, prefix):
 
     df = pd.DataFrame()