
Commit 1052083

Merge pull request #149 from Joon-Klaps/fixing-new-mqc-implementation
Fixing missing columns from general stats & add general stats sample filtering
2 parents c574da7 + d5f6cb2

17 files changed: +310 −293 lines

CHANGELOG.md (+1 −1)

```diff
@@ -19,12 +19,12 @@ Initial release of Joon-Klaps/viralgenie, created with the [nf-core](https://nf-
 - Add logic to allow samples with no reference hits to be analysed ([#141](https://github.com/Joon-Klaps/viralgenie/pull/141))
 - Add visualisation for hybrid scaffold ([#143](https://github.com/Joon-Klaps/viralgenie/pull/143))

-
 ### `Fixed`

 - OOM with longer contigs for lowcov_to_reference, uses more RAM now ([#125](https://github.com/Joon-Klaps/viralgenie/pull/125))
 - fixing null output from global prefix ([#147](https://github.com/Joon-Klaps/viralgenie/pull/147))
 - Fix empty filtered clusters ([#148](https://github.com/Joon-Klaps/viralgenie/pull/148))
+- Fixing missing columns from general stats & add general stats sample filtering ([#149](https://github.com/Joon-Klaps/viralgenie/pull/149))

 ### `Parameters`
 - New parameter mmseqs_cluster_mode default to 0 ([#130](https://github.com/Joon-Klaps/viralgenie/pull/130))
```

assets/custom_table_headers.yml (−1)

```diff
@@ -25,7 +25,6 @@ picard:
     - LIBRARY: "library"
 samtools:
   - multiqc_samtools_stats:
-      - sequences: "reads mapped"
       - reads_paired_percent: "reads paired %"
       - average_length
       - is_sorted
```
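Entries in this file are either bare column names or one-entry mappings from a source column to a display name. A minimal sketch of how such specs can be applied to a table, written as a hypothetical re-implementation of the `filter_and_rename_columns` helper whose signature appears in `bin/utils/pandas_tools.py` below (the real helper may differ in detail):

```python
from typing import Dict, List, Union

import pandas as pd


def apply_column_specs(data: pd.DataFrame, columns: List[Union[str, Dict[str, str]]]) -> pd.DataFrame:
    """Keep only the requested columns, renaming {source: display} entries."""
    keep: List[str] = []
    rename: Dict[str, str] = {}
    for spec in columns:
        if isinstance(spec, dict):  # e.g. {"reads_paired_percent": "reads paired %"}
            source, display = next(iter(spec.items()))
            keep.append(source)
            rename[source] = display
        else:  # bare column name, e.g. "average_length"
            keep.append(spec)
    present = [c for c in keep if c in data.columns]
    return data[present].rename(columns=rename)


# With the samtools entries kept above:
df = pd.DataFrame({"reads_paired_percent": [98.2], "average_length": [151], "is_sorted": [1]})
print(apply_column_specs(df, [{"reads_paired_percent": "reads paired %"}, "average_length", "is_sorted"]))
```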

assets/multiqc_config.yml (+2 −1)

```diff
@@ -5,7 +5,7 @@ report_comment: >

 export_plots: true
 data_format: "yaml"
-max_table_rows: 100000
+max_table_rows: 20

 report_section_order:
   samples_low_reads:
@@ -238,3 +238,4 @@ extra_fn_clean_exts:
   - ".sort"
   - ".consensus_bcftools"
   - ".consensus_ivar"
+  - ".umi_deduplicated"
```

bin/custom_multiqc.py (+116 −146)

Large diffs are not rendered by default.
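The collapsed file carries the bulk of the PR: rebuilding the general-stats table with the restored columns and dropping ignored samples. As a rough sketch of the filtering half — assuming, hypothetically, that the script holds the general-stats table as a DataFrame and uses `generate_ignore_samples` from `bin/utils/module_data_processing.py` (updated below) to decide what to drop:

```python
import pandas as pd

# Hypothetical helper name; the real wiring lives in the collapsed
# bin/custom_multiqc.py diff and may differ.
def filter_general_stats(general_stats: pd.DataFrame, ignored: pd.Series) -> pd.DataFrame:
    """Drop rows whose index was flagged for exclusion."""
    return general_stats[~general_stats.index.isin(ignored)]
```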

bin/extract_clust.py (+9 −6)

```diff
@@ -122,8 +122,8 @@ def _to_line(self, prefix):
         return "\t".join(
             [
                 str(prefix),
-                str(self.taxid),
                 str(self.cluster_id),
+                str(self.taxid),
                 str(self.centroid),
                 str(self.cluster_size),
                 "\t".join(map(str, rounded_depth)),
@@ -326,7 +326,7 @@ def write_clusters_to_tsv(clusters, prefix):
     """
     with open(f"{prefix}.clusters.tsv", "w") as file:
         assemblers = [f"cumulative read depth - {assembler} [%]" for assembler in clusters[0].cumulative_read_depth.keys()]
-        file.write("\t".join(["sample", "taxon-id", "cluster-id", "centroid", "size"] + assemblers + ["members"]))
+        file.write("\t".join(["sample", "cluster", "taxon-id", "centroid", "number of members"] + assemblers + ["members"]))
         file.write("\n")
         for cluster in clusters:
             file.write(cluster._to_line(prefix))
@@ -458,7 +458,10 @@ def filter_members(clusters, pattern):
            filtered_clusters.append(cluster)
    return filtered_clusters

-def filter_clusters_by_coverage(clusters: List[Cluster], coverages: Dict, threshold: float,keep_n_clusters: int) -> Tuple[List[Cluster], List[Cluster]]:
+
+def filter_clusters_by_coverage(
+    clusters: List[Cluster], coverages: Dict, threshold: float, keep_n_clusters: int
+) -> Tuple[List[Cluster], List[Cluster]]:
     """
     Filter clusters on coverage, only keep clusters with a coverage above the threshold. If no clusters are kept, return top 5.
     """
@@ -471,9 +474,9 @@ def filter_clusters_by_coverage(clusters: List[Cluster], coverages: Dict, thresh
            filtered_clusters.append(cluster)

     if filtered_clusters:
-        return clusters,filtered_clusters
+        return clusters, filtered_clusters

-    sorted_clusters = sorted(clusters, key=lambda x: sum(x.cumulative_read_depth), reverse= True)
+    sorted_clusters = sorted(clusters, key=lambda x: sum(x.cumulative_read_depth), reverse=True)
     return sorted_clusters, sorted_clusters[:keep_n_clusters]


@@ -610,7 +613,7 @@ def main(argv=None):
     # Filter clusters by coverage
     if args.coverages:
         coverages = read_coverages(args.coverages)
-        clusters,filtered_clusters = filter_clusters_by_coverage(filtered_clusters, coverages, args.perc_reads_contig, args.keep_clusters)
+        clusters, filtered_clusters = filter_clusters_by_coverage(filtered_clusters, coverages, args.perc_reads_contig, args.keep_clusters)
         logger.info("Filtered clusters by coverage, %d were removed.", len(clusters_renamed) - len(filtered_clusters))

     assert len(filtered_clusters) != 0, "No clusters left after filtering."
```
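The first two hunks are one logical fix: `_to_line` now emits the cluster id before the taxon id, matching the reordered header written by `write_clusters_to_tsv`. A minimal alignment check, using a simplified stand-in for the real `Cluster` class (depth columns and members omitted):

```python
from dataclasses import dataclass


@dataclass
class Cluster:
    # Simplified stand-in; the real class also carries read depths and members.
    cluster_id: str
    taxid: str
    centroid: str
    cluster_size: int


HEADER = ["sample", "cluster", "taxon-id", "centroid", "number of members"]


def to_line(prefix: str, c: Cluster) -> str:
    # Field order mirrors the corrected _to_line: cluster-id now precedes taxon-id.
    return "\t".join([prefix, c.cluster_id, c.taxid, c.centroid, str(c.cluster_size)])


values = to_line("sample1", Cluster("cl_0", "11320", "contig_42", 7)).split("\t")
assert len(values) == len(HEADER)  # every value lands under its intended header
```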

bin/utils/constant_variables.py (+7 −2)

```diff
@@ -21,15 +21,20 @@
 ]

 CONSTRAIN_GENERAL_STATS_COLUMNS = [
-    "read_mapped",
+    "reads_mapped",
+    "reads_mapped_percent",
     "reads_unmapped",
     "number_of_SNPs",
-    "number_of_indels" "CLUSTER: mosdepth.mean_coverage",
+    "number_of_indels",
+    "CLUSTER: mosdepth.mean_coverage",
     "CLUSTER: mosdepth.min_coverage",
     "CLUSTER: mosdepth.max_coverage",
     "CLUSTER: mosdepth.median_coverage",
     "CLUSTER: mosdepth.1_x_pc",
     "CLUSTER: mosdepth.10_x_pc",
+    "CLUSTER: mosdepth.50_x_pc",
+    "CLUSTER: mosdepth.100_x_pc",
+    "CLUSTER: mosdepth.200_x_pc",
     "qlen",
     "(quast) % N's",
     "(mash-screen) query-ID",
```

bin/utils/module_data_processing.py (+94 −10)

```diff
@@ -2,7 +2,7 @@

 import logging
 import re
-from typing import Dict, List, Union
+from typing import Dict, List, Union, Tuple, Optional, Any

 import pandas as pd

@@ -15,6 +15,7 @@
     reorder_columns,
     reorder_rows,
     split_index_column,
+    filter_and_rename_columns,
 )

 logger = logging.getLogger()
@@ -113,7 +114,7 @@ def parse_annotation_data(annotation_str):
     return annotation_dict


-def reformat_custom_df(df):
+def reformat_custom_df(df: pd.DataFrame, cluster_df: pd.DataFrame) -> pd.DataFrame:
     """
     Reformat the custom dataframe.
     """
@@ -124,6 +125,10 @@ def reformat_custom_df(df):

     df = split_index_column(df)

+    if not cluster_df.empty:
+        df = pd.merge(df, cluster_df, on=["sample", "cluster"], how="left")
+        df.index = df["index"]
+
     # Reorder the columns
     logger.info("Reordering columns")
     final_columns = ["index", "sample", "cluster", "step"] + [
@@ -133,27 +138,28 @@ def reformat_custom_df(df):
             "mash-screen",
             "blast",
             "checkv",
-            "QC check",
+            "cluster",
             "quast",
         ]
         for column in df.columns
         if group in column
     ]
-    return reorder_columns(df, final_columns)
+    return reorder_columns(df.dropna(subset=["step"]), list(dict.fromkeys(final_columns)))


-def filter_constrain(df, column, value):
+def filter_constrain(dataframe, column, value):
     """
     Filter a dataframe based on a column and a regex value.

     Args:
-        df (pd.DataFrame): The dataframe to be filtered.
+        dataframe (pd.DataFrame): The dataframe to be filtered.
         column (str): The column to filter on.
         regex_value (str): The regex value to filter on.

     Returns:
         pd.DataFrame, pd.DataFrame: The filtered dataframe with the regex value and the filtered dataframe without the regex value.
     """
+    df = dataframe.copy()
     # Find rows with the regex value
     locations = df[column].str.contains(value) | df["step"].str.contains("constrain")

@@ -188,8 +194,12 @@ def create_constrain_summary(df_constrain: pd.DataFrame, file_columns: List[Unio
         else:
             dic_columns[item] = item

+    logger.debug("dic_columns: %s", dic_columns)
+
     columns_of_interest = [dic_columns.get(key, key) for key in CONSTRAIN_GENERAL_STATS_COLUMNS]

+    logger.debug("columns_of_interest: %s", columns_of_interest)
+
     if not columns_of_interest:
         logger.warning("No columns of interest were found to create the constrain summary table!")
         return pd.DataFrame()
@@ -219,6 +229,7 @@
     df_constrain = df_constrain[present_columns]

     if df_constrain.empty:
+        logger.warning("The constrain DataFrame is empty.")
         return df_constrain

     df_constrain = df_constrain.rename(columns=COLUMN_MAPPING)
@@ -260,7 +271,8 @@ def reformat_constrain_df(df, file_columns, args):
     """
     # Separate table for mapping constrains
     if df.empty:
-        return df
+        logger.warning("The constrain DataFrame is empty.")
+        return df, df

     # Add constrain metadata to the mapping constrain table
     constrain_meta = filelist_to_df([args.mapping_constrains])
@@ -296,14 +308,12 @@ def generate_ignore_samples(dataframe: pd.DataFrame) -> pd.Series:
     Generate a Series of indices that are not part of the df_snip dataframe.

     Parameters:
-        dataframe (pd.DataFrame): The input DataFrame to ocess.
+        dataframe (pd.DataFrame): The input DataFrame to process.

     Returns:
         pd.Series: A Series containing the indices that are not in df_snip.
     """
     df = dataframe.copy()
-    df = drop_columns(df, ["index"])
-    df["index"] = df.index
     df = split_index_column(df)

     df = reorder_rows(df)
@@ -322,3 +332,77 @@ def add_prefix_to_values_dict(data: List[Union[str, Dict[str, str]]], prefix: s
         else:
             updated_items.extend({key: f"({prefix}) {value}"} for key, value in item.items())
     return updated_items
+
+
+def check_section_exists(module_data: Dict, section_key: str) -> bool:
+    """Check if a section exists in the module data."""
+    return any(section_key in key for key in module_data.keys())
+
+
+def extract_mqc_from_simple_section(all_module_data: Dict, section: Optional[str], module: str) -> Tuple[List[pd.DataFrame], List[Any]]:
+    """Handle simple string or None section cases."""
+    logger.debug("Extracting data from simple str %s", module)
+    if not section:
+        # Return all data if no specific section is specified
+        return [pd.DataFrame.from_dict(all_module_data, orient="index")], []
+
+    # Check if the specific section exists
+    if check_section_exists(all_module_data, section):
+        return [pd.DataFrame.from_dict(all_module_data[section], orient="index")], []
+
+    logger.warning(f"Section {section} not found in module {module}")
+    return [pd.DataFrame()], []
+
+
+def extract_mqc_from_list_section(all_module_data: Dict, section: List, module: str) -> Tuple[List[pd.DataFrame], List[Any]]:
+    """Handle list-based section specifications."""
+    logger.debug("Extracting data from list %s : %s", module, section)
+    # Case for a list of column names
+    if all(not isinstance(item, dict) or not isinstance(list(item.values())[0], list) for item in section):
+        full_df = pd.DataFrame.from_dict(all_module_data, orient="index")
+        return [filter_and_rename_columns(full_df, section)], section
+
+    # Handle nested section lists
+    result_dfs = []
+    result_columns = []
+    for subsection in section:
+        # Handle different types of subsections
+        if isinstance(subsection, str):
+            # Simple section name
+            subsection_dfs, subsection_columns = extract_mqc_from_simple_section(all_module_data, subsection, module)
+        elif isinstance(subsection, list):
+            # Nested list of specifications
+            subsection_dfs, subsection_columns = extract_mqc_from_list_section(all_module_data, subsection, module)
+        elif isinstance(subsection, dict):
+            # Dictionary-based section specification
+            subsection_dfs, subsection_columns = extract_mqc_from_dict_section(all_module_data, subsection, module)
+        else:
+            # Unsupported subsection type
+            logger.warning(f"Unsupported subsection type: {type(subsection)}")
+            continue
+
+        result_dfs.extend(subsection_dfs)
+        result_columns.extend(subsection_columns)
+
+    return result_dfs, result_columns
+
+
+def extract_mqc_from_dict_section(all_module_data: Dict, section: Dict, module: str) -> Tuple[List[pd.DataFrame], List[Any]]:
+    """Handle dictionary-based section specifications."""
+    logger.debug("Extracting data from dict %s, %s", module, section)
+    # Extract the section name and column specifications
+    section_name, columns = next(iter(section.items()))
+
+    # Check if the section exists
+    if check_section_exists(all_module_data, section_name):
+        # Find the matching section data
+        section_data = next((data for key, data in all_module_data.items() if section_name in key), None)
+
+        if section_data:
+            # Convert to a DataFrame and filter columns
+            data = pd.DataFrame.from_dict(section_data, orient="index")
+            filtered_data = filter_and_rename_columns(data, columns)
+            return [filtered_data], columns
+
+    logger.warning(f"Section '{section_name}' not found in module '{module}'")
+    return [pd.DataFrame()], []
```
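The new `extract_mqc_from_*` helpers dispatch on the shape of a section specification: `None` or a string selects a whole section, a list selects (and optionally renames) columns or recurses into nested specifications, and a one-entry dict pairs a section name with its column list. A usage sketch with made-up module data — the section and metric names below are illustrative, and the import assumes the `bin/utils/` layout of this repo:

```python
import pandas as pd

from utils.module_data_processing import (
    extract_mqc_from_dict_section,
    extract_mqc_from_list_section,
    extract_mqc_from_simple_section,
)

# Illustrative stand-in for MultiQC module data: section -> sample -> metrics.
all_module_data = {
    "multiqc_samtools_stats": {
        "sample1_cl0_constrain": {"reads_mapped": 1000, "reads_unmapped": 5},
    },
}

# str (or None) spec: take a whole section as one DataFrame.
dfs, _ = extract_mqc_from_simple_section(all_module_data, "multiqc_samtools_stats", "samtools")

# list spec: filter/rename columns of a section's table.
dfs, cols = extract_mqc_from_list_section(
    all_module_data["multiqc_samtools_stats"],
    [{"reads_mapped": "reads mapped"}, "reads_unmapped"],
    "samtools",
)

# dict spec: a section name paired with its column specification.
dfs, cols = extract_mqc_from_dict_section(
    all_module_data, {"multiqc_samtools_stats": [{"reads_mapped": "reads mapped"}]}, "samtools"
)
```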

bin/utils/pandas_tools.py (+13 −9)

```diff
@@ -7,12 +7,13 @@

 import pandas as pd

+
 logger = logging.getLogger()


 def reorder_columns(df, columns):
     """
-    Try to reorder columns in a dataframe and return the dataframe.
+    Try to reorder columns in a dataframe and return the dataframe - keep all columns

     Args:
         df (pd.DataFrame): The dataframe to reorder columns in.
@@ -24,6 +25,7 @@ def reorder_columns(df, columns):
     df = df[[column for column in columns if column in df.columns] + df.columns.difference(columns, sort=False).tolist()]
     return df

+
 def reorder_rows(dataframe):
     """
     Reorder the rows in the DataFrame based on the ranking of the steps.
@@ -66,7 +68,7 @@ def coalesce_constrain(dataframe):
     return result.query('step == "constrain"')


-def split_index_column(df: pd.DataFrame, prefix: str = None, split_column: str = "index") -> pd.DataFrame:
+def split_index_column(dataframe: pd.DataFrame, prefix: str = None, split_column: str = "index") -> pd.DataFrame:
     """
     Split the index column of the DataFrame into separate columns for sample name, cluster, and step.

@@ -78,13 +80,14 @@ def split_index_column(df: pd.DataFrame, prefix: str = None, split_column: str =
     Returns:
         pd.DataFrame: The updated DataFrame with separate columns for sample name, cluster, and step.
     """
-    df_copy = df.copy()
+    df = dataframe.copy()
     # Reset the index and rename the index column
-    df_copy = df_copy.reset_index(drop=True).rename(columns={df_copy.index.name: split_column})
-    df_copy = df_copy[df_copy[split_column].str.contains("_", na=False)]
+    if split_column not in df.columns:
+        df[split_column] = df.index
+    df = df[df[split_column].str.contains("_", na=False)]

     # Apply the dynamic split function to each row in the column
-    split_data = df_copy[split_column].apply(dynamic_split).apply(pd.Series)
+    split_data = df[split_column].apply(dynamic_split).apply(pd.Series)

     # Take the first three columns and rename them
     split_data = split_data.iloc[:, :3]
@@ -97,11 +100,11 @@ def split_index_column(df: pd.DataFrame, prefix: str = None, split_column: str =
         inplace=True,
     )

-    df_copy = drop_columns(df_copy, ["sample", "cluster", "step"])
+    df = drop_columns(df, ["sample", "cluster", "step"])
     # Concatenate the original DataFrame and the split data
-    df_copy = pd.concat([df_copy, split_data], axis=1)
+    df = pd.concat([df, split_data], axis=1)

-    return df_copy
+    return df


 def fill_group_na(s):
@@ -175,6 +178,7 @@ def generate_indexed_df(df: pd.DataFrame, prefix: str = None, column_to_split: s
     )
     return result_df

+
 def filter_and_rename_columns(data: pd.DataFrame, columns: List[Union[str, Dict[str, str]]]) -> pd.DataFrame:
     """
     Filter and rename columns in a DataFrame.
```
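`split_index_column` no longer unconditionally resets the index: it reuses an existing `index` column and only derives one from the DataFrame's index when that column is absent, which is what lets `reformat_custom_df` re-index after the cluster merge above. A minimal sketch of the intended splitting, assuming composite indices of the form `<sample>_<cluster>_<step>` and substituting a plain `str.split` for the repo's `dynamic_split` helper:

```python
import pandas as pd

df = pd.DataFrame({"qlen": [29903]}, index=["sample1_cl0_constrain"])

# The new guard: derive the "index" column from the index only when missing.
if "index" not in df.columns:
    df["index"] = df.index

# Stand-in for dynamic_split(): split into at most three parts.
df[["sample", "cluster", "step"]] = df["index"].str.split("_", n=2, expand=True)

print(df[["index", "sample", "cluster", "step", "qlen"]].to_string(index=False))
# index: sample1_cl0_constrain -> sample="sample1", cluster="cl0", step="constrain"
```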
