Commit d5f6cb2

patch inclusion of cluster info

1 parent 1904531

6 files changed (+22 -14 lines)

bin/custom_multiqc.py (+4 -3)

@@ -257,7 +257,7 @@ def load_custom_data(args) -> List[pd.DataFrame]:
         result.extend([annotation_df])
 
     # Cluster table - cluster summary of members & centroids
-    clusters_df = filelist_to_df(args.clusters_files)
+    clusters_df = filelist_to_df(args.clusters_files)
     if not clusters_df.empty:
         clusters_df = clusters_df.add_prefix("(cluster) ")
         clusters_df = clusters_df.rename(columns={"(cluster) sample": "sample", "(cluster) cluster": "cluster"})
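The removed and added lines at 260 are textually identical, so that pair is most likely a whitespace-only change. For context, the prefix-then-rename step namespaces every cluster column except the two join keys; a toy pandas round-trip (illustrative data only):

    import pandas as pd

    df = pd.DataFrame({"sample": ["s1"], "cluster": [0], "size": [42]})
    df = df.add_prefix("(cluster) ").rename(
        columns={"(cluster) sample": "sample", "(cluster) cluster": "cluster"}
    )
    print(df.columns.tolist())  # ['sample', 'cluster', '(cluster) size']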
@@ -282,8 +282,8 @@ def get_general_stats_data_mod(sample: Optional[str] = None) -> Dict:
     for row in rows:
         for key, val in row.data.items():
             if key in header:
-                namespace = header[key].get('namespace', key).replace("SAMPLE: ", "")
-                final_key = f"{namespace}. {header[key].get('title', key)}" if header[key].get('title') else key
+                namespace = header[key].get("namespace", key).replace("SAMPLE: ", "")
+                final_key = f"{namespace}. {header[key].get('title', key)}" if header[key].get("title") else key
                 data[s][final_key] = val
     if sample:
         if not data:
@@ -511,6 +511,7 @@ def main(argv=None):
 
     # 5.2 reformat the dataframe
     mqc_custom_df = reformat_custom_df(mqc_custom_df, cluster_df)
+    mqc_custom_df.to_csv("mqc_custom_df.after.tsv", sep="\t")
 
     # 5.3 split up denovo constructs and mapping (-CONSTRAIN) results
     logger.info("Splitting up denovo constructs and mapping (-CONSTRAIN) results")
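The added to_csv line dumps the reformatted table to mqc_custom_df.after.tsv, which reads like a debugging aid for checking that the cluster columns merged in correctly. Assuming the default index that to_csv writes, the dump can be inspected with:

    import pandas as pd

    df = pd.read_csv("mqc_custom_df.after.tsv", sep="\t", index_col=0)
    print(df.head())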

bin/extract_clust.py (+8 -5)
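The first hunk swaps the order of cluster_id and taxid in the tab-separated line this method emits; any downstream code that parses these lines by position needs the matching reorder.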
@@ -122,8 +122,8 @@ def _to_line(self, prefix):
         return "\t".join(
             [
                 str(prefix),
-                str(self.taxid),
                 str(self.cluster_id),
+                str(self.taxid),
                 str(self.centroid),
                 str(self.cluster_size),
                 "\t".join(map(str, rounded_depth)),
@@ -458,7 +458,10 @@ def filter_members(clusters, pattern):
             filtered_clusters.append(cluster)
     return filtered_clusters
 
-def filter_clusters_by_coverage(clusters: List[Cluster], coverages: Dict, threshold: float,keep_n_clusters: int) -> Tuple[List[Cluster], List[Cluster]]:
+
+def filter_clusters_by_coverage(
+    clusters: List[Cluster], coverages: Dict, threshold: float, keep_n_clusters: int
+) -> Tuple[List[Cluster], List[Cluster]]:
     """
     Filter clusters on coverage, only keep clusters with a coverage above the threshold. If no clusters are kept, return top 5.
     """
@@ -471,9 +474,9 @@ def filter_clusters_by_coverage(
             filtered_clusters.append(cluster)
 
     if filtered_clusters:
-        return clusters,filtered_clusters
+        return clusters, filtered_clusters
 
-    sorted_clusters = sorted(clusters, key=lambda x: sum(x.cumulative_read_depth), reverse= True)
+    sorted_clusters = sorted(clusters, key=lambda x: sum(x.cumulative_read_depth), reverse=True)
     return sorted_clusters, sorted_clusters[:keep_n_clusters]
 
 
@@ -610,7 +613,7 @@ def main(argv=None):
     # Filter clusters by coverage
     if args.coverages:
         coverages = read_coverages(args.coverages)
-        clusters,filtered_clusters = filter_clusters_by_coverage(filtered_clusters, coverages, args.perc_reads_contig, args.keep_clusters)
+        clusters, filtered_clusters = filter_clusters_by_coverage(filtered_clusters, coverages, args.perc_reads_contig, args.keep_clusters)
         logger.info("Filtered clusters by coverage, %d were removed.", len(clusters_renamed) - len(filtered_clusters))
 
     assert len(filtered_clusters) != 0, "No clusters left after filtering."
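These hunks are cosmetic (Black-style spacing and a wrapped signature); the behaviour stays: keep clusters that pass the coverage test, and if none pass, fall back to the keep_n_clusters deepest ones. Note the docstring still says "return top 5" while the code actually returns the top keep_n_clusters. A minimal sketch of that fallback, with a stand-in predicate since the real coverage test sits outside these hunks:

    def keep_passing_or_top_n(clusters, passes, depth, n):
        # Mirrors the (all, kept) return shape of filter_clusters_by_coverage.
        kept = [c for c in clusters if passes(c)]
        if kept:
            return clusters, kept
        ranked = sorted(clusters, key=depth, reverse=True)
        return ranked, ranked[:n]

    # With a predicate nothing passes, the two deepest clusters survive:
    toy = [("a", [1.0]), ("b", [3.0]), ("c", [2.0])]
    print(keep_passing_or_top_n(toy, lambda c: False, lambda c: sum(c[1]), 2)[1])
    # [('b', [3.0]), ('c', [2.0])]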

bin/utils/module_data_processing.py (+6 -4)
@@ -113,6 +113,7 @@ def parse_annotation_data(annotation_str):
         annotation_dict[key] = value
     return annotation_dict
 
+
 def reformat_custom_df(df: pd.DataFrame, cluster_df: pd.DataFrame) -> pd.DataFrame:
     """
     Reformat the custom dataframe.
@@ -125,7 +126,7 @@ def reformat_custom_df(df: pd.DataFrame, cluster_df: pd.DataFrame) -> pd.DataFrame:
     df = split_index_column(df)
 
     if not cluster_df.empty:
-        df = pd.merge(df, cluster_df, on=['sample', 'cluster'], how = "left")
+        df = pd.merge(df, cluster_df, on=["sample", "cluster"], how="left")
     df.index = df["index"]
 
     # Reorder the columns
@@ -143,21 +144,22 @@ def reformat_custom_df(df: pd.DataFrame, cluster_df: pd.DataFrame) -> pd.DataFrame:
         for column in df.columns
         if group in column
     ]
-    return reorder_columns(df.dropna(subset=['step']), final_columns)
+    return reorder_columns(df.dropna(subset=["step"]), list(dict.fromkeys(final_columns)))
 
 
-def filter_constrain(df, column, value):
+def filter_constrain(dataframe, column, value):
     """
     Filter a dataframe based on a column and a regex value.
 
     Args:
-        df (pd.DataFrame): The dataframe to be filtered.
+        dataframe (pd.DataFrame): The dataframe to be filtered.
         column (str): The column to filter on.
         regex_value (str): The regex value to filter on.
 
     Returns:
         pd.DataFrame, pd.DataFrame: The filtered dataframe with the regex value and the filtered dataframe without the regex value.
     """
+    df = dataframe.copy()
     # Find rows with the regex value
     locations = df[column].str.contains(value) | df["step"].str.contains("constrain")

conf/modules.config (+2)
@@ -892,6 +892,7 @@ process {
     }
 
     withName: RENAME_FASTA_HEADER_SINGLETON {
+        ext.prefix = { "${meta.id}_singleton" } // DON'T CHANGE
         publishDir = [
             path: { "${params.outdir}/consensus/seq/consensus/${meta.sample}"},
             mode: params.publish_dir_mode,
@@ -954,6 +955,7 @@ process {
     }
 
     withName: RENAME_FASTA_HEADER_CONTIG_CONSENSUS{
+        ext.prefix = { "${meta.id}_consensus" } // DON'T CHANGE
         publishDir = [
             path: { "${params.outdir}/consensus/seq/consensus/${meta.sample}"},
             mode: params.publish_dir_mode,

subworkflows/local/align_collapse_contigs.nf (+1 -1)
@@ -64,7 +64,7 @@ workflow ALIGN_COLLAPSE_CONTIGS {
     )
     ch_versions= ch_versions.mix(IVAR_CONTIG_CONSENSUS.out.versions.first())
 
-    RENAME_FASTA_HEADER_CONTIG_CONSENSUS( IVAR_CONTIG_CONSENSUS.out.fasta, "consensus" )
+    RENAME_FASTA_HEADER_CONTIG_CONSENSUS( IVAR_CONTIG_CONSENSUS.out.fasta, [])
     ch_versions = ch_versions.mix(RENAME_FASTA_HEADER_CONTIG_CONSENSUS.out.versions.first())
 
     // If external, there possibly regions that require patching

subworkflows/local/singleton_filtering.nf (+1 -1)
@@ -19,7 +19,7 @@ workflow SINGLETON_FILTERING {
     // Rename to avoid errors downstream
     RENAME_FASTA_HEADER_SINGLETON(
         contig,
-        "singleton.contig"
+        []
     )
     ch_versions = ch_versions.mix(RENAME_FASTA_HEADER_SINGLETON.out.versions)

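Taken together, the two subworkflow edits stop passing hard-coded prefixes ("consensus", "singleton.contig") and hand the module an empty list instead, presumably so RENAME_FASTA_HEADER falls back to the ext.prefix values now pinned in conf/modules.config; the "DON'T CHANGE" comments flag that downstream steps appear to depend on those exact "_singleton" and "_consensus" prefixes.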