
Commit 1052083

Merge pull request #149 from Joon-Klaps/fixing-new-mqc-implementation
Fixing missing columns from general stats & add general stats sample filtering
2 parents c574da7 + d5f6cb2

17 files changed: +310 −293 lines

CHANGELOG.md (+1 −1)

```diff
@@ -19,12 +19,12 @@ Initial release of Joon-Klaps/viralgenie, created with the [nf-core](https://nf-
 - Add logic to allow samples with no reference hits to be analysed ([#141](https://github.com/Joon-Klaps/viralgenie/pull/141))
 - Add visualisation for hybrid scaffold ([#143](https://github.com/Joon-Klaps/viralgenie/pull/143))

-
 ### `Fixed`

 - OOM with longer contigs for lowcov_to_reference, uses more RAM now ([#125](https://github.com/Joon-Klaps/viralgenie/pull/125))
 - fixing null output from global prefix ([#147](https://github.com/Joon-Klaps/viralgenie/pull/147))
 - Fix empty filtered clusters ([#148](https://github.com/Joon-Klaps/viralgenie/pull/148))
+- Fixing missing columns from general stats & add general stats sample filtering ([#149](https://github.com/Joon-Klaps/viralgenie/pull/149))

 ### `Parameters`
 - New parameter mmseqs_cluster_mode default to 0 ([#130](https://github.com/Joon-Klaps/viralgenie/pull/130))
```

assets/custom_table_headers.yml (−1)

```diff
@@ -25,7 +25,6 @@ picard:
     - LIBRARY: "library"
 samtools:
   - multiqc_samtools_stats:
-      - sequences: "reads mapped"
       - reads_paired_percent: "reads paired %"
       - average_length
       - is_sorted
```
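Entries in this file are either bare column names or one-entry mappings from a source column to a display name. A minimal sketch of how such specs can be applied to a table, written as a hypothetical re-implementation of the `filter_and_rename_columns` helper whose signature appears in `bin/utils/pandas_tools.py` below (the real helper may differ in detail):

```python
from typing import Dict, List, Union

import pandas as pd


def apply_column_specs(data: pd.DataFrame, columns: List[Union[str, Dict[str, str]]]) -> pd.DataFrame:
    """Keep only the requested columns, renaming {source: display} entries."""
    keep: List[str] = []
    rename: Dict[str, str] = {}
    for spec in columns:
        if isinstance(spec, dict):  # e.g. {"reads_paired_percent": "reads paired %"}
            source, display = next(iter(spec.items()))
            keep.append(source)
            rename[source] = display
        else:  # bare column name, e.g. "average_length"
            keep.append(spec)
    present = [c for c in keep if c in data.columns]
    return data[present].rename(columns=rename)


# With the samtools entries kept above:
df = pd.DataFrame({"reads_paired_percent": [98.2], "average_length": [151], "is_sorted": [1]})
print(apply_column_specs(df, [{"reads_paired_percent": "reads paired %"}, "average_length", "is_sorted"]))
```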

assets/multiqc_config.yml (+2 −1)

```diff
@@ -5,7 +5,7 @@ report_comment: >

 export_plots: true
 data_format: "yaml"
-max_table_rows: 100000
+max_table_rows: 20

 report_section_order:
   samples_low_reads:
@@ -238,3 +238,4 @@ extra_fn_clean_exts:
   - ".sort"
   - ".consensus_bcftools"
   - ".consensus_ivar"
+  - ".umi_deduplicated"
```

bin/custom_multiqc.py (+116 −146)

Large diffs are not rendered by default.
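The collapsed file carries the bulk of the PR: rebuilding the general-stats table with the restored columns and dropping ignored samples. As a rough sketch of the filtering half — assuming, hypothetically, that the script holds the general-stats table as a DataFrame and uses `generate_ignore_samples` from `bin/utils/module_data_processing.py` (updated below) to decide what to drop:

```python
import pandas as pd

# Hypothetical helper name; the real wiring lives in the collapsed
# bin/custom_multiqc.py diff and may differ.
def filter_general_stats(general_stats: pd.DataFrame, ignored: pd.Series) -> pd.DataFrame:
    """Drop rows whose index was flagged for exclusion."""
    return general_stats[~general_stats.index.isin(ignored)]
```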

bin/extract_clust.py (+9 −6)

```diff
@@ -122,8 +122,8 @@ def _to_line(self, prefix):
         return "\t".join(
             [
                 str(prefix),
-                str(self.taxid),
                 str(self.cluster_id),
+                str(self.taxid),
                 str(self.centroid),
                 str(self.cluster_size),
                 "\t".join(map(str, rounded_depth)),
@@ -326,7 +326,7 @@ def write_clusters_to_tsv(clusters, prefix):
     """
     with open(f"{prefix}.clusters.tsv", "w") as file:
         assemblers = [f"cumulative read depth - {assembler} [%]" for assembler in clusters[0].cumulative_read_depth.keys()]
-        file.write("\t".join(["sample", "taxon-id", "cluster-id", "centroid", "size"] + assemblers + ["members"]))
+        file.write("\t".join(["sample", "cluster", "taxon-id", "centroid", "number of members"] + assemblers + ["members"]))
         file.write("\n")
         for cluster in clusters:
             file.write(cluster._to_line(prefix))
@@ -458,7 +458,10 @@ def filter_members(clusters, pattern):
            filtered_clusters.append(cluster)
    return filtered_clusters

-def filter_clusters_by_coverage(clusters: List[Cluster], coverages: Dict, threshold: float,keep_n_clusters: int) -> Tuple[List[Cluster], List[Cluster]]:
+
+def filter_clusters_by_coverage(
+    clusters: List[Cluster], coverages: Dict, threshold: float, keep_n_clusters: int
+) -> Tuple[List[Cluster], List[Cluster]]:
     """
     Filter clusters on coverage, only keep clusters with a coverage above the threshold. If no clusters are kept, return top 5.
     """
@@ -471,9 +474,9 @@ def filter_clusters_by_coverage(clusters: List[Cluster], coverages: Dict, thresh
            filtered_clusters.append(cluster)

     if filtered_clusters:
-        return clusters,filtered_clusters
+        return clusters, filtered_clusters

-    sorted_clusters = sorted(clusters, key=lambda x: sum(x.cumulative_read_depth), reverse= True)
+    sorted_clusters = sorted(clusters, key=lambda x: sum(x.cumulative_read_depth), reverse=True)
     return sorted_clusters, sorted_clusters[:keep_n_clusters]


@@ -610,7 +613,7 @@ def main(argv=None):
     # Filter clusters by coverage
     if args.coverages:
         coverages = read_coverages(args.coverages)
-        clusters,filtered_clusters = filter_clusters_by_coverage(filtered_clusters, coverages, args.perc_reads_contig, args.keep_clusters)
+        clusters, filtered_clusters = filter_clusters_by_coverage(filtered_clusters, coverages, args.perc_reads_contig, args.keep_clusters)
         logger.info("Filtered clusters by coverage, %d were removed.", len(clusters_renamed) - len(filtered_clusters))

     assert len(filtered_clusters) != 0, "No clusters left after filtering."
```
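The first two hunks are one logical fix: `_to_line` now emits the cluster id before the taxon id, matching the reordered header written by `write_clusters_to_tsv`. A minimal alignment check, using a simplified stand-in for the real `Cluster` class (depth columns and members omitted):

```python
from dataclasses import dataclass


@dataclass
class Cluster:
    # Simplified stand-in; the real class also carries read depths and members.
    cluster_id: str
    taxid: str
    centroid: str
    cluster_size: int


HEADER = ["sample", "cluster", "taxon-id", "centroid", "number of members"]


def to_line(prefix: str, c: Cluster) -> str:
    # Field order mirrors the corrected _to_line: cluster-id now precedes taxon-id.
    return "\t".join([prefix, c.cluster_id, c.taxid, c.centroid, str(c.cluster_size)])


values = to_line("sample1", Cluster("cl_0", "11320", "contig_42", 7)).split("\t")
assert len(values) == len(HEADER)  # every value lands under its intended header
```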

bin/utils/constant_variables.py (+7 −2)

```diff
@@ -21,15 +21,20 @@
 ]

 CONSTRAIN_GENERAL_STATS_COLUMNS = [
-    "read_mapped",
+    "reads_mapped",
+    "reads_mapped_percent",
     "reads_unmapped",
     "number_of_SNPs",
-    "number_of_indels" "CLUSTER: mosdepth.mean_coverage",
+    "number_of_indels",
+    "CLUSTER: mosdepth.mean_coverage",
     "CLUSTER: mosdepth.min_coverage",
     "CLUSTER: mosdepth.max_coverage",
     "CLUSTER: mosdepth.median_coverage",
     "CLUSTER: mosdepth.1_x_pc",
     "CLUSTER: mosdepth.10_x_pc",
+    "CLUSTER: mosdepth.50_x_pc",
+    "CLUSTER: mosdepth.100_x_pc",
+    "CLUSTER: mosdepth.200_x_pc",
     "qlen",
     "(quast) % N's",
     "(mash-screen) query-ID",
```

bin/utils/module_data_processing.py (+94 −10)

```diff
@@ -2,7 +2,7 @@

 import logging
 import re
-from typing import Dict, List, Union
+from typing import Dict, List, Union, Tuple, Optional, Any

 import pandas as pd

@@ -15,6 +15,7 @@
     reorder_columns,
     reorder_rows,
     split_index_column,
+    filter_and_rename_columns,
 )

 logger = logging.getLogger()
@@ -113,7 +114,7 @@ def parse_annotation_data(annotation_str):
     return annotation_dict


-def reformat_custom_df(df):
+def reformat_custom_df(df: pd.DataFrame, cluster_df: pd.DataFrame) -> pd.DataFrame:
     """
     Reformat the custom dataframe.
     """
@@ -124,6 +125,10 @@ def reformat_custom_df(df):

     df = split_index_column(df)

+    if not cluster_df.empty:
+        df = pd.merge(df, cluster_df, on=["sample", "cluster"], how="left")
+        df.index = df["index"]
+
     # Reorder the columns
     logger.info("Reordering columns")
     final_columns = ["index", "sample", "cluster", "step"] + [
@@ -133,27 +138,28 @@ def reformat_custom_df(df):
             "mash-screen",
             "blast",
             "checkv",
-            "QC check",
+            "cluster",
             "quast",
         ]
         for column in df.columns
         if group in column
     ]
-    return reorder_columns(df, final_columns)
+    return reorder_columns(df.dropna(subset=["step"]), list(dict.fromkeys(final_columns)))


-def filter_constrain(df, column, value):
+def filter_constrain(dataframe, column, value):
     """
     Filter a dataframe based on a column and a regex value.

     Args:
-        df (pd.DataFrame): The dataframe to be filtered.
+        dataframe (pd.DataFrame): The dataframe to be filtered.
         column (str): The column to filter on.
         regex_value (str): The regex value to filter on.

     Returns:
         pd.DataFrame, pd.DataFrame: The filtered dataframe with the regex value and the filtered dataframe without the regex value.
     """
+    df = dataframe.copy()
     # Find rows with the regex value
     locations = df[column].str.contains(value) | df["step"].str.contains("constrain")

@@ -188,8 +194,12 @@ def create_constrain_summary(df_constrain: pd.DataFrame, file_columns: List[Unio
         else:
             dic_columns[item] = item

+    logger.debug("dic_columns: %s", dic_columns)
+
     columns_of_interest = [dic_columns.get(key, key) for key in CONSTRAIN_GENERAL_STATS_COLUMNS]

+    logger.debug("columns_of_interest: %s", columns_of_interest)
+
     if not columns_of_interest:
         logger.warning("No columns of interest were found to create the constrain summary table!")
         return pd.DataFrame()
@@ -219,6 +229,7 @@
     df_constrain = df_constrain[present_columns]

     if df_constrain.empty:
+        logger.warning("The constrain DataFrame is empty.")
         return df_constrain

     df_constrain = df_constrain.rename(columns=COLUMN_MAPPING)
@@ -260,7 +271,8 @@ def reformat_constrain_df(df, file_columns, args):
     """
     # Separate table for mapping constrains
     if df.empty:
-        return df
+        logger.warning("The constrain DataFrame is empty.")
+        return df, df

     # Add constrain metadata to the mapping constrain table
     constrain_meta = filelist_to_df([args.mapping_constrains])
@@ -296,14 +308,12 @@ def generate_ignore_samples(dataframe: pd.DataFrame) -> pd.Series:
     Generate a Series of indices that are not part of the df_snip dataframe.

     Parameters:
-        dataframe (pd.DataFrame): The input DataFrame to ocess.
+        dataframe (pd.DataFrame): The input DataFrame to process.

     Returns:
         pd.Series: A Series containing the indices that are not in df_snip.
     """
     df = dataframe.copy()
-    df = drop_columns(df, ["index"])
-    df["index"] = df.index
     df = split_index_column(df)

     df = reorder_rows(df)
@@ -322,3 +332,77 @@ def add_prefix_to_values_dict(data: List[Union[str, Dict[str, str]]], prefix: s
         else:
             updated_items.extend({key: f"({prefix}) {value}"} for key, value in item.items())
     return updated_items
+
+
+def check_section_exists(module_data: Dict, section_key: str) -> bool:
+    """Check if a section exists in the module data."""
+    return any(section_key in key for key in module_data.keys())
+
+
+def extract_mqc_from_simple_section(all_module_data: Dict, section: Optional[str], module: str) -> Tuple[List[pd.DataFrame], List[Any]]:
+    """Handle simple string or None section cases."""
+    logger.debug("Extracting data from simple str %s", module)
+    if not section:
+        # Return all data if no specific section is specified
+        return [pd.DataFrame.from_dict(all_module_data, orient="index")], []
+
+    # Check if the specific section exists
+    if check_section_exists(all_module_data, section):
+        return [pd.DataFrame.from_dict(all_module_data[section], orient="index")], []
+
+    logger.warning(f"Section {section} not found in module {module}")
+    return [pd.DataFrame()], []
+
+
+def extract_mqc_from_list_section(all_module_data: Dict, section: List, module: str) -> Tuple[List[pd.DataFrame], List[Any]]:
+    """Handle list-based section specifications."""
+    logger.debug("Extracting data from list %s : %s", module, section)
+    # Case for a list of column names
+    if all(not isinstance(item, dict) or not isinstance(list(item.values())[0], list) for item in section):
+        full_df = pd.DataFrame.from_dict(all_module_data, orient="index")
+        return [filter_and_rename_columns(full_df, section)], section
+
+    # Handle nested section lists
+    result_dfs = []
+    result_columns = []
+    for subsection in section:
+        # Handle different types of subsections
+        if isinstance(subsection, str):
+            # Simple section name
+            subsection_dfs, subsection_columns = extract_mqc_from_simple_section(all_module_data, subsection, module)
+        elif isinstance(subsection, list):
+            # Nested list of specifications
+            subsection_dfs, subsection_columns = extract_mqc_from_list_section(all_module_data, subsection, module)
+        elif isinstance(subsection, dict):
+            # Dictionary-based section specification
+            subsection_dfs, subsection_columns = extract_mqc_from_dict_section(all_module_data, subsection, module)
+        else:
+            # Unsupported subsection type
+            logger.warning(f"Unsupported subsection type: {type(subsection)}")
+            continue
+
+        result_dfs.extend(subsection_dfs)
+        result_columns.extend(subsection_columns)
+
+    return result_dfs, result_columns
+
+
+def extract_mqc_from_dict_section(all_module_data: Dict, section: Dict, module: str) -> Tuple[List[pd.DataFrame], List[Any]]:
+    """Handle dictionary-based section specifications."""
+    logger.debug("Extracting data from dict %s, %s", module, section)
+    # Extract the section name and column specifications
+    section_name, columns = next(iter(section.items()))
+
+    # Check if the section exists
+    if check_section_exists(all_module_data, section_name):
+        # Find the matching section data
+        section_data = next((data for key, data in all_module_data.items() if section_name in key), None)
+
+        if section_data:
+            # Convert to a DataFrame and filter columns
+            data = pd.DataFrame.from_dict(section_data, orient="index")
+            filtered_data = filter_and_rename_columns(data, columns)
+            return [filtered_data], columns
+
+    logger.warning(f"Section '{section_name}' not found in module '{module}'")
+    return [pd.DataFrame()], []
```
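The new `extract_mqc_from_*` helpers dispatch on the shape of a section specification: `None` or a string selects a whole section, a list selects (and optionally renames) columns or recurses into nested specifications, and a one-entry dict pairs a section name with its column list. A usage sketch with made-up module data — the section and metric names below are illustrative, and the import assumes the `bin/utils/` layout of this repo:

```python
import pandas as pd

from utils.module_data_processing import (
    extract_mqc_from_dict_section,
    extract_mqc_from_list_section,
    extract_mqc_from_simple_section,
)

# Illustrative stand-in for MultiQC module data: section -> sample -> metrics.
all_module_data = {
    "multiqc_samtools_stats": {
        "sample1_cl0_constrain": {"reads_mapped": 1000, "reads_unmapped": 5},
    },
}

# str (or None) spec: take a whole section as one DataFrame.
dfs, _ = extract_mqc_from_simple_section(all_module_data, "multiqc_samtools_stats", "samtools")

# list spec: filter/rename columns of a section's table.
dfs, cols = extract_mqc_from_list_section(
    all_module_data["multiqc_samtools_stats"],
    [{"reads_mapped": "reads mapped"}, "reads_unmapped"],
    "samtools",
)

# dict spec: a section name paired with its column specification.
dfs, cols = extract_mqc_from_dict_section(
    all_module_data, {"multiqc_samtools_stats": [{"reads_mapped": "reads mapped"}]}, "samtools"
)
```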

bin/utils/pandas_tools.py (+13 −9)

```diff
@@ -7,12 +7,13 @@

 import pandas as pd

+
 logger = logging.getLogger()


 def reorder_columns(df, columns):
     """
-    Try to reorder columns in a dataframe and return the dataframe.
+    Try to reorder columns in a dataframe and return the dataframe - keep all columns

     Args:
         df (pd.DataFrame): The dataframe to reorder columns in.
@@ -24,6 +25,7 @@ def reorder_columns(df, columns):
     df = df[[column for column in columns if column in df.columns] + df.columns.difference(columns, sort=False).tolist()]
     return df

+
 def reorder_rows(dataframe):
     """
     Reorder the rows in the DataFrame based on the ranking of the steps.
@@ -66,7 +68,7 @@ def coalesce_constrain(dataframe):
     return result.query('step == "constrain"')


-def split_index_column(df: pd.DataFrame, prefix: str = None, split_column: str = "index") -> pd.DataFrame:
+def split_index_column(dataframe: pd.DataFrame, prefix: str = None, split_column: str = "index") -> pd.DataFrame:
     """
     Split the index column of the DataFrame into separate columns for sample name, cluster, and step.

@@ -78,13 +80,14 @@ def split_index_column(df: pd.DataFrame, prefix: str = None, split_column: str =
     Returns:
         pd.DataFrame: The updated DataFrame with separate columns for sample name, cluster, and step.
     """
-    df_copy = df.copy()
+    df = dataframe.copy()
     # Reset the index and rename the index column
-    df_copy = df_copy.reset_index(drop=True).rename(columns={df_copy.index.name: split_column})
-    df_copy = df_copy[df_copy[split_column].str.contains("_", na=False)]
+    if split_column not in df.columns:
+        df[split_column] = df.index
+    df = df[df[split_column].str.contains("_", na=False)]

     # Apply the dynamic split function to each row in the column
-    split_data = df_copy[split_column].apply(dynamic_split).apply(pd.Series)
+    split_data = df[split_column].apply(dynamic_split).apply(pd.Series)

     # Take the first three columns and rename them
     split_data = split_data.iloc[:, :3]
@@ -97,11 +100,11 @@ def split_index_column(df: pd.DataFrame, prefix: str = None, split_column: str =
         inplace=True,
     )

-    df_copy = drop_columns(df_copy, ["sample", "cluster", "step"])
+    df = drop_columns(df, ["sample", "cluster", "step"])
     # Concatenate the original DataFrame and the split data
-    df_copy = pd.concat([df_copy, split_data], axis=1)
+    df = pd.concat([df, split_data], axis=1)

-    return df_copy
+    return df


 def fill_group_na(s):
@@ -175,6 +178,7 @@ def generate_indexed_df(df: pd.DataFrame, prefix: str = None, column_to_split: s
     )
     return result_df

+
 def filter_and_rename_columns(data: pd.DataFrame, columns: List[Union[str, Dict[str, str]]]) -> pd.DataFrame:
     """
     Filter and rename columns in a DataFrame.
```
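`split_index_column` no longer unconditionally resets the index: it reuses an existing `index` column and only derives one from the DataFrame's index when that column is absent, which is what lets `reformat_custom_df` re-index after the cluster merge above. A minimal sketch of the intended splitting, assuming composite indices of the form `<sample>_<cluster>_<step>` and substituting a plain `str.split` for the repo's `dynamic_split` helper:

```python
import pandas as pd

df = pd.DataFrame({"qlen": [29903]}, index=["sample1_cl0_constrain"])

# The new guard: derive the "index" column from the index only when missing.
if "index" not in df.columns:
    df["index"] = df.index

# Stand-in for dynamic_split(): split into at most three parts.
df[["sample", "cluster", "step"]] = df["index"].str.split("_", n=2, expand=True)

print(df[["index", "sample", "cluster", "step", "qlen"]].to_string(index=False))
# index: sample1_cl0_constrain -> sample="sample1", cluster="cl0", step="constrain"
```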
