|
8 | 8 | from shutil import move
|
9 | 9 | import logging
|
10 | 10 | from sequence_processing_pipeline.Commands import split_similar_size_bins
|
11 |
| -from sequence_processing_pipeline.util import iter_paired_files |
| 11 | +from sequence_processing_pipeline.util import (iter_paired_files, |
| 12 | + determine_orientation) |
12 | 13 | from jinja2 import Environment
|
13 | 14 | from glob import glob
|
14 | 15 | import re
|
15 | 16 | from sys import executable
|
| 17 | +from gzip import open as gzip_open |
16 | 18 |
|
17 | 19 |
|
18 | 20 | logging.basicConfig(level=logging.DEBUG)
|
@@ -116,6 +118,63 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path,
|
116 | 118 |
|
117 | 119 | self._validate_project_data()
|
118 | 120 |
|
def hack_helper(self):
    """Report whether the raw fastq files carry TellSeq 'BX' metadata.

    Scans the R1/R2 fastq.gz files under self.root_dir until a
    non-empty one is found, then inspects the first sequence-id line
    for an optional metadata element beginning with 'BX'.

    Returns
    -------
    bool
        True when BX metadata is present, False otherwise.

    Raises
    ------
    ValueError
        If a sequence-id line cannot be parsed, or if every raw file
        turns out to be empty.
    """
    # NuQCJob works across all projects in a sample-sheet. If the
    # sheet mixed a TellSeq project with a non-TellSeq one, the
    # assumption made here (one file is representative of all files)
    # would break. However, TellSeq requires a special sample-sheet,
    # so every project in a TellSeq sheet can be assumed to carry the
    # TellSeq BX metadata — and the inverse should also hold.

    # Only R1 and R2 reads are considered.
    candidates = [fp for fp in glob(self.root_dir + '/*/*.fastq.gz')
                  if determine_orientation(fp) in ['R1', 'R2']]

    for fastq_fp in candidates:
        # Read just the first line of the compressed fastq file.
        with gzip_open(fastq_fp, 'r') as fastq:
            header = fastq.readline().decode("utf-8").strip()

        # Empty file: keep going until a non-empty one is found.
        if not header:
            continue

        # Split the sequence-id line into the id proper plus any
        # optional metadata element(s).
        parts = header.split(' ')

        if len(parts) == 1:
            # no metadata at all; do not apply 'BX'.
            return False

        if len(parts) == 2:
            # some additional metadata exists, but it may not be BX.
            return parts[-1].startswith('BX')

        # more fields than expected: refuse to guess.
        raise ValueError("I don't know how to process '%s'" % parts)

    raise ValueError("It seems like all raw files are empty")
| 177 | + |
119 | 178 | def _validate_project_data(self):
|
120 | 179 | # Validate project settings in [Bioinformatics]
|
121 | 180 | for project in self.project_data:
|
@@ -394,15 +453,26 @@ def _generate_mmi_filter_cmds(self, working_dir):
|
394 | 453 |
|
395 | 454 | cores_to_allocate = int(self.cores_per_task / 2)
|
396 | 455 |
|
397 |
| - if len(self.additional_fastq_tags) > 0: |
| 456 | + # hack_helper is a hack that will scan all of the R1 and R2 files |
| 457 | + # in self.root_dir until it finds a non-empty file to read. It will |
| 458 | + # read the first line of the compressed fastq file and see if it |
| 459 | +        # contains optional BX metadata. If not it will return False, |
| 460 | +        # otherwise True. |
| 461 | + apply_bx = self.hack_helper() |
| 462 | + |
| 463 | + # the default setting. |
| 464 | + tags = "" |
| 465 | + t_switch = "" |
| 466 | + |
| 467 | +        if apply_bx and len(self.additional_fastq_tags) > 0: |
398 | 468 | # add tags for known metadata types that fastq files may have
|
399 | 469 | # been annotated with. Samtools will safely ignore tags that
|
400 | 470 | # are not present.
|
| 471 | + # NB: This doesn't appear to be true, actually. if there is |
| 472 | + # a metadata element but it does not begin with 'BX', supplying |
| 473 | + # '-T BX' will cause an error writing output to disk. |
401 | 474 | tags = " -T %s" % ','.join(self.additional_fastq_tags)
|
402 | 475 | t_switch = " -y"
|
403 |
| - else: |
404 |
| - tags = "" |
405 |
| - t_switch = "" |
406 | 476 |
|
407 | 477 | for count, mmi_db_path in enumerate(self.mmi_file_paths):
|
408 | 478 | if count == 0:
|
@@ -499,3 +569,26 @@ def _generate_job_script(self, max_bucket_size):
|
499 | 569 | pmls_path=self.pmls_path))
|
500 | 570 |
|
501 | 571 | return job_script_path
|
def parse_logs(self):
    """Collect error lines from this job's '.out' log files.

    Returns
    -------
    list of str
        Stripped log lines containing 'error:' (matched
        case-insensitively), in sorted-filename order.
    """
    # The logs directory is assumed to hold only '.out' files and
    # zero, one, or many 'seqs.movi.n.txt.gz' files. The latter may be
    # present because the last step of a successful job is to rename
    # and move that file into its final location while the logs
    # directory is the default 'working' directory for this job — this
    # ensures slurm.err and slurm.out files are always in a known
    # location. Only the '.out' files are parsed here.
    pattern = join(self.output_path, 'logs', '*')

    error_lines = []
    for log_fp in sorted(glob(pattern)):
        if not log_fp.endswith('.out'):
            continue
        with open(log_fp, 'r') as log:
            for line in log:
                if 'error:' in line.lower():
                    error_lines.append(line.strip())

    return error_lines
0 commit comments