Skip to content

Commit d580a2c

Browse files
committed
Add fastq_minqual and fastq_truncee_rate options
1 parent 78013bf commit d580a2c

File tree

4 files changed

+64
-40
lines changed

4 files changed

+64
-40
lines changed

man/vsearch.1

+30-14
Original file line numberDiff line numberDiff line change
@@ -1886,6 +1886,12 @@ merged sequence. The default is 1.
18861886
.BI \-\-fastq_minovlen\~ "positive integer"
18871887
When using \-\-fastq_mergepairs, specify the minimum overlap between
18881888
the merged reads. The default is 10. Must be at least 5.
1889+
.TAG fastq_minqual
1890+
.TP
1891+
.BI \-\-fastq_minqual\~ "positive integer"
1892+
When using \-\-fastq_filter or \-\-fastx_filter, discard reads having
1893+
any base with a quality score below the given value. The default is 0,
1894+
which discards none.
18891895
.TAG fastq_nostagger
18901896
.TP
18911897
.B \-\-fastq_nostagger
@@ -2030,6 +2036,15 @@ default, \fIk\fR = 4.
20302036
When using \-\-fastq_filter or \-\-fastx_filter, truncate sequences so
20312037
that their total expected error is not higher than the specified
20322038
value.
2039+
.TAG fastq_truncee_rate
2040+
.TP
2041+
.BI \-\-fastq_truncee_rate\~ real
2042+
When using \-\-fastq_filter or \-\-fastx_filter, truncate sequences so
2043+
that their average expected error per base is not higher than the
2044+
specified value. The truncation will happen at the first
2045+
occurrence. The average expected error per base is calculated as the
2046+
total expected number of errors divided by the length of the sequence
2047+
after truncation.
20332048
.TAG fastq_trunclen
20342049
.TP
20352050
.BI \-\-fastq_trunclen\~ "positive integer"
@@ -2095,26 +2110,27 @@ corresponding output will be written to the files specified with the
20952110
files if the input is in FASTA format. The sequences are first trimmed
20962111
and then filtered based on the remaining bases. Sequences may be
20972112
trimmed using the options \-\-fastq_stripleft, \-\-fastq_stripright,
2098-
\-\-fastq_truncee, \-\-fastq_trunclen, \-\-fastq_trunclen_keep and
2099-
\-\-fastq_truncqual. The sequences may be filtered using the options
2100-
\-\-fastq_maxee, \-\-fastq_maxee_rate, \-\-fastq_maxlen,
2101-
\-\-fastq_maxns, \-\-fastq_minlen (default 1), \-\-fastq_trunclen,
2102-
\-\-maxsize, and \-\-minsize. Sequences not satisfying the
2103-
requirements are discarded. For pairs of sequences, both sequences in
2104-
a pair must satisfy the requirements, otherwise both are discarded. If
2105-
no shortening or filtering options are given, all sequences are
2106-
written to the output files, possibly after conversion from FASTQ to
2107-
FASTA format. The \-\-relabel option may be used to relabel the output
2113+
\-\-fastq_truncee, \-\-fastq_truncee_rate, \-\-fastq_trunclen,
2114+
\-\-fastq_trunclen_keep and \-\-fastq_truncqual. The sequences may be
2115+
filtered using the options \-\-fastq_maxee, \-\-fastq_maxee_rate,
2116+
\-\-fastq_maxlen, \-\-fastq_maxns, \-\-fastq_minlen (default 1),
2117+
\-\-fastq_minqual, \-\-fastq_trunclen, \-\-maxsize, and
2118+
\-\-minsize. Sequences not satisfying the requirements are
2119+
discarded. For pairs of sequences, both sequences in a pair must
2120+
satisfy the requirements, otherwise both are discarded. If no
2121+
shortening or filtering options are given, all sequences are written
2122+
to the output files, possibly after conversion from FASTQ to FASTA
2123+
format. The \-\-relabel option may be used to relabel the output
21082124
sequences. The \-\-eeout option may be used to output the expected
21092125
number of errors in each sequence. After all sequences have been
21102126
processed, the number of kept and discarded sequences will be shown,
21112127
as well as how many of the kept sequences were trimmed. When the input
21122128
is in FASTA format, the following options are not accepted because
21132129
quality scores are not available: \-\-eeout, \-\-fastq_ascii,
2114-
\-\-fastq_eeout, \-\-fastq_maxee, \-\-fastq_maxee_rate, \-\-fastq_out,
2115-
\-\-fastq_qmax, \-\-fastq_qmin, \-\-fastq_truncee,
2116-
\-\-fastq_truncqual, \-\-fastqout_discarded,
2117-
\-\-fastqout_discarded_rev, \-\-fastqout_rev.
2130+
\-\-fastq_eeout, \-\-fastq_maxee, \-\-fastq_maxee_rate,
2131+
\-\-fastq_minqual, \-\-fastq_out, \-\-fastq_qmax, \-\-fastq_qmin,
2132+
\-\-fastq_truncee, \-\-fastq_truncee_rate, \-\-fastq_truncqual,
2133+
\-\-fastqout_discarded, \-\-fastqout_discarded_rev, \-\-fastqout_rev.
21182134
.TAG fastx_revcomp
21192135
.TP
21202136
.BI \-\-fastx_revcomp \0filename

src/filter.cc

+10-26
Original file line numberDiff line numberDiff line change
@@ -183,12 +183,18 @@ auto analyse(fastx_handle h) -> struct analysis_res
183183
res.ee += e;
184184

185185
if ((qual <= opt_fastq_truncqual) ||
186-
(res.ee > opt_fastq_truncee))
186+
(res.ee > opt_fastq_truncee) ||
187+
(res.ee > opt_fastq_truncee_rate * (i + 1)))
187188
{
188189
res.ee -= e;
189190
res.length = i;
190191
break;
191192
}
193+
194+
if (qual < opt_fastq_minqual)
195+
{
196+
res.discarded = true;
197+
}
192198
}
193199

194200
/* filter by expected errors (ee) */
@@ -287,12 +293,14 @@ auto filter(bool fastq_only, char * filename) -> void
287293
(opt_fastq_qmax < 41) ||
288294
(opt_fastq_qmin > 0) ||
289295
(opt_fastq_truncee < dbl_max) ||
296+
(opt_fastq_truncee_rate < dbl_max) ||
290297
(opt_fastq_truncqual < long_min) ||
298+
(opt_fastq_minqual > 0) ||
291299
opt_fastqout_discarded ||
292300
opt_fastqout_discarded_rev ||
293301
opt_fastqout_rev)
294302
{
295-
fatal("The following options are not accepted with the fastx_filter command when the input is a FASTA file, because quality scores are not available: eeout, fastq_ascii, fastq_eeout, fastq_maxee, fastq_maxee_rate, fastq_out, fastq_qmax, fastq_qmin, fastq_truncee, fastq_truncqual, fastqout_discarded, fastqout_discarded_rev, fastqout_rev");
303+
fatal("The following options are not accepted with the fastx_filter command when the input is a FASTA file, because quality scores are not available: eeout, fastq_ascii, fastq_eeout, fastq_maxee, fastq_maxee_rate, fastq_minqual, fastq_out, fastq_qmax, fastq_qmin, fastq_truncee, fastq_truncee_rate, fastq_truncqual, fastqout_discarded, fastqout_discarded_rev, fastqout_rev");
296304
}
297305
}
298306

@@ -311,30 +319,6 @@ auto filter(bool fastq_only, char * filename) -> void
311319
{
312320
fatal("The forward and reverse input sequence must in the same format, either FASTA or FASTQ");
313321
}
314-
315-
if (! (h2->is_fastq || h2->is_empty))
316-
{
317-
if (fastq_only)
318-
{
319-
fatal("FASTA input files not allowed with fastq_filter, consider using fastx_filter command instead");
320-
}
321-
else if (opt_eeout ||
322-
(opt_fastq_ascii != 33) ||
323-
opt_fastq_eeout ||
324-
(opt_fastq_maxee < dbl_max) ||
325-
(opt_fastq_maxee_rate < dbl_max) ||
326-
opt_fastqout ||
327-
(opt_fastq_qmax < 41) ||
328-
(opt_fastq_qmin > 0) ||
329-
(opt_fastq_truncee < dbl_max) ||
330-
(opt_fastq_truncqual < long_min) ||
331-
opt_fastqout_discarded ||
332-
opt_fastqout_discarded_rev ||
333-
opt_fastqout_rev)
334-
{
335-
fatal("The following options are not accepted with the fastx_filter command when the input is a FASTA file, because quality scores are not available: eeout, fastq_ascii, fastq_eeout, fastq_maxee, fastq_maxee_rate, fastq_out, fastq_qmax, fastq_qmin, fastq_truncee, fastq_truncqual, fastqout_discarded, fastqout_discarded_rev, fastqout_rev");
336-
}
337-
}
338322
}
339323

340324
FILE * fp_fastaout = nullptr;

src/vsearch.cc

+22
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,7 @@ double opt_fastq_maxdiffpct;
226226
double opt_fastq_maxee;
227227
double opt_fastq_maxee_rate;
228228
double opt_fastq_truncee;
229+
double opt_fastq_truncee_rate;
229230
double opt_id;
230231
double opt_lca_cutoff;
231232
double opt_max_unmasked_pct;
@@ -284,6 +285,7 @@ int64_t opt_fastq_maxns;
284285
int64_t opt_fastq_minlen;
285286
int64_t opt_fastq_minmergelen;
286287
int64_t opt_fastq_minovlen;
288+
int64_t opt_fastq_minqual;
287289
int64_t opt_fastq_qmax;
288290
int64_t opt_fastq_qmaxout;
289291
int64_t opt_fastq_qmin;
@@ -839,6 +841,7 @@ auto args_init(int argc, char ** argv, struct Parameters & parameters) -> void
839841
opt_fastq_minlen = 1;
840842
opt_fastq_minmergelen = 0;
841843
opt_fastq_minovlen = 10;
844+
opt_fastq_minqual = 0;
842845
opt_fastq_nostagger = true;
843846
opt_fastq_qmax = 41;
844847
opt_fastq_qmaxout = 41;
@@ -848,6 +851,7 @@ auto args_init(int argc, char ** argv, struct Parameters & parameters) -> void
848851
opt_fastq_stripleft = 0;
849852
opt_fastq_stripright = 0;
850853
opt_fastq_truncee = dbl_max;
854+
opt_fastq_truncee_rate = dbl_max;
851855
opt_fastq_trunclen = -1;
852856
opt_fastq_trunclen_keep = -1;
853857
opt_fastq_truncqual = long_min;
@@ -1082,6 +1086,7 @@ auto args_init(int argc, char ** argv, struct Parameters & parameters) -> void
10821086
option_fastq_minlen,
10831087
option_fastq_minmergelen,
10841088
option_fastq_minovlen,
1089+
option_fastq_minqual,
10851090
option_fastq_nostagger,
10861091
option_fastq_qmax,
10871092
option_fastq_qmaxout,
@@ -1093,6 +1098,7 @@ auto args_init(int argc, char ** argv, struct Parameters & parameters) -> void
10931098
option_fastq_stripright,
10941099
option_fastq_tail,
10951100
option_fastq_truncee,
1101+
option_fastq_truncee_rate,
10961102
option_fastq_trunclen,
10971103
option_fastq_trunclen_keep,
10981104
option_fastq_truncqual,
@@ -1330,6 +1336,7 @@ auto args_init(int argc, char ** argv, struct Parameters & parameters) -> void
13301336
{"fastq_minlen", required_argument, nullptr, 0 },
13311337
{"fastq_minmergelen", required_argument, nullptr, 0 },
13321338
{"fastq_minovlen", required_argument, nullptr, 0 },
1339+
{"fastq_minqual", required_argument, nullptr, 0 },
13331340
{"fastq_nostagger", no_argument, nullptr, 0 },
13341341
{"fastq_qmax", required_argument, nullptr, 0 },
13351342
{"fastq_qmaxout", required_argument, nullptr, 0 },
@@ -1341,6 +1348,7 @@ auto args_init(int argc, char ** argv, struct Parameters & parameters) -> void
13411348
{"fastq_stripright", required_argument, nullptr, 0 },
13421349
{"fastq_tail", required_argument, nullptr, 0 },
13431350
{"fastq_truncee", required_argument, nullptr, 0 },
1351+
{"fastq_truncee_rate", required_argument, nullptr, 0 },
13441352
{"fastq_trunclen", required_argument, nullptr, 0 },
13451353
{"fastq_trunclen_keep", required_argument, nullptr, 0 },
13461354
{"fastq_truncqual", required_argument, nullptr, 0 },
@@ -2603,6 +2611,14 @@ auto args_init(int argc, char ** argv, struct Parameters & parameters) -> void
26032611
opt_n_mismatch = true;
26042612
break;
26052613

2614+
case option_fastq_minqual:
2615+
opt_fastq_minqual = args_getlong(optarg);
2616+
break;
2617+
2618+
case option_fastq_truncee_rate:
2619+
opt_fastq_truncee_rate = args_getdouble(optarg);
2620+
break;
2621+
26062622
default:
26072623
fatal("Internal error in option parsing");
26082624
}
@@ -3469,11 +3485,13 @@ auto args_init(int argc, char ** argv, struct Parameters & parameters) -> void
34693485
option_fastq_maxlen,
34703486
option_fastq_maxns,
34713487
option_fastq_minlen,
3488+
option_fastq_minqual,
34723489
option_fastq_qmax,
34733490
option_fastq_qmin,
34743491
option_fastq_stripleft,
34753492
option_fastq_stripright,
34763493
option_fastq_truncee,
3494+
option_fastq_truncee_rate,
34773495
option_fastq_trunclen,
34783496
option_fastq_trunclen_keep,
34793497
option_fastq_truncqual,
@@ -3611,11 +3629,13 @@ auto args_init(int argc, char ** argv, struct Parameters & parameters) -> void
36113629
option_fastq_maxlen,
36123630
option_fastq_maxns,
36133631
option_fastq_minlen,
3632+
option_fastq_minqual,
36143633
option_fastq_qmax,
36153634
option_fastq_qmin,
36163635
option_fastq_stripleft,
36173636
option_fastq_stripright,
36183637
option_fastq_truncee,
3638+
option_fastq_truncee_rate,
36193639
option_fastq_trunclen,
36203640
option_fastq_trunclen_keep,
36213641
option_fastq_truncqual,
@@ -5498,11 +5518,13 @@ auto cmd_help(struct Parameters const & parameters) -> void {
54985518
" --fastq_maxlen INT discard if length of sequence is longer\n"
54995519
" --fastq_maxns INT discard if number of N's is higher\n"
55005520
" --fastq_minlen INT discard if length of sequence is shorter\n"
5521+
" --fastq_minqual INT discard if any base quality value lower (0)\n"
55015522
" --fastq_qmax INT maximum base quality value for FASTQ input (41)\n"
55025523
" --fastq_qmin INT minimum base quality value for FASTQ input (0)\n"
55035524
" --fastq_stripleft INT delete given number of bases from the 5' end\n"
55045525
" --fastq_stripright INT delete given number of bases from the 3' end\n"
55055526
" --fastq_truncee REAL truncate to given maximum expected error\n"
5527+
" --fastq_truncee_rate REAL truncate to given maximum expected error rate\n"
55065528
" --fastq_trunclen INT truncate to given length (discard if shorter)\n"
55075529
" --fastq_trunclen_keep INT truncate to given length (keep if shorter)\n"
55085530
" --fastq_truncqual INT truncate to given minimum base quality\n"

src/vsearch.h

+2
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,7 @@ extern double opt_fastq_maxdiffpct;
346346
extern double opt_fastq_maxee;
347347
extern double opt_fastq_maxee_rate;
348348
extern double opt_fastq_truncee;
349+
extern double opt_fastq_truncee_rate;
349350
extern double opt_id;
350351
extern double opt_lca_cutoff;
351352
extern double opt_max_unmasked_pct;
@@ -404,6 +405,7 @@ extern int64_t opt_fastq_maxns;
404405
extern int64_t opt_fastq_minlen;
405406
extern int64_t opt_fastq_minmergelen;
406407
extern int64_t opt_fastq_minovlen;
408+
extern int64_t opt_fastq_minqual;
407409
extern int64_t opt_fastq_qmax;
408410
extern int64_t opt_fastq_qmaxout;
409411
extern int64_t opt_fastq_qmin;

0 commit comments

Comments
 (0)