Skip to content

Commit 5496494

Browse files
committed
update docs and tests
1 parent daf2d3f commit 5496494

File tree

4 files changed

+96
-2
lines changed

4 files changed

+96
-2
lines changed

docs/how-to/guides/match.rst

+61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
How to match scoring file variants against target genomes
2+
=========================================================
3+
4+
``pgscatalog-match`` is a CLI application that makes it easy to match genetic variants in a normalised scoring file against target variant information files. The application:
5+
6+
* identifies match candidates using genomic coordinates and allele information
7+
* creates a summary log and calculates match rates using the best match candidates
8+
* writes scoring files in plink2 --score format using the best match candidates
9+
* creates a log that describes all possible match candidates and their match status
10+
11+
The application will fail with an error if too few variants from the scoring file are present in the target variant information files. This is because it's important to match variants well to faithfully reproduce published scoring files.
12+
13+
14+
Installation
15+
-------------
16+
17+
::
18+
19+
$ pip install pgscatalog-matchapp
20+
21+
Usage
22+
-----
23+
24+
Match variants
25+
~~~~~~~~~~~~~~
26+
27+
::
28+
29+
$ mkdir matchout
30+
$ pgscatalog-match --dataset test --scorefiles normalised_scorefile.txt --target variants.pvar --outdir matchout --min_overlap 0.75
31+
32+
.. note::
33+
34+
Variant information files in both plink2 pvar and plink1 bim formats are supported.
35+
36+
Matching very large datasets
37+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
38+
39+
If you're matching many millions of variants, it can be a good idea to split the work by matching each chromosome separately:
40+
41+
::
42+
43+
$ mkdir matchout
44+
$ pgscatalog-match --dataset test --scorefiles normalised_scorefile.txt --target variants_chrom1.pvar --outdir matchout --chrom 1 --only_match
45+
46+
This will generate match files in Arrow IPC format in ``matchout/matchtmp``. This process can be run in parallel to distribute work.
47+
48+
You can then merge these matches with a different CLI tool:
49+
50+
::
51+
52+
$ pgscatalog-matchmerge --dataset test --scorefiles normalised_scorefile.txt --matches path/to/matchfile --outdir matchmergeout --min_overlap 0.75
53+
54+
The PGS Catalog Calculator does this automatically when working with target genomes split to have one file per chromosome.
55+
56+
Help
57+
----
58+
59+
::
60+
61+
$ pgscatalog-match --help

docs/how-to/index.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,5 @@ These guides describe how to use pygscatalog CLI applications.
88
:caption: Contents:
99

1010
guides/download
11-
guides/combine
11+
guides/combine
12+
guides/match

pgscatalog.matchapp/src/pgscatalog/matchapp/match_cli.py

-1
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,6 @@ def parse_args(args=None):
154154
dest="min_overlap",
155155
required=False,
156156
type=float,
157-
default=0.75,
158157
help="<Optional> Minimum proportion of variants to match before error",
159158
)
160159
parser.add_argument(

pgscatalog.matchapp/tests/test_match_cli.py

+33
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import itertools
2+
import os
23
from unittest.mock import patch
34
import pytest
45

@@ -36,6 +37,8 @@ def test_match(tmp_path_factory, good_scorefile, good_variants):
3637
str(good_variants),
3738
"--outdir",
3839
str(outdir),
40+
"--min_overlap",
41+
"0.75",
3942
)
4043
]
4144
flargs = list(itertools.chain(*args))
@@ -50,6 +53,34 @@ def test_match(tmp_path_factory, good_scorefile, good_variants):
5053
assert (outdir / "test_ALL_additive_0.scorefile.gz").exists()
5154

5255

56+
def test_only_match(tmp_path_factory, good_scorefile, good_variants):
57+
"""Test just matching (for big data)"""
58+
outdir = tmp_path_factory.mktemp("outdir")
59+
60+
args = [
61+
(
62+
"pgscatalog-match",
63+
"-d",
64+
"test",
65+
"-s",
66+
str(good_scorefile),
67+
"-t",
68+
str(good_variants),
69+
"--outdir",
70+
str(outdir),
71+
"--only_match",
72+
)
73+
]
74+
flargs = list(itertools.chain(*args))
75+
76+
with pytest.raises(SystemExit):
77+
with patch("sys.argv", flargs):
78+
run_match()
79+
80+
# arrow IPC files have been written
81+
assert len(os.listdir(outdir / "matchtmp")) == 1
82+
83+
5384
def test_strict_match(tmp_path_factory, good_scorefile, good_variants):
5485
"""Test matching well with extremely strict overlap to trigger a MatchRateError"""
5586
outdir = tmp_path_factory.mktemp("outdir")
@@ -91,6 +122,8 @@ def test_match_fail(tmp_path_factory, bad_scorefile, good_variants):
91122
str(good_variants),
92123
"--outdir",
93124
str(outdir),
125+
"--min_overlap",
126+
"0.75",
94127
)
95128
]
96129
flargs = list(itertools.chain(*args))

0 commit comments

Comments
 (0)