Skip to content

Commit cd027b3

Browse files
smlmbrt and nebfield authored
Fix problems with ancestry aggregation & scaling on biobank-data (#19)
* Don't perform ancestry adjustments/keep AVG columns.
* Edit expected column list
* Bump versions (calc=0.2.0;utils=1.1.0)
* simplify polygenicscore class (remove batches until we run into problems)
* Fix pgscatalog.match performance regression (#22)
* drop pyarrow support, it doesn't scale well, and be more consistent about public path properties
* refactor to use polars for reading and writing IPC files to improve scalability
* fix map_elements deprecation warning
* update lockfiles
* fix weird path -> is_path refactor that broke this test
* missed one >_>
* fix pyproject
* update dockerfile
* fix exception handling when one score fails matching
* fix merging scoring files with different column sets
* set pgscatalog package logging levels to INFO
* Improve aggregation (#23)
* export key functions for sorting chromosomes / effect types
* use new key functions for sorting
* reduce memory usage during aggregation
* fix doctest output
* make aggregation steps clearer
* bump minor version of pgscatalog.core
* minor version bump pgscatalog.match
---------
Co-authored-by: Benjamin Wingfield <bwingfield@ebi.ac.uk>
1 parent e88b41f commit cd027b3

32 files changed

+1134
-1112
lines changed

Dockerfile

+10-9
Original file line number | Diff line number | Diff line change
@@ -9,19 +9,20 @@ WORKDIR /app
99

1010
RUN pip install poetry
1111

12-
COPY . .
12+
COPY pgscatalog.core /app/pgscatalog.core
13+
14+
COPY pgscatalog.calc /app/pgscatalog.calc
15+
16+
COPY pgscatalog.match /app/pgscatalog.match
17+
18+
COPY pgscatalog.utils /app/pgscatalog.utils
1319

1420
WORKDIR /app/pgscatalog.utils
1521

1622
RUN poetry install --no-root && rm -rf $POETRY_CACHE_DIR
1723

18-
FROM python:3.11-slim-bullseye
19-
20-
ENV VIRTUAL_ENV=/app/pgscatalog.utils/.venv \
21-
PATH="/app/pgscatalog.utils/.venv/bin:$PATH"
22-
23-
COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
24-
2524
RUN apt-get update && apt-get install -y procps && rm -rf /var/lib/apt/lists/*
2625

27-
ENV PATH="/venv/bin:${PATH}"
26+
ENV PATH="/app/pgscatalog.utils/.venv/bin:$PATH"
27+
28+

pgscatalog.calc/poetry.lock

+257-238
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pgscatalog.calc/pyproject.toml

+2-2
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "pgscatalog.calc"
3-
version = "0.1.1"
3+
version = "0.2.0"
44
description = "Libraries and applications for working with calculated polygenic scores"
55
authors = ["Benjamin Wingfield <bwingfield@ebi.ac.uk>", "Samuel Lambert <sl925@medschl.cam.ac.uk>", "Laurent Gil <lg10@sanger.ac.uk>"]
66
readme = "README.md"
@@ -10,7 +10,7 @@ packages = [
1010

1111
[tool.poetry.dependencies]
1212
python = "^3.11"
13-
"pgscatalog.core" = "^0.1.0"
13+
"pgscatalog.core" = {path = "../pgscatalog.core", develop = true}
1414
numpy = "^1.26.4"
1515
pandas = "^2.2.0"
1616
pyarrow = "^15.0.0"

pgscatalog.calc/src/pgscatalog/calc/__init__.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -21,4 +21,4 @@
2121
"AdjustResults",
2222
]
2323

24-
__version__ = "0.1.1"
24+
__version__ = "0.2.0"
Original file line number | Diff line number | Diff line change
@@ -1,3 +1 @@
1-
from .aggregate_cli import run_aggregate
2-
3-
__all__ = ["run_aggregate"]
1+
__all__ = []

pgscatalog.calc/src/pgscatalog/calc/cli/aggregate_cli.py

+43-7
Original file line number | Diff line number | Diff line change
@@ -2,10 +2,11 @@
22
import logging
33
import pathlib
44
import textwrap
5-
import operator
6-
import functools
5+
from collections import deque
6+
from typing import Optional
77

8-
from ..lib.polygenicscore import PolygenicScore
8+
from ..lib import PolygenicScore
9+
from pgscatalog.core import chrom_keyfunc
910

1011
logger = logging.getLogger(__name__)
1112

@@ -21,15 +22,50 @@ def run_aggregate():
2122

2223
if args.verbose:
2324
logger.setLevel(logging.INFO)
25+
logging.getLogger("pgscatalog.core").setLevel(logging.INFO)
26+
logging.getLogger("pgscatalog.calc").setLevel(logging.INFO)
2427

2528
if not (outdir := pathlib.Path(args.outdir)).exists():
2629
raise FileNotFoundError(f"--outdir {outdir.name} doesn't exist")
2730

28-
score_paths = [pathlib.Path(x) for x in args.scores]
29-
pgs = [PolygenicScore(path=x) for x in score_paths]
30-
# call __add__ a lot
31-
aggregated = functools.reduce(operator.add, pgs)
31+
score_paths = sorted([pathlib.Path(x) for x in args.scores], key=chrom_keyfunc())
32+
# dfs are only read into memory after accessing them explicitly e.g. pgs[0].df
33+
pgs = deque(PolygenicScore(path=x) for x in score_paths)
34+
35+
observed_columns = set()
36+
aggregated: Optional[PolygenicScore] = None
37+
38+
# first, use PolygenicScore's __add__ method, which implements df.add(fill_value=0)
39+
while pgs:
40+
# popleft ensures that dfs are removed from memory after each aggregation
41+
score: PolygenicScore = pgs.popleft()
42+
if aggregated is None:
43+
logger.info(f"Initialising aggregation with {score}")
44+
aggregated: PolygenicScore = score
45+
else:
46+
logger.info(f"Adding {score}")
47+
aggregated += score
48+
observed_columns.update(set(score.df.columns))
49+
50+
# check to make sure that every column we saw in the dataframes is in the output
51+
if (dfcols := set(aggregated.df.columns)) != observed_columns:
52+
raise ValueError(
53+
f"Missing columns in aggregated file!. "
54+
f"Observed: {observed_columns}. "
55+
f"In aggregated: {dfcols}"
56+
)
57+
else:
58+
logger.info("Aggregated columns match observed columns")
59+
60+
# next, melt the plink2 scoring files from wide (many columns) format to long format
61+
aggregated.melt()
62+
63+
# recalculate PGS average using aggregated SUM and DENOM
64+
aggregated.average()
65+
66+
logger.info("Aggregation finished! Writing to a file")
3267
aggregated.write(outdir=args.outdir, split=args.split)
68+
logger.info("all done. bye :)")
3369

3470

3571
def _description_text() -> str:

pgscatalog.calc/src/pgscatalog/calc/cli/ancestry_cli.py

+2
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,8 @@ def run_ancestry():
2020

2121
if args.verbose:
2222
logger.setLevel(logging.INFO)
23+
logging.getLogger("pgscatalog.core").setLevel(logging.INFO)
24+
logging.getLogger("pgscatalog.calc").setLevel(logging.INFO)
2325
logger.info("Starting ancestry adjustment")
2426
logger.info("Verbose mode enabled")
2527

pgscatalog.calc/src/pgscatalog/calc/lib/_ancestry/read.py

+3-3
Original file line number | Diff line number | Diff line change
@@ -87,8 +87,8 @@ def read_pgs(loc_aggscore):
8787
index_col=["sampleset", "IID"],
8888
converters={"IID": str},
8989
header=0,
90-
).pivot(columns=["PGS"], values=["SUM", "AVG"])
91-
# join column levels ({PGS}_{VALUE})
92-
df.columns = [f"{j}_{i}" for i, j in df.columns]
90+
).pivot(columns=["PGS"], values=["SUM"])
91+
# rename to PGS only
92+
df.columns = [f"{j}" for i, j in df.columns]
9393

9494
return df

0 commit comments

Comments
 (0)