Skip to content

Commit f356578

Browse files
committed
more ancestry stuff
1 parent 9b31676 commit f356578

File tree

7 files changed

+903
-2
lines changed

7 files changed

+903
-2
lines changed

pgscatalog.calclib/poetry.lock

+127-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pgscatalog.calclib/pyproject.toml

+2
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,8 @@ python = "^3.11"
1414
numpy = "^1.26.4"
1515
pandas = "^2.2.0"
1616
pyarrow = "^15.0.0"
17+
scikit-learn = "^1.4.0"
18+
scipy = "^1.12.0"
1719

1820

1921
[tool.poetry.group.dev.dependencies]

pgscatalog.calclib/src/pgscatalog/calclib/ancestry/__init__.py

Whitespace-only changes.
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,94 @@
1+
import logging
2+
import pandas as pd
3+
4+
5+
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
6+
7+
8+
def read_pcs(loc_pcs: list[str], dataset: str, loc_related_ids=None, nPCs=None):
    """
    Read the .pcs file outputs of the fraposa_pgsc projection

    Every file is parsed as a tab-delimited table with a header row, tagged
    with the dataset name and indexed by (sampleset, IID); the per-file tables
    are then stacked into a single dataframe.

    :param loc_pcs: list of locations for .pcs files
    :param dataset: name of the dataset being read (used for index)
    :param loc_related_ids: path to newline-delimited list of IDs for related samples that can be used to filter
    :param nPCs: if truthy, columns named PC<k> with k greater than nPCs are dropped;
        columns that are not named PC<k> are always kept
    :return: pandas dataframe with PC information and an "Unrelated" column
        (True/False when loc_related_ids is given, otherwise None)
    """
    frames = []
    for path in loc_pcs:
        logger.debug("Reading PCA projection: %s", path)
        # IID is forced to str so numeric-looking sample IDs keep their exact text
        df = pd.read_csv(path, sep="\t", converters={"IID": str}, header=0)
        df["sampleset"] = dataset
        frames.append(df.set_index(["sampleset", "IID"]))

    # Concatenate once instead of growing the frame inside the loop, which
    # re-copies all previously read rows on every iteration.
    logger.debug("Combining %d PCA projection file(s)", len(frames))
    proj = pd.concat(frames) if frames else pd.DataFrame()

    # Drop PCs
    if nPCs:
        logger.debug("Filtering to relevant PCs")
        # Only columns of the form PC<number> are candidates for dropping, so
        # any other column in the file cannot break the integer parsing.
        dropcols = [
            col
            for col in proj.columns
            if isinstance(col, str)
            and col.startswith("PC")
            and col[2:].isdecimal()
            and int(col[2:]) > nPCs
        ]
        proj = proj.drop(dropcols, axis=1)

    # Read/process IDs for unrelated samples (usually reference dataset)
    if loc_related_ids:
        logger.debug("Flagging related samples with: %s", loc_related_ids)
        with open(loc_related_ids, "r") as infile:
            related_ids = {line.strip() for line in infile}
        # Index level 1 is IID; a sample is unrelated when its ID is not listed
        proj["Unrelated"] = ~proj.index.get_level_values(level=1).isin(related_ids)
    else:
        # if unrelated is all nan -> dtype is float64
        # if unrelated is only true / false -> dtype is bool
        # if unrelated contains None, dtype stays bool, and pd.concat warning disappears
        proj["Unrelated"] = None

    return proj
56+
57+
58+
def extract_ref_psam_cols(
59+
loc_psam, dataset: str, df_target, keepcols=["SuperPop", "Population"]
60+
):
61+
psam = pd.read_csv(loc_psam, sep="\t", header=0)
62+
63+
match psam.columns[0]:
64+
# handle case of #IID -> IID (happens when #FID is present)
65+
case "#IID":
66+
psam.rename({"#IID": "IID"}, axis=1, inplace=True)
67+
case "#FID":
68+
psam.drop(["#FID"], axis=1, inplace=True)
69+
case _:
70+
assert False, "Invalid columns"
71+
psam["sampleset"] = dataset
72+
psam.set_index(["sampleset", "IID"], inplace=True)
73+
74+
return pd.merge(df_target, psam[keepcols], left_index=True, right_index=True)
75+
76+
77+
def read_pgs(loc_aggscore):
    """
    Read the output of aggregate_scores into one row per sample

    The tab-delimited input is indexed by (sampleset, IID) and pivoted on the
    PGS column, giving one SUM and one AVG column per score.

    :param loc_aggscore: path to aggregated scores output
    :return: pandas dataframe indexed by (sampleset, IID) with columns named {PGS}_{VALUE}
    """
    logger.debug("Reading aggregated score data: {}".format(loc_aggscore))

    long_scores = pd.read_csv(
        loc_aggscore,
        sep="\t",
        header=0,
        converters={"IID": str},
        index_col=["sampleset", "IID"],
    )
    wide_scores = long_scores.pivot(columns=["PGS"], values=["SUM", "AVG"])

    # flatten the (VALUE, PGS) column pairs into single {PGS}_{VALUE} labels
    wide_scores.columns = [
        f"{pgs_id}_{value_name}" for value_name, pgs_id in wide_scores.columns
    ]

    return wide_scores

0 commit comments

Comments
 (0)