Skip to content

Commit 23d76df

Browse files
committed
Initial commit
1 parent dc4191e commit 23d76df

File tree

5 files changed

+99
-1
lines changed

5 files changed

+99
-1
lines changed

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ venv/
1717
/tutorials/gps_output/
1818
/tutorials/gps_summary.csv
1919

20+
/src/forest/stb_screen_time/data_nock_lab/
21+
/src/forest/stb_screen_time/results/
22+
2023
# Jupyter notebook checkpoints
2124
.ipynb_checkpoints/
2225

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ dependencies = [
3030
"librosa",
3131
"numpy",
3232
"openrouteservice",
33-
"pandas",
33+
"pandas[excel]",
3434
"pyproj",
3535
"python-dateutil",
3636
"pytz",

src/forest/stb_screen_time/__init__.py

Whitespace-only changes.

src/forest/stb_screen_time/config.py

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
BEIWE_ID_FIX_1 = "c3qgv1lo"
2+
BEIWE_ID_FIX_2 = "q1asyq1d"
3+
BEIWE_ID_FIX_3 = "upxfqjc4"
4+
BEIWE_ID_FIX_4 = "8wm2hk33"
5+
BEIWE_ID_FIX_5 = "thozhbm7"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
from datetime import datetime, timedelta
2+
import os
3+
4+
import pandas as pd
5+
6+
from . import config
7+
8+
9+
def excel_numeric_to_date3(val):
    """Convert an Excel serial day number to a ``datetime``.

    Only values whose string form starts with ``'4'`` are treated as Excel
    serial dates; the integer part of the value is added, in days, to the
    base date 1899-12-30.

    Args:
        val: Cell value (number or numeric string). May be missing.

    Returns:
        The corresponding ``datetime``, or ``None`` when ``val`` is missing,
        does not start with ``'4'``, or cannot be interpreted as a number of
        days (for example a text date such as ``"4/12/2021"``).
    """
    if pd.isna(val) or not str(val).startswith('4'):
        return None
    try:
        serial_days = int(float(val))
        return datetime(1899, 12, 30) + timedelta(days=serial_days)
    except (TypeError, ValueError, OverflowError):
        # The leading-'4' check is only a heuristic: non-numeric text or an
        # out-of-range number can pass it. Treat those like any other
        # unparseable value instead of aborting the whole column conversion.
        return None
14+
15+
16+
# File paths. Everything is resolved against the current working directory,
# so the script has to be launched from the directory that contains the
# "data_nock_lab" and "results" folders.
ext_file_path = os.path.join(os.getcwd(), "data_nock_lab", "data_participants_other", "BeiweID_11_9_2022.xlsx")
dat_F_path = os.path.join(os.getcwd(), "results", "beiwe_id_list_with_dates_2022-11-17.csv")
dat_demog_fc_path = os.path.join(os.getcwd(), "data_nock_lab", "data_participants_other", "fc_u01_participant_enrollment_demog.csv")
dat_demog_nofc_path = os.path.join(os.getcwd(), "data_nock_lab", "data_participants_other", "u01_participant_enrollment_demog.csv")
cleaned_demog_path = os.path.join(os.getcwd(), "data_nock_lab", "data_participants_other_processed", "demog_clean.csv")

# Read and clean external data: study start and end
ext_file0 = pd.read_excel(ext_file_path, sheet_name=0).rename(columns=str.lower)

# Remove participants to be excluded. Take an explicit copy so the
# assignments below modify an independent frame rather than a slice of
# ext_file0 (avoids chained-assignment warnings and ambiguity).
ext_file1 = ext_file0[ext_file0['exclude_p'].isna()].copy()

# Format dates. Assign the whole column (not ``.loc[:, col] = ...``): the
# ``.loc`` form writes into the existing array and keeps its old dtype.
ext_file1['study_start'] = ext_file1['study_start'].apply(excel_numeric_to_date3)

# Per the original note, the participants below yield more than 168 days of
# observation and are capped at 168.
# (IDs come from the config file -- not publicly available to keep them anonymous)
ext_file1.loc[ext_file1['beiwe_id'] == config.BEIWE_ID_FIX_1, 'study_start'] = pd.to_datetime('2020-09-25')
ext_file1.loc[ext_file1['beiwe_id'] == config.BEIWE_ID_FIX_2, 'study_start'] = pd.to_datetime('2021-11-19')

for fix_id in [config.BEIWE_ID_FIX_3, config.BEIWE_ID_FIX_4, config.BEIWE_ID_FIX_5]:
    is_fix_row = ext_file1['beiwe_id'] == fix_id
    if is_fix_row.any():
        start_date = ext_file1.loc[is_fix_row, 'study_start']
        # Only derive an end date when every matching row has a start date.
        if start_date.notna().all():
            ext_file1.loc[is_fix_row, 'study_end'] = start_date + pd.DateOffset(days=168)

# Ensure 'study_start' and 'study_end' are in datetime format. Whole-column
# assignment is required for the dtype to actually become datetime64, which
# the ``.dt`` accessor below depends on.
ext_file1['study_start'] = pd.to_datetime(ext_file1['study_start'], errors='coerce')
ext_file1['study_end'] = pd.to_datetime(ext_file1['study_end'], errors='coerce')

print(ext_file1[['study_start', 'study_end']].dtypes)

# Calculate the duration of observation (whole days)
ext_file1['obs_duration'] = (ext_file1['study_end'] - ext_file1['study_start']).dt.days

# Save cleaned data to CSV (create the output folder if it does not exist yet)
os.makedirs(os.path.dirname(dat_F_path), exist_ok=True)
ext_file1.to_csv(dat_F_path, index=False)

# Read demographics
dat_demog_fc = pd.read_csv(dat_demog_fc_path).rename(columns=str.lower)
dat_demog_nofc = pd.read_csv(dat_demog_nofc_path).rename(columns=str.lower)

# Process demographics data: tag the age category, harmonise column names and
# drop rows without a beiwe_id.
# NOTE(review): the sex column is 'sex2_new' in the fc file but 'sex_new' in
# the other one -- kept as in the original; confirm against the raw files.
dat_demog_fc['age_cat'] = 'adol'
dat_demog_fc = dat_demog_fc.rename(columns={'sex2_new': 'sex', 'race2_new': 'race', 'age_new': 'age'})
dat_demog_fc = dat_demog_fc.dropna(subset=['beiwe_id']).query("beiwe_id != ''")

dat_demog_nofc['age_cat'] = 'adult'
dat_demog_nofc = dat_demog_nofc.rename(columns={'sex_new': 'sex', 'race2_new': 'race', 'age_new': 'age'})
dat_demog_nofc = dat_demog_nofc.dropna(subset=['beiwe_id']).query("beiwe_id != ''")

# Combine demographics data
dat_demog = pd.concat([dat_demog_fc, dat_demog_nofc])

# Remove duplicate beiwe_id (the first occurrence wins, i.e. fc rows take
# precedence over non-fc rows because of the concat order)
dat_demog = dat_demog.drop_duplicates(subset='beiwe_id', keep='first')

# Save cleaned demographics (create the output folder if it does not exist yet)
os.makedirs(os.path.dirname(cleaned_demog_path), exist_ok=True)
dat_demog.to_csv(cleaned_demog_path, index=False)

# Create master file with combined data; the indicator columns record which
# source(s) each beiwe_id was found in.
ext_file_F = ext_file1.copy()
ext_file_F['has_start_end'] = 1
dat_demog_tojoin = dat_demog.copy()
dat_demog_tojoin['has_demog'] = 1

beiwe_masterfile = ext_file_F.merge(dat_demog_tojoin, on='beiwe_id', how='outer')
# Rows that came from only one side have NaN in the other side's indicator.
beiwe_masterfile['has_start_end'] = beiwe_masterfile['has_start_end'].fillna(0).astype(int)
beiwe_masterfile['has_demog'] = beiwe_masterfile['has_demog'].fillna(0).astype(int)

# Save master file
beiwe_masterfile_path = os.path.join(os.getcwd(), "data_nock_lab", "data_participants_other_processed", "beiwe_id_masterfile.csv")
os.makedirs(os.path.dirname(beiwe_masterfile_path), exist_ok=True)
beiwe_masterfile.to_csv(beiwe_masterfile_path, index=False)

0 commit comments

Comments
 (0)