|
| 1 | +from datetime import datetime, timedelta |
| 2 | +import os |
| 3 | + |
| 4 | +import pandas as pd |
| 5 | + |
| 6 | +from . import config |
| 7 | + |
| 8 | + |
def excel_numeric_to_date3(val):
    """Convert an Excel serial day number to a ``datetime``.

    A value is treated as a serial number only when its string form starts
    with ``'4'``; the fractional part is discarded and the remaining whole
    days are counted from 1899-12-30.

    Args:
        val: A single cell value (number, string, or missing).

    Returns:
        The corresponding ``datetime``, or ``None`` when ``val`` is missing,
        does not start with ``'4'``, or cannot be converted to a date.
    """
    if pd.isna(val) or not str(val).startswith('4'):
        return None
    try:
        serial_days = int(float(val))
        return datetime(1899, 12, 30) + timedelta(days=serial_days)
    except (TypeError, ValueError, OverflowError):
        # The '4' prefix test also lets through non-numeric cells such as
        # "4/5/2021" and serials beyond the datetime range; treat them like
        # any other non-serial value instead of aborting the whole column.
        return None
| 14 | + |
| 15 | + |
# Input and output locations, all resolved against the current working directory
_working_dir = os.getcwd()
_participants_dir = os.path.join(_working_dir, "data_nock_lab", "data_participants_other")
_processed_dir = os.path.join(_working_dir, "data_nock_lab", "data_participants_other_processed")

# Inputs: study start/end workbook and the two demographics exports
ext_file_path = os.path.join(_participants_dir, "BeiweID_11_9_2022.xlsx")
dat_demog_fc_path = os.path.join(_participants_dir, "fc_u01_participant_enrollment_demog.csv")
dat_demog_nofc_path = os.path.join(_participants_dir, "u01_participant_enrollment_demog.csv")

# Outputs: cleaned start/end list and cleaned demographics
dat_F_path = os.path.join(_working_dir, "results", "beiwe_id_list_with_dates_2022-11-17.csv")
cleaned_demog_path = os.path.join(_processed_dir, "demog_clean.csv")
| 22 | + |
# Read and clean external data: study start and end
ext_file0 = pd.read_excel(ext_file_path, sheet_name=0).rename(columns=str.lower)

# Remove participants to be excluded (rows where 'exclude_p' is set).
# Take an explicit copy so the assignments below modify an independent frame
# rather than a slice of ext_file0.
ext_file1 = ext_file0[ext_file0['exclude_p'].isna()].copy()

# Format dates: turn Excel serial numbers into datetimes. Plain column
# assignment replaces the column (and its dtype); `.loc[:, col] = ...` would
# instead try to write into the existing column in place.
ext_file1['study_start'] = ext_file1['study_start'].apply(excel_numeric_to_date3)

# Manual corrections for individual participants
# (use variables from the config file -- not publicly available to keep them anonymous)
ext_file1.loc[ext_file1['beiwe_id'] == config.BEIWE_ID_FIX_1, 'study_start'] = pd.to_datetime('2020-09-25')
ext_file1.loc[ext_file1['beiwe_id'] == config.BEIWE_ID_FIX_2, 'study_start'] = pd.to_datetime('2021-11-19')

# For these participants the study end is set to 168 days after the study
# start, provided every matching row has a start date.
for fix_id in (config.BEIWE_ID_FIX_3, config.BEIWE_ID_FIX_4, config.BEIWE_ID_FIX_5):
    fix_rows = ext_file1['beiwe_id'] == fix_id
    if not fix_rows.any():
        continue
    start_date = ext_file1.loc[fix_rows, 'study_start']
    if start_date.notna().all():
        ext_file1.loc[fix_rows, 'study_end'] = start_date + pd.DateOffset(days=168)

# Ensure 'study_start' and 'study_end' are in datetime format; values that
# cannot be parsed become NaT.
ext_file1['study_start'] = pd.to_datetime(ext_file1['study_start'], errors='coerce')
ext_file1['study_end'] = pd.to_datetime(ext_file1['study_end'], errors='coerce')

print(ext_file1[['study_start', 'study_end']].dtypes)

# Calculate the duration of observation in whole days
ext_file1['obs_duration'] = (ext_file1['study_end'] - ext_file1['study_start']).dt.days

# Save cleaned data to CSV
ext_file1.to_csv(dat_F_path, index=False)
| 55 | + |
def _read_demog(csv_path, age_cat, renames):
    """Load one demographics CSV and normalise it for combining.

    Column names are lower-cased, an 'age_cat' column is filled with
    ``age_cat``, the columns in ``renames`` are renamed, and rows whose
    'beiwe_id' is missing or an empty string are dropped.
    """
    frame = pd.read_csv(csv_path)
    frame = frame.rename(columns=str.lower)
    frame['age_cat'] = age_cat
    frame = frame.rename(columns=renames)
    frame = frame.dropna(subset=['beiwe_id'])
    return frame.query("beiwe_id != ''")


# Demographics from the 'fc' export, labelled 'adol'
dat_demog_fc = _read_demog(
    dat_demog_fc_path,
    'adol',
    {'sex2_new': 'sex', 'race2_new': 'race', 'age_new': 'age'},
)

# Demographics from the other export, labelled 'adult'
dat_demog_nofc = _read_demog(
    dat_demog_nofc_path,
    'adult',
    {'sex_new': 'sex', 'race2_new': 'race', 'age_new': 'age'},
)

# Stack both sources, 'fc' rows first
dat_demog = pd.concat([dat_demog_fc, dat_demog_nofc])

# Keep only the first row seen for each beiwe_id
dat_demog = dat_demog.drop_duplicates(subset=['beiwe_id'], keep='first')

# Write the cleaned demographics table
dat_demog.to_csv(cleaned_demog_path, index=False)
| 77 | + |
# Build the master file: one table combining start/end dates and demographics.
# Each side gets a marker column so the origin of a row survives the merge.
ext_file_F = ext_file1.assign(has_start_end=1)
dat_demog_tojoin = dat_demog.assign(has_demog=1)

# Outer join keeps every beiwe_id found in either table
beiwe_masterfile = pd.merge(ext_file_F, dat_demog_tojoin, how='outer', on='beiwe_id')

# Rows missing from one side have no marker there; record that as 0
for flag_col in ('has_start_end', 'has_demog'):
    beiwe_masterfile[flag_col] = beiwe_masterfile[flag_col].fillna(0).astype(int)

# Write the master file next to the other processed participant data
beiwe_masterfile_path = os.path.join(
    os.getcwd(),
    "data_nock_lab",
    "data_participants_other_processed",
    "beiwe_id_masterfile.csv",
)
beiwe_masterfile.to_csv(beiwe_masterfile_path, index=False)
0 commit comments