-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathScoreAnnotatedMuts.py
97 lines (78 loc) · 4.98 KB
/
ScoreAnnotatedMuts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
'''
Created on May 18, 2016
@author: Husen M. Umer
'''
import pandas as pd
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
'''
Given a list of annotated mutations whose scores are known find the weight for each annotation.
input:
- a tab separated file containing the scores for each mutation and a score attribute. A header line should exist including names of the annotations
- a list containing names of annotaton to use in the regression model
process: run a linear regression model to find the coefficient for each of the annotations
output: return a dict containing the weights for each annotation
'''
def getWeightsforAnnotations(annoated_scored_muts_input_file, outcome_col_name="", outcome_col_index=0, list_of_cols_names=[], list_of_cols_index=[], take_abs_entropy_diff=True, log_base=10):
all_names_cols_list = [outcome_col_name]
all_names_cols_list.extend(list_of_cols_names)
all_index_cols_list = [outcome_col_index]
all_index_cols_list.extend(list_of_cols_index)
pcsv = pd.read_csv(annoated_scored_muts_input_file, header=0, sep='\t', na_filter=True, na_values=["NA", "None"], usecols=all_index_cols_list, names=all_names_cols_list)#list the columns to be read from hear and the decision
df = pd.DataFrame(pcsv).dropna()
if take_abs_entropy_diff:
if 'Entropy_diff' in list_of_cols_names:
df['Entropy_diff'] = abs(df['Entropy_diff'])
df[outcome_col_name] = np.where(df[outcome_col_name]==True, 1, 0)
if 'normal_expression_level_of_this_motif' in list_of_cols_names:
df['normal_expression_level_of_this_motif']= np.where(df['normal_expression_level_of_this_motif']==0, 1, df['normal_expression_level_of_this_motif'])
df['normal_expression_level_of_this_motif'] = np.log(np.array(df['normal_expression_level_of_this_motif']))/np.log(log_base)
if 'other_factors' in list_of_cols_names:
df['other_factors']= np.where(df['other_factors']==0, 1, df['other_factors'])
df['other_factors'] = np.log(np.array(df['other_factors']))/np.log(log_base)
#plot_corr(df)
if 'CAGE_expr' in list_of_cols_names:
df['CAGE_expr']= np.where(df['CAGE_expr']==0, 1, df['CAGE_expr'])
df['CAGE_expr'] = np.log(np.array(df['CAGE_expr']))/np.log(log_base)
if 'chromatin_states' in list_of_cols_names:
df = pd.get_dummies(df, columns=['chromatin_states'], sparse=True)
list_of_cols_names = list(df.columns)
del list_of_cols_names[list_of_cols_names.index(outcome_col_name)]
if 'replicationdomain' in list_of_cols_names:
df = pd.get_dummies(df, columns=['replicationdomain'], sparse=True)
list_of_cols_names = list(df.columns)
del list_of_cols_names[list_of_cols_names.index(outcome_col_name)]
df['intercept'] = 1
list_of_cols_names.append('intercept')
X = df[list_of_cols_names]
y = df[outcome_col_name]
#error here
logit = sm.Logit(y,X).fit(method='bfgs', full_output=True)#, maxiter=1000000000
print(logit.summary())
for k in X.columns:
kk = X[k]
print(y.corr(kk))
print("log odds ratio is reported: logit.params:")
return logit.params#think of negative values (excluded) ((weightXi[without exp, just use the result from here logit.params]*valueXi)+(weightXi+1*valueXi+1))
#return np.exp(logit.params)# think of values less than one (should be excluded, may be). (weightXi^valueXi)*weightXi+1*exp(valueXi+1)))
def plot_corr(df,size=10):
'''Function plots a graphical correlation matrix for each pair of columns in the dataframe.
Input:
df: pandas DataFrame
size: vertical and horizontal size of the plot'''
corr = df.corr()
fig, ax = plt.subplots(figsize=(size, size))
sm = ax.matshow(corr)
cbar = plt.colorbar(sm, alpha=0.05, aspect=16, shrink=0.4)
cbar.solids.set_edgecolor("face")
plt.xticks(range(len(corr.columns)), corr.columns);
plt.yticks(range(len(corr.columns)), corr.columns);
def get_col_names():
col_names = ['MutChr', 'MutStart', 'MutStop', 'Ref', 'Alt', 'Cancer-project', 'Mut_score', 'vcf_info', 'sample_file_id',
'MotifChr', 'MotifStart', 'MotifEnd', 'Motif_name', 'Motif_score', 'Motif_strand', 'Entropy_diff', 'MutMotif_position',
'normal_expression_level_of_this_motif', 'tumor_expression_level_of_this_motif',]
col_names.extend(["same_factor", "other_factors", "dnase", "contactingdomain", "loopdomain", "CAGE_expr", "chromatin_states", "replicationdomain"])
col_names.extend(["same_cell_same_factor","same_cell_dnase","same_cell_chromatin_states","same_cell_replicationdomain","same_cell_contactingdomain","same_cell_loopdomain","same_cell_CAGE_expr","same_cell_other_factors_names","same_cell_other_factors", "other_cells_same_factor","other_cells_dnase","other_cell_chromatin_states","other_cells_replicationdomain","other_cells_contactingdomain","other_cells_loopdomain","other_cells_CAGE_expr"])
col_names.append('WAScore')
return col_names