This repository was archived by the owner on Jan 20, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdemoTransmartRClientCommands.R
193 lines (158 loc) · 11 KB
/
demoTransmartRClientCommands.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# Copyright 2014 Janssen Research & Development, LLC.
# Copyright (c) 2013 The Hyve B.V.
#
# This file is part of tranSMART R Client: R package allowing access to
# tranSMART's data via its RESTful API.
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version, along with the following terms:
#
# 1. You may convey a work based on this program in accordance with
# section 5, provided that you retain the above notices.
# 2. You may convey verbatim copies of this program code as you receive
# it, in any medium, provided that you retain the above notices.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program. If not, see <http://www.gnu.org/licenses/>.
# This file contains some basic demo commands
# Example steps to authenticate with, connect to, and retrieve data from tranSMART
# load package
require("transmartRClient")
# Connect to the Hyve's demo transmart instance with the connectToTransmart function.
# You can log in as a guest user with username "user" and password "user".
# Occasionally, access to this demo server will be temporarily blocked when a demo is being given, in which case you cannot log in with these credentials.
# In that case: try again later.
# If you are not able to login for a persistent time, you can email us for help at: <<SOME_ADRESS>
#connectToTransmart("http://transmart-demo.thehyve.net/transmart")
# connect to server on local VM:
connectToTransmart("http://localhost:8080/transmart")
#Alternatively, you can try to log in to the demo server of the transmart foundation.
# We cannot, however, guarantee that this server will be running, that it has the right tranSMART version and which data will be present there
# Always check what the latest adress is of the transmart demo server and if this matches with the adress filled in in the connectToTransmart function below.
# You can look this up at the foundation's website http://transmartfoundation.org - Go to "PLATFORM" in the menu on top of the page and
# follow the link(s) to the transmart foundation demo instance.
#connectToTransmart("http://75.124.74.64/transmart")
# retrieve a list of the available studies in the database:
studies <- getStudies()
print(studies)
# to access the studies programmatically use for example:
# study<-studies$id[1]
# for the examples below we will use GSE8581, a study that aimed to identify biomarkers for COPD
study <- "GSE8581"
# Retrieve Clinical Data
allObservations <- getObservations(study, as.data.frame = T)
#the clinical data is stored in three tables:
# - an observation table with the clinical measurements
# - a subjectInfo table which contains information about the subjects/patients, such as their age or sex
# - and a table with information about the variables/concepts and where they can be
# found in the data tree of tranSMART (see the panel "Navigate terms" in the "Analyze" tab of the
# tranSMART web app: http://transmart-demo.thehyve.net/transmart/datasetExplorer/index )
summary(allObservations)
View(allObservations$observations[1:12,1:7]) #hint: if you are using RStudio you can also use the function "View" to see the data in a more user-friendly table, e.g.: View(allObservations[[1]])
View(allObservations$subjectInfo[1:12,])
View(allObservations$conceptInfo[1:10,])
# retrieve information about the concepts that are present for this study
concepts <- getConcepts(study)
View(concepts)
# You can use the concept names or the concept links to only retrieve data for a subset of the variables:
# retrieve observations for the first concept which conceptname contains "Lung Disease"
observations <- getObservations(study, concept.match = "Lung Disease", as.data.frame = T)
print(observations$observations)
# retrieve observations belonging to specific concepts by using the api.links contained in the getConcepts-result
observations <- getObservations(study,
concept.links =
c("/studies/gse8581/concepts/Subjects/Age",
"/studies/gse8581/concepts/Subjects/Sex")
)
# filter females :
observations_female <- subset(observations$observations, Sex == 'female')
observations_female[1:10,]
observations_male <- subset(observations$observations, Sex == 'male')
observations_male[1:10,]
# show distributions:
d <- density(as.integer(observations_male$Age)) # returns the density data
plot(d, col="blue") # plots the results
d <- density(as.integer(observations_female$Age)) # returns the density data
lines(d, col="red") # plots the results
# EXERCISE: make the same comparison of Age distribution between "control" and "chronic obstructive pulmonary disease" subjects
# TODO
# if a concept contains high dimensional data, use the following command to obtain this data.
# NB: if you would run the command below you will be told that you need to select one of the listed projections. For the full command,
# see next line below.
# getHighdimData(study.name = study, concept.match = "Lung")
# As noted above, you will be told that one of the listed projections needs to be selected.
# Set the projection argument to one of the following options:
# default_real_projection
# zscore
# log_intensity
# all_data
# The following will return the actual data, log transformed.
dataDownloaded <- getHighdimData(study.name = study, concept.match = "Lung", projection = "log_intensity")
# getHighDimData returns a list containing two objects:
# a data.frame containing the data, and a hash which maps probe names (labels) to Biomarker (e.g. gene) names
summary(dataDownloaded)
#View the data
View(dataDownloaded[["data"]])
# to view just a few columns and lines, e.g. 10 x 10, you can use:
View(dataDownloaded[["data"]][1:10,1:10])
#The hash will return the name of the bioMarker when it is supplied with the probe name
# (note: R automatically prepends "X" in front of column names that start with a numerical value.
# Therefore probe "1562446_at" will be found back in the data as "X1562446_at")
dataDownloaded[["labelToBioMarkerMap"]]["1562446_at"]
# mapping table with probes x gene/biomarker name:
dataDownloaded[["labelToBioMarkerMap"]]
# take a look at the data to find where gene expression columns start:
data<-dataDownloaded[["data"]]
data[1:4,1:10]
# select gene expression data, which is the data *excluding* columns 1 to 6:
expression_data<-data[,-c(1:6)]
# take a look at the dimensions
dim(expression_data)
# add patientId as the row name for the expression_data matrix:
rownames(expression_data)<-data$patientId
View(expression_data)
#Make a heatmap
# If the dimensions of the expression_data table are large, you may want to create a subset of the data first.
# In this case we use the following probelist as a *subset* for the probes:
# (extracted from 1: "Bhattacharya S., Srisuma S., Demeo D. L., et al., Molecular biomarkers for quantitative and discrete COPD phenotypes.
# American Journal of Respiratory Cell and Molecular Biology. 2009;40(3):359–367. doi: 10.1165/rcmb.2008-0114OC.")
probeNames<- c("1552622_s_at","1555318_at","1557293_at","1558280_s_at","1558411_at","1558515_at","1559964_at","204284_at","205051_s_at","205528_s_at","208835_s_at","209377_s_at","209815_at","211548_s_at","212179_at","212263_at","213156_at","213269_at","213650_at","213878_at","215359_x_at","215933_s_at","218352_at","218490_s_at","220094_s_at","220906_at","220925_at","222108_at","224711_at","225318_at","225595_at","225835_at","225892_at","226316_at","226492_at","226534_at","226666_at","226800_at","227095_at","227105_at","227148_at","227812_at","227852_at","227930_at","227947_at","228157_at","228630_at","228665_at","228760_at","228850_s_at","228875_at","228963_at","229111_at","229572_at","230142_s_at","230986_at","232014_at","235423_at","235810_at","238712_at","238992_at","239842_x_at","239847_at","241936_x_at","242389_at")
probeNames<- paste("X", probeNames, sep = "") #note: this is because R automatically prepends "X" in front of column names that start with a numerical value. Therefore prepend "X"
# select only the cases and controls (excluding the patients for which the lung disease is not specified). Note: in the observation table the database IDs
# are used to identify the patients and not the patient IDs that are used in the gene expression dataset
cases <- allObservations$observations$subject.id[ allObservations$observations$'Subjects_Lung Disease' == "chronic obstructive pulmonary disease"]
controls <- allObservations$observations$subject.id[allObservations$observations$'Subjects_Lung Disease' == "control"]
# if you are using the Transmart Foundation's instance, you might need to run the following instead:
#cases <- allObservations$observations$subject.id[!is.na(allObservations$observations$'Subjects_Lung Disease_chronic obstructive pulmonary disease')]
#controls <- allObservations$observations$subject.id[!is.na(allObservations$observations$'Subjects_Lung Disease_control')]
# now we have the *internal database* IDs for the patients, but we need to get the patient IDs because
# this is the index of the expression_data matrix.
# These can be retrieved from the subjectInfo table: subjectInfo <- allObservations$subjectInfo
patientIDsCase <- subjectInfo$subject.inTrialId[ subjectInfo$subject.id %in% cases ]
patientIDsControl <- subjectInfo$subject.inTrialId[ subjectInfo$subject.id %in% controls]
# patient sets containing case and control patientIDs
patientSets <- c(patientIDsCase,patientIDsControl)
patientSets<-patientSets[which(patientSets %in% rownames(expression_data))]
#make a subset of the data based on the selected patientSets and the probelist, and transpose the table so that the rows now contain probe names
subset<-t(expression_data[patientSets,probeNames])
#for ease of recognition: append "Case" and "Control" to the patient names
colnames(subset)[colnames(subset)%in% patientIDsCase] <- paste(colnames(subset)[colnames(subset)%in% patientIDsCase],"Case", sep="_" )
colnames(subset)[colnames(subset)%in% patientIDsControl] <- paste( colnames(subset)[colnames(subset)%in% patientIDsControl] , "Control",sep= "_")
dim(subset)
heatmap(as.matrix(subset), scale = "row")
#there is one patient that seems to be an outlier: GSE8581GSM212810_Case
subset_without_outlier <- subset[,colnames(subset)!= "GSE8581GSM212810_Case"]
heatmap(as.matrix(subset_without_outlier), scale = "row")
# do a PCA analysis :
subset_t <- t(subset_without_outlier)
prcomp_result <- prcomp(x = subset_t)
# tip1: see View(prcomp_result$x)
# tip2 see ?plot3D::scatter3D
# tip3 use rownames(prcomp_result$x) information to make a color list and use this in the plot3D::scatter3D function (col parameter)