1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
|
# Last modified on 8 Aug 2019
TISSUE.FILE <- '../Data/information/experiment.and.tissue.txt'
DATA.FILE <- '../Data/history/expr/TPM.txt'
TEMP.DIR <- '../Data/temp'
TARGET.FILE <- '../Data/temp/all_targets.txt'
TF.FILE <- '../Data/temp/all_tfs.txt'
tau <- 0.60
# Make sure we have required files
if (! file.exists(TISSUE.FILE)) {
stop(sprintf('[correlation_per_tissue.R] Unable to find %s', TISSUE.FILE))
}
if (! file.exists(DATA.FILE)) {
stop(sprintf('[correlation_per_tissue.R] Unable to find %s', DATA.FILE))
}
if (! dir.exists(TEMP.DIR)) {
stop(sprintf('[correlation_per_tissue.R] Unable to find directory %s', TEMP.DIR))
}
if (! file.exists(TARGET.FILE)) {
stop(sprintf('[correlation_per_tissue.R] Unable to find %s', TARGET.FILE))
}
if (! file.exists(TF.FILE)) {
stop(sprintf('[correlation_per_tissue.R] Unable to find %s', TF.FILE))
}
X0 <- read.table(DATA.FILE, header=TRUE, check.names=FALSE)
all.id <- X0$gene_id
X0$gene_id <- NULL # remove column gene_id
row.names(X0) <- all.id # add row names
all_genes <- rownames(X0)
tissue <- read.table(TISSUE.FILE, header=TRUE, check.names=FALSE, sep='\t')
labels <- as.vector(tissue$suggested.tissue)
labels <- unlist(lapply(labels, function(x) {e<-regexpr("\\.", x)[1]; if (e > 0) {y<-substr(x, 1, e-1)} else {x} })) # remove subcategories
unique.label <- unique(labels)
targets <- read.table(TARGET.FILE, header=FALSE)
tfs <- read.table(TF.FILE, header=FALSE)
targets <- as.vector(targets$V1)
tfs <- as.vector(tfs$V1)
##############################################################
get.index <- function(X, tissue.matrix, tissue.name) {
labels <- as.vector(tissue.matrix$suggested.tissue)
labels <- unlist(lapply(labels, function(x) {e<-regexpr("\\.", x)[1]; if (e > 0) {y<-substr(x, 1, e-1)} else {x} })) # remove subcategories
index <- c()
count <- 1
for (rseqid in colnames(X)) {
i <- which(as.vector(tissue.matrix$run.id) == rseqid)
if (length(i) != 0) {
suggested.tissue.name <- labels[i]
if (tissue.name == suggested.tissue.name) {
index <- c(index, count)
}
}
count <- count + 1
}
index
}
##############################################################
# for each tissue type, get a correlation matrix
for (ul in unique.label) {
index.rnaseq <- get.index(X0, tissue, ul)
if (length(index.rnaseq) >= 50) {
OUTPUT.FILE <- paste('../Data/temp/edges.txt.simple.correlation.tissue', ul, 'txt', sep='.')
X <- as.matrix(X0[, index.rnaseq])
sd.1 <- apply(X, 1, sd) # sd of each row
s0 <- apply(X, 1, function(c) sum(c==0)) # number of zeros in each row
sd.tau <- (quantile(sd.1)[1] + quantile(sd.1)[2]) / 2.0 # min SD
good <- sd.1 > max(sd.tau, 0.05)
tf_good <- which( good & (all_genes %in% tfs) == T )
target_good <- which( good & (all_genes %in% targets) == T )
# Compute correlation coefficient
X <- log(X + 1)
X[X<0.01] <- NA
if (length(tf_good) > 1) {
c <- cor(t(X[target_good,]), t(X[tf_good,]), use='pairwise.complete.obs')
} else {
c <- cor(t(X[target_good,]), t(X[c(tf_good, tf_good), ]), use='pairwise.complete.obs')
}
index <- !is.na(c) & abs(c) >= tau & abs(c) <= 0.99
row_names <- rownames(c)
col_names <- colnames(c)
result <- data.frame(row = row_names[row(c)[index]], col = col_names[col(c)[index]], r = c[index], tissue=rep(ul, sum(index)), numrnaseqids=rep(length(index.rnaseq), sum(index)))
# write results
write.table(result, OUTPUT.FILE, col.names=F, row.names=F, sep='\t', quote=F)
}
}
|