1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
|
k.lst <- c(3)
target <- 'AT2G28507'
id2 <- target
tfs <- c('AT5G54230','AT5G65310','AT5G17300','AT1G14687','AT4G00730','AT3G13810','AT1G52880','AT1G74480','AT1G30490','AT4G26840','AT5G03150','AT5G66730','AT2G02450','AT1G49480','AT1G69780','AT3G04070','AT2G02070','AT1G03840','AT5G01380','AT3G61150','AT2G22430','AT2G25930','AT5G47370','AT4G28500','AT1G01060','AT5G03790','AT5G13180','AT1G28470','AT1G69490','AT1G55110','AT3G60580','AT4G36740','AT1G51220','AT1G19850','AT3G15500','AT2G02080','AT1G75240')
conditions <- c('C0002000000374','C0002000000147 C0002000000148','C0002000000440','C0002000000220','C0002000000055','C0002000000221','C0002000000403 C0002000000404','C0002000000436','C0002000000421','C0001000011111 C0001000012119','C0002000000334','C0002000000217 C0002000000218','C0002000000019','C0002000000434','C0002000000126','C0002000000026','C0002000000332','C0002000000351','C0002000000313','C0002000000318','C0002000000149','C0001000007335','C0002000000316 C0002000000317','C0002000000448','C0002000000346','C0002000000348 C0002000000349','C0002000000044','C0002000000450','C0002000000405 C0002000000406','C0002000000333','C0002000000192','C0002000000143','C0002000000505','C0002000000352','C0002000000031','C0002000000331','C0002000000140')
recent.edge <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
jsonTPM.dir <- '/home/hui/network/v03/Data/history/expr/jsonTPM_20170424_154323'
AGINAME_FILE <- '/home/hui/network/v03/Data/information/AGI-to-gene-names_v2.txt'
post.translation <- function(x, y) {
mean.x <- mean(x)
sd.x <- sd(x)
index <- x > mean.x - sd.x & x < mean.x + sd.x
sd.y <- sd(y[index])
result <- list(value=ifelse(mean.x < 2.0, 0.0, (mean.x/max(x)) * sd.y * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
}
post.translation.2 <- function(x, y) {
# x is consititutively high while y varies a lot
mean.x <- mean(x)
sd.x <- max(sd(x), 1) # a number above 1
index <- x > mean.x - sd.x & x < mean.x + sd.x # points within the window +/- sd.x
sd.y <- quantile(y[index],0.85)-quantile(y[index],0.15) # dispersion of y within the window
sd.y.2 <- quantile(y,0.85)-quantile(y,0.15) # dispersion of all y
v.disp <- sd.y/max(1, sd.y.2) # how disperse y is within the windown, a number between 0 and 1
# value measure dispersion of y and percent of points within a window
result <- list(value=ifelse(mean.x < 2.0, 0.0, v.disp * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
}
post.translation.3 <- function(x, y) {
# x is consititutively high while y varies a lot
mean.x <- mean(x)
upper.percentile <- 0.85 # used for computing vertical dispersion
lowest.n <- 3 # number of points with lowest x values
min.mean.x <- max(2.0, quantile(x, 0.25)) # mean of x must be greater than this value
sd.x <- min(sd(x), 1) # a number between 0 and 1
index <- x > mean.x - sd.x & x < mean.x + sd.x # points within the window +/- sd.x
sd.y <- quantile(y[index],upper.percentile)-quantile(y[index],1-upper.percentile) # dispersion of y within the window
sd.y.2 <- quantile(y,upper.percentile)-quantile(y,1-upper.percentile) # dispersion of all y
v.disp <- sd.y/max(1, sd.y.2) # how disperse y is within the window, a number between 0 and 1
rst <- sort(x, index.return=T)
top.n <- sum(rst$x < 1)
top.n <- max(1, min(top.n, lowest.n))
small.y <- min(mean(y[rst$ix[1:top.n]]), mean(y[x<1])) # use the smaller value
small.y <- ifelse(is.nan(small.y)==T, 999, small.y)
# value measure dispersion of y and percent of points within a window
result <- list(valid=small.y, value=ifelse(mean.x < min.mean.x, 0.0, v.disp * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
}
in.component <- function(posterior, k) {
# posterior is an Nxk matrix, each row is a data points, and each col is prob belonging to a component
p = posterior[,k]
n = length(p)
index <- rep(F,n)
for (i in 1:n) {
if (p[i] > runif(1)) {
index[i] = T
}
}
result <- index
}
####### Read data #########################################
CORR_THRESHOLD <- 0.7
agi <- read.table(AGINAME_FILE, sep='\t', header=FALSE, row.names=1, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
#######################################################
library(mixtools)
library(rjson)
name2 <- agi[id2,1]
result <- ''
for (i in 1:length(tfs)) {
if (recent.edge[i] == 1) {
next
}
curr.date <- gsub('-','',Sys.Date())
id1 <- tfs[i]
name1 <- agi[id1,1]
cond <- conditions[i]
file.x <- paste(jsonTPM.dir, paste(id1, '.json', sep=''), sep='/')
if (!file.exists(file.x)) { next }
x <- as.data.frame(fromJSON(file = file.x))
x <- log(x+1)
rcond.x <- names(x)
x <- as.vector(t(x)) # convert it to a vector
file.y <- paste(jsonTPM.dir, paste(id2, '.json', sep=''), sep='/')
if (!file.exists(file.y)) { break }
y <- as.data.frame(fromJSON(file = file.y))
y <- log(y+1)
rcond.y <- names(y)
y <- as.vector(t(y)) # convert it to a vector
rna.sample.id <- rcond.x
if (all(rcond.x == rcond.y) == FALSE | id1 == id2) { # if the IDs in two json files do not match, or target is the same as tf, then ignore
next
}
MIN_SIZE <- min(100, max(10, ceiling(0.5 * length(x))))
index <- x < 0.01 | y < 0.01 # don't include data that is too small
x <- x[!index]
y <- y[!index]
if (length(x) < MIN_SIZE) {
next
}
r <- cor(x, y)
if (abs(r) >= CORR_THRESHOLD) {
s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\n', id2, name2, id1, name1, r, 'all', '.', cond, '.', curr.date)
result <- paste(result, s, sep='')
next # a good correlation is found using all experiments, so not necessary to look further
}
rna.sample.id <- rna.sample.id[!index] # this step is important to make the following index work
pos_r_max <- -2
pos_r_N <- 0
pos_r_index <- c()
pos_r_loglik <- -100000000
neg_r_max <- 2
neg_r_N <- 0
neg_r_index <- c()
neg_r_loglik <- -100000000
for (k in k.lst) {
em.out <- regmixEM(y, x, maxit=150, epsilon=1e-04, k=k)
for (j in seq(1,k,1)) {
index <- in.component(em.out$posterior, j)
size <- sum(index)
r <- cor(em.out$x[index,2], em.out$y[index])
if (!is.na(r) && r >= CORR_THRESHOLD && size >= MIN_SIZE && r > pos_r_max && size > pos_r_N) {
pos_r_max <- r
pos_r_N <- size
pos_r_index <- index
pos_r_loglik <- em.out$loglik
}
if (!is.na(r) && r <= -CORR_THRESHOLD && size >= MIN_SIZE && r < neg_r_max && size > neg_r_N) {
neg_r_max <- r
neg_r_N <- size
neg_r_index <- index
neg_r_loglik <- em.out$loglik
}
}
}
hit <- 0
if (pos_r_max > 0) { # has a good positive correlation
sub.cond <- paste(rna.sample.id[pos_r_index], collapse=' ')
s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, pos_r_max, 'mix', sub.cond, cond, pos_r_loglik, curr.date)
result <- paste(result, s, sep='')
hit <- hit + 1
}
if (neg_r_max < 0) { # has a good negative correlation
sub.cond <- paste(rna.sample.id[neg_r_index], collapse=' ')
s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, neg_r_max, 'mix', sub.cond, cond, neg_r_loglik, curr.date)
result <- paste(result, s, sep='')
hit <- hit + 1
}
if (hit == 0) {
t <- post.translation.3(x, y)
post.r <- t$percent
if (t$valid < quantile(y,0.25) & t$value > 0.69 & post.r >= 0.70 & length(t$index) > MIN_SIZE) {
sub.cond <- paste(rna.sample.id[t$index], collapse=' ')
s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\n', id2, name2, id1, name1, post.r, 'mix', sub.cond, cond, '.', curr.date)
result <- paste(result, s, sep='')
}
}
}
output.file <- paste('../Data/history/edges/one_target/edges.txt', id2, format(Sys.time(), '%b.%d.%Y.%X'), 'k3', sep='.')
if (result != '') cat(result, file=output.file, sep='')
|