Code/MixReg.R


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173

k.lst <- c(3)
target <- 'AT2G28507'
id2 <- target
tfs <- c('AT5G54230','AT5G65310','AT5G17300','AT1G14687','AT4G00730','AT3G13810','AT1G52880','AT1G74480','AT1G30490','AT4G26840','AT5G03150','AT5G66730','AT2G02450','AT1G49480','AT1G69780','AT3G04070','AT2G02070','AT1G03840','AT5G01380','AT3G61150','AT2G22430','AT2G25930','AT5G47370','AT4G28500','AT1G01060','AT5G03790','AT5G13180','AT1G28470','AT1G69490','AT1G55110','AT3G60580','AT4G36740','AT1G51220','AT1G19850','AT3G15500','AT2G02080','AT1G75240')
conditions <- c('C0002000000374','C0002000000147 C0002000000148','C0002000000440','C0002000000220','C0002000000055','C0002000000221','C0002000000403 C0002000000404','C0002000000436','C0002000000421','C0001000011111 C0001000012119','C0002000000334','C0002000000217 C0002000000218','C0002000000019','C0002000000434','C0002000000126','C0002000000026','C0002000000332','C0002000000351','C0002000000313','C0002000000318','C0002000000149','C0001000007335','C0002000000316 C0002000000317','C0002000000448','C0002000000346','C0002000000348 C0002000000349','C0002000000044','C0002000000450','C0002000000405 C0002000000406','C0002000000333','C0002000000192','C0002000000143','C0002000000505','C0002000000352','C0002000000031','C0002000000331','C0002000000140')
recent.edge <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
jsonTPM.dir <- '/home/hui/network/v03/Data/history/expr/jsonTPM_20170424_154323'
AGINAME_FILE   <- '/home/hui/network/v03/Data/information/AGI-to-gene-names_v2.txt'

post.translation <- function(x, y) {
  mean.x <- mean(x)
  sd.x  <- sd(x)
  index <- x > mean.x - sd.x & x < mean.x + sd.x
  sd.y <- sd(y[index])
  result <- list(value=ifelse(mean.x < 2.0, 0.0, (mean.x/max(x)) * sd.y * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
}

post.translation.2 <- function(x, y) {
  # x is consititutively high while y varies a lot
  mean.x <- mean(x)
  sd.x  <- max(sd(x), 1) # a number above 1
  index <- x > mean.x - sd.x & x < mean.x + sd.x # points within the window +/- sd.x
  sd.y <- quantile(y[index],0.85)-quantile(y[index],0.15) # dispersion of y within the window
  sd.y.2 <- quantile(y,0.85)-quantile(y,0.15) # dispersion of all y
  v.disp <- sd.y/max(1, sd.y.2) # how disperse y is within the windown, a number between 0 and 1
  # value measure dispersion of y and percent of points within a window
  result <- list(value=ifelse(mean.x < 2.0, 0.0, v.disp * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
}

post.translation.3 <- function(x, y) {
  # x is consititutively high while y varies a lot
  mean.x <- mean(x)
  upper.percentile <- 0.85 # used for computing vertical dispersion
  lowest.n <- 3 # number of points with lowest x values
  min.mean.x <- max(2.0, quantile(x, 0.25)) # mean of x must be greater than this value
  sd.x  <- min(sd(x), 1) # a number between 0 and 1
  index <- x > mean.x - sd.x & x < mean.x + sd.x # points within the window +/- sd.x
  sd.y <- quantile(y[index],upper.percentile)-quantile(y[index],1-upper.percentile) # dispersion of y within the window
  sd.y.2 <- quantile(y,upper.percentile)-quantile(y,1-upper.percentile) # dispersion of all y
  v.disp <- sd.y/max(1, sd.y.2) # how disperse y is within the window, a number between 0 and 1

  rst <- sort(x, index.return=T)
  top.n <- sum(rst$x < 1)
  top.n <- max(1, min(top.n, lowest.n))
  small.y <- min(mean(y[rst$ix[1:top.n]]), mean(y[x<1])) # use the smaller value
  small.y <- ifelse(is.nan(small.y)==T, 999, small.y)
  # value measure dispersion of y and percent of points within a window
  result <- list(valid=small.y, value=ifelse(mean.x < min.mean.x, 0.0, v.disp * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
}

in.component <- function(posterior, k) {
  # posterior is an Nxk matrix, each row is a data points, and each col is prob belonging to a component
  p = posterior[,k]
  n = length(p)
  index <- rep(F,n)
  for (i in 1:n) {
    if (p[i] > runif(1)) {
      index[i] = T
    }
  }
  result <- index
}

####### Read data #########################################
CORR_THRESHOLD <- 0.7
agi        <- read.table(AGINAME_FILE, sep='\t', header=FALSE, row.names=1, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
#######################################################
library(mixtools)
library(rjson)
name2 <- agi[id2,1]
result <- ''
for (i in 1:length(tfs)) {
    if (recent.edge[i] == 1) {
        next
    }
    curr.date <- gsub('-','',Sys.Date())
    id1 <- tfs[i]
    name1 <- agi[id1,1]
    cond <- conditions[i]

    file.x <- paste(jsonTPM.dir, paste(id1, '.json', sep=''), sep='/')
    if (!file.exists(file.x)) { next }
    x <- as.data.frame(fromJSON(file = file.x))
    x <- log(x+1)
    rcond.x <- names(x)
    x <- as.vector(t(x)) # convert it to a vector

    file.y <- paste(jsonTPM.dir, paste(id2, '.json', sep=''), sep='/')
    if (!file.exists(file.y)) { break }
    y <- as.data.frame(fromJSON(file = file.y))
    y <- log(y+1)
    rcond.y <- names(y)
    y <- as.vector(t(y)) # convert it to a vector

    rna.sample.id <- rcond.x
    if (all(rcond.x == rcond.y) == FALSE | id1 == id2) { # if the IDs in two json files do not match, or target is the same as tf, then ignore
       next
    }

    MIN_SIZE <- min(100, max(10, ceiling(0.5 * length(x))))

    index <- x < 0.01 | y < 0.01 # don't include data that is too small
    x <- x[!index]
    y <- y[!index]

    if (length(x) < MIN_SIZE) {
       next
    }
    r <- cor(x, y)
    if (abs(r) >= CORR_THRESHOLD) {
        s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\n', id2, name2, id1, name1, r, 'all', '.', cond, '.', curr.date)
        result <- paste(result, s, sep='')
        next  # a good correlation is found using all experiments, so not necessary to look further
    }

    rna.sample.id <- rna.sample.id[!index] # this step is important to make the following index work

    pos_r_max   <- -2
    pos_r_N     <- 0
    pos_r_index <- c()
    pos_r_loglik <- -100000000

    neg_r_max   <- 2
    neg_r_N     <- 0
    neg_r_index <- c()
    neg_r_loglik <- -100000000

    for (k in k.lst) {
        em.out <- regmixEM(y, x, maxit=150, epsilon=1e-04, k=k)
        for (j in seq(1,k,1)) {
            index <- in.component(em.out$posterior, j)
            size <- sum(index)
            r <- cor(em.out$x[index,2], em.out$y[index])
            if (!is.na(r) && r >= CORR_THRESHOLD && size >= MIN_SIZE && r > pos_r_max && size > pos_r_N) {
                pos_r_max <- r
                pos_r_N   <- size
                pos_r_index <- index
                pos_r_loglik <- em.out$loglik
            }
            if (!is.na(r) && r <= -CORR_THRESHOLD && size >= MIN_SIZE && r < neg_r_max && size > neg_r_N) {
                neg_r_max <- r
                neg_r_N   <- size
                neg_r_index <- index
                neg_r_loglik <- em.out$loglik
            }
        }
    }
    hit <- 0
    if (pos_r_max > 0) { # has a good positive correlation
        sub.cond <- paste(rna.sample.id[pos_r_index], collapse=' ')
        s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, pos_r_max, 'mix', sub.cond, cond, pos_r_loglik, curr.date)
        result <- paste(result, s, sep='')
        hit <- hit + 1
    }
    if (neg_r_max < 0) { # has a good negative correlation
        sub.cond <- paste(rna.sample.id[neg_r_index], collapse=' ')
        s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, neg_r_max, 'mix', sub.cond, cond, neg_r_loglik, curr.date)
        result <- paste(result, s, sep='')
        hit <- hit + 1
    }
    if (hit == 0) {
        t <- post.translation.3(x, y)
        post.r <- t$percent
        if (t$valid < quantile(y,0.25) & t$value > 0.69 & post.r >= 0.70 & length(t$index) > MIN_SIZE) {
          sub.cond <- paste(rna.sample.id[t$index], collapse=' ')
          s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\n', id2, name2, id1, name1, post.r, 'mix', sub.cond, cond, '.', curr.date)
          result <- paste(result, s, sep='')
        }
    }
}

output.file <- paste('../Data/history/edges/one_target/edges.txt', id2, format(Sys.time(), '%b.%d.%Y.%X'), 'k3', sep='.')
if (result != '') cat(result, file=output.file, sep='')