brain: add python and R code to local repository.

author: Hui Lan <lanhui@zjnu.edu.cn> 2019-12-04 19:03:19 +0800
committer: Hui Lan <lanhui@zjnu.edu.cn> 2019-12-04 19:03:19 +0800
commit: 97fdefab064f63642fa3ece05b807d29b459df31 (patch)
tree: a058530023224f3e35b1783996f3530c80c04bc5 /Code/MixReg.R
1 files changed, 173 insertions, 0 deletions
diff --git a/Code/MixReg.R b/Code/MixReg.R
new file mode 100644
index 0000000..a109430
--- /dev/null
+++ b/Code/MixReg.R
@@ -0,0 +1,173 @@
+k.lst <- c(3)
+target <- 'AT2G28507'
+id2 <- target
+tfs <- c('AT5G54230','AT5G65310','AT5G17300','AT1G14687','AT4G00730','AT3G13810','AT1G52880','AT1G74480','AT1G30490','AT4G26840','AT5G03150','AT5G66730','AT2G02450','AT1G49480','AT1G69780','AT3G04070','AT2G02070','AT1G03840','AT5G01380','AT3G61150','AT2G22430','AT2G25930','AT5G47370','AT4G28500','AT1G01060','AT5G03790','AT5G13180','AT1G28470','AT1G69490','AT1G55110','AT3G60580','AT4G36740','AT1G51220','AT1G19850','AT3G15500','AT2G02080','AT1G75240')
+conditions <- c('C0002000000374','C0002000000147 C0002000000148','C0002000000440','C0002000000220','C0002000000055','C0002000000221','C0002000000403 C0002000000404','C0002000000436','C0002000000421','C0001000011111 C0001000012119','C0002000000334','C0002000000217 C0002000000218','C0002000000019','C0002000000434','C0002000000126','C0002000000026','C0002000000332','C0002000000351','C0002000000313','C0002000000318','C0002000000149','C0001000007335','C0002000000316 C0002000000317','C0002000000448','C0002000000346','C0002000000348 C0002000000349','C0002000000044','C0002000000450','C0002000000405 C0002000000406','C0002000000333','C0002000000192','C0002000000143','C0002000000505','C0002000000352','C0002000000031','C0002000000331','C0002000000140')
+recent.edge <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
+jsonTPM.dir <- '/home/hui/network/v03/Data/history/expr/jsonTPM_20170424_154323'
+AGINAME_FILE   <- '/home/hui/network/v03/Data/information/AGI-to-gene-names_v2.txt'
+
+post.translation <- function(x, y) {
+  mean.x <- mean(x)
+  sd.x  <- sd(x)
+  index <- x > mean.x - sd.x & x < mean.x + sd.x
+  sd.y <- sd(y[index])
+  result <- list(value=ifelse(mean.x < 2.0, 0.0, (mean.x/max(x)) * sd.y * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
+}
+
+post.translation.2 <- function(x, y) {
+  # x is consititutively high while y varies a lot
+  mean.x <- mean(x)
+  sd.x  <- max(sd(x), 1) # a number above 1
+  index <- x > mean.x - sd.x & x < mean.x + sd.x # points within the window +/- sd.x
+  sd.y <- quantile(y[index],0.85)-quantile(y[index],0.15) # dispersion of y within the window
+  sd.y.2 <- quantile(y,0.85)-quantile(y,0.15) # dispersion of all y
+  v.disp <- sd.y/max(1, sd.y.2) # how disperse y is within the windown, a number between 0 and 1
+  # value measure dispersion of y and percent of points within a window
+  result <- list(value=ifelse(mean.x < 2.0, 0.0, v.disp * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
+}
+
+post.translation.3 <- function(x, y) {
+  # x is consititutively high while y varies a lot
+  mean.x <- mean(x)
+  upper.percentile <- 0.85 # used for computing vertical dispersion
+  lowest.n <- 3 # number of points with lowest x values
+  min.mean.x <- max(2.0, quantile(x, 0.25)) # mean of x must be greater than this value
+  sd.x  <- min(sd(x), 1) # a number between 0 and 1
+  index <- x > mean.x - sd.x & x < mean.x + sd.x # points within the window +/- sd.x
+  sd.y <- quantile(y[index],upper.percentile)-quantile(y[index],1-upper.percentile) # dispersion of y within the window
+  sd.y.2 <- quantile(y,upper.percentile)-quantile(y,1-upper.percentile) # dispersion of all y
+  v.disp <- sd.y/max(1, sd.y.2) # how disperse y is within the window, a number between 0 and 1
+
+  rst <- sort(x, index.return=T)
+  top.n <- sum(rst$x < 1)
+  top.n <- max(1, min(top.n, lowest.n))
+  small.y <- min(mean(y[rst$ix[1:top.n]]), mean(y[x<1])) # use the smaller value
+  small.y <- ifelse(is.nan(small.y)==T, 999, small.y)
+  # value measure dispersion of y and percent of points within a window
+  result <- list(valid=small.y, value=ifelse(mean.x < min.mean.x, 0.0, v.disp * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
+}
+
+in.component <- function(posterior, k) {
+  # posterior is an Nxk matrix, each row is a data points, and each col is prob belonging to a component
+  p = posterior[,k]
+  n = length(p)
+  index <- rep(F,n)
+  for (i in 1:n) {
+    if (p[i] > runif(1)) {
+      index[i] = T
+    }
+  }
+  result <- index
+}
+
+####### Read data #########################################
+CORR_THRESHOLD <- 0.7
+agi        <- read.table(AGINAME_FILE, sep='\t', header=FALSE, row.names=1, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
+#######################################################
+library(mixtools)
+library(rjson)
+name2 <- agi[id2,1]
+result <- ''
+for (i in 1:length(tfs)) {
+    if (recent.edge[i] == 1) {
+        next
+    }
+    curr.date <- gsub('-','',Sys.Date())
+    id1 <- tfs[i]
+    name1 <- agi[id1,1]
+    cond <- conditions[i]
+
+    file.x <- paste(jsonTPM.dir, paste(id1, '.json', sep=''), sep='/')
+    if (!file.exists(file.x)) { next }
+    x <- as.data.frame(fromJSON(file = file.x))
+    x <- log(x+1)
+    rcond.x <- names(x)
+    x <- as.vector(t(x)) # convert it to a vector
+
+    file.y <- paste(jsonTPM.dir, paste(id2, '.json', sep=''), sep='/')
+    if (!file.exists(file.y)) { break }
+    y <- as.data.frame(fromJSON(file = file.y))
+    y <- log(y+1)
+    rcond.y <- names(y)
+    y <- as.vector(t(y)) # convert it to a vector
+
+    rna.sample.id <- rcond.x
+    if (all(rcond.x == rcond.y) == FALSE | id1 == id2) { # if the IDs in two json files do not match, or target is the same as tf, then ignore
+       next
+    }
+
+    MIN_SIZE <- min(100, max(10, ceiling(0.5 * length(x))))
+
+    index <- x < 0.01 | y < 0.01 # don't include data that is too small
+    x <- x[!index]
+    y <- y[!index]
+
+    if (length(x) < MIN_SIZE) {
+       next
+    }
+    r <- cor(x, y)
+    if (abs(r) >= CORR_THRESHOLD) {
+        s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\n', id2, name2, id1, name1, r, 'all', '.', cond, '.', curr.date)
+        result <- paste(result, s, sep='')
+        next  # a good correlation is found using all experiments, so not necessary to look further
+    }
+
+    rna.sample.id <- rna.sample.id[!index] # this step is important to make the following index work
+
+    pos_r_max   <- -2
+    pos_r_N     <- 0
+    pos_r_index <- c()
+    pos_r_loglik <- -100000000
+
+    neg_r_max   <- 2
+    neg_r_N     <- 0
+    neg_r_index <- c()
+    neg_r_loglik <- -100000000
+
+    for (k in k.lst) {
+        em.out <- regmixEM(y, x, maxit=150, epsilon=1e-04, k=k)
+        for (j in seq(1,k,1)) {
+            index <- in.component(em.out$posterior, j)
+            size <- sum(index)
+            r <- cor(em.out$x[index,2], em.out$y[index])
+            if (!is.na(r) && r >= CORR_THRESHOLD && size >= MIN_SIZE && r > pos_r_max && size > pos_r_N) {
+                pos_r_max <- r
+                pos_r_N   <- size
+                pos_r_index <- index
+                pos_r_loglik <- em.out$loglik
+            }
+            if (!is.na(r) && r <= -CORR_THRESHOLD && size >= MIN_SIZE && r < neg_r_max && size > neg_r_N) {
+                neg_r_max <- r
+                neg_r_N   <- size
+                neg_r_index <- index
+                neg_r_loglik <- em.out$loglik
+            }
+        }
+    }
+    hit <- 0
+    if (pos_r_max > 0) { # has a good positive correlation
+        sub.cond <- paste(rna.sample.id[pos_r_index], collapse=' ')
+        s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, pos_r_max, 'mix', sub.cond, cond, pos_r_loglik, curr.date)
+        result <- paste(result, s, sep='')
+        hit <- hit + 1
+    }
+    if (neg_r_max < 0) { # has a good negative correlation
+        sub.cond <- paste(rna.sample.id[neg_r_index], collapse=' ')
+        s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, neg_r_max, 'mix', sub.cond, cond, neg_r_loglik, curr.date)
+        result <- paste(result, s, sep='')
+        hit <- hit + 1
+    }
+    if (hit == 0) {
+        t <- post.translation.3(x, y)
+        post.r <- t$percent
+        if (t$valid < quantile(y,0.25) & t$value > 0.69 & post.r >= 0.70 & length(t$index) > MIN_SIZE) {
+          sub.cond <- paste(rna.sample.id[t$index], collapse=' ')
+          s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\n', id2, name2, id1, name1, post.r, 'mix', sub.cond, cond, '.', curr.date)
+          result <- paste(result, s, sep='')
+        }
+    }
+}
+
+output.file <- paste('../Data/history/edges/one_target/edges.txt', id2, format(Sys.time(), '%b.%d.%Y.%X'), 'k3', sep='.')
+if (result != '') cat(result, file=output.file, sep='')
author	Hui Lan <lanhui@zjnu.edu.cn>	2019-12-04 19:03:19 +0800
committer	Hui Lan <lanhui@zjnu.edu.cn>	2019-12-04 19:03:19 +0800
commit	97fdefab064f63642fa3ece05b807d29b459df31 (patch)
tree	a058530023224f3e35b1783996f3530c80c04bc5 /Code/MixReg.R