summaryrefslogtreecommitdiff
path: root/Code/MixReg.R
diff options
context:
space:
mode:
authorHui Lan <lanhui@zjnu.edu.cn>2019-12-04 19:03:19 +0800
committerHui Lan <lanhui@zjnu.edu.cn>2019-12-04 19:03:19 +0800
commit97fdefab064f63642fa3ece05b807d29b459df31 (patch)
treea058530023224f3e35b1783996f3530c80c04bc5 /Code/MixReg.R
brain: add python and R code to local repository.
Diffstat (limited to 'Code/MixReg.R')
-rw-r--r--Code/MixReg.R173
1 files changed, 173 insertions, 0 deletions
diff --git a/Code/MixReg.R b/Code/MixReg.R
new file mode 100644
index 0000000..a109430
--- /dev/null
+++ b/Code/MixReg.R
@@ -0,0 +1,173 @@
+k.lst <- c(3)
+target <- 'AT2G28507'
+id2 <- target
+tfs <- c('AT5G54230','AT5G65310','AT5G17300','AT1G14687','AT4G00730','AT3G13810','AT1G52880','AT1G74480','AT1G30490','AT4G26840','AT5G03150','AT5G66730','AT2G02450','AT1G49480','AT1G69780','AT3G04070','AT2G02070','AT1G03840','AT5G01380','AT3G61150','AT2G22430','AT2G25930','AT5G47370','AT4G28500','AT1G01060','AT5G03790','AT5G13180','AT1G28470','AT1G69490','AT1G55110','AT3G60580','AT4G36740','AT1G51220','AT1G19850','AT3G15500','AT2G02080','AT1G75240')
+conditions <- c('C0002000000374','C0002000000147 C0002000000148','C0002000000440','C0002000000220','C0002000000055','C0002000000221','C0002000000403 C0002000000404','C0002000000436','C0002000000421','C0001000011111 C0001000012119','C0002000000334','C0002000000217 C0002000000218','C0002000000019','C0002000000434','C0002000000126','C0002000000026','C0002000000332','C0002000000351','C0002000000313','C0002000000318','C0002000000149','C0001000007335','C0002000000316 C0002000000317','C0002000000448','C0002000000346','C0002000000348 C0002000000349','C0002000000044','C0002000000450','C0002000000405 C0002000000406','C0002000000333','C0002000000192','C0002000000143','C0002000000505','C0002000000352','C0002000000031','C0002000000331','C0002000000140')
+recent.edge <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
+jsonTPM.dir <- '/home/hui/network/v03/Data/history/expr/jsonTPM_20170424_154323'
+AGINAME_FILE <- '/home/hui/network/v03/Data/information/AGI-to-gene-names_v2.txt'
+
+post.translation <- function(x, y) {
+ mean.x <- mean(x)
+ sd.x <- sd(x)
+ index <- x > mean.x - sd.x & x < mean.x + sd.x
+ sd.y <- sd(y[index])
+ result <- list(value=ifelse(mean.x < 2.0, 0.0, (mean.x/max(x)) * sd.y * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
+}
+
+post.translation.2 <- function(x, y) {
+ # x is consititutively high while y varies a lot
+ mean.x <- mean(x)
+ sd.x <- max(sd(x), 1) # a number above 1
+ index <- x > mean.x - sd.x & x < mean.x + sd.x # points within the window +/- sd.x
+ sd.y <- quantile(y[index],0.85)-quantile(y[index],0.15) # dispersion of y within the window
+ sd.y.2 <- quantile(y,0.85)-quantile(y,0.15) # dispersion of all y
+ v.disp <- sd.y/max(1, sd.y.2) # how disperse y is within the windown, a number between 0 and 1
+ # value measure dispersion of y and percent of points within a window
+ result <- list(value=ifelse(mean.x < 2.0, 0.0, v.disp * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
+}
+
+post.translation.3 <- function(x, y) {
+ # x is consititutively high while y varies a lot
+ mean.x <- mean(x)
+ upper.percentile <- 0.85 # used for computing vertical dispersion
+ lowest.n <- 3 # number of points with lowest x values
+ min.mean.x <- max(2.0, quantile(x, 0.25)) # mean of x must be greater than this value
+ sd.x <- min(sd(x), 1) # a number between 0 and 1
+ index <- x > mean.x - sd.x & x < mean.x + sd.x # points within the window +/- sd.x
+ sd.y <- quantile(y[index],upper.percentile)-quantile(y[index],1-upper.percentile) # dispersion of y within the window
+ sd.y.2 <- quantile(y,upper.percentile)-quantile(y,1-upper.percentile) # dispersion of all y
+ v.disp <- sd.y/max(1, sd.y.2) # how disperse y is within the window, a number between 0 and 1
+
+ rst <- sort(x, index.return=T)
+ top.n <- sum(rst$x < 1)
+ top.n <- max(1, min(top.n, lowest.n))
+ small.y <- min(mean(y[rst$ix[1:top.n]]), mean(y[x<1])) # use the smaller value
+ small.y <- ifelse(is.nan(small.y)==T, 999, small.y)
+ # value measure dispersion of y and percent of points within a window
+ result <- list(valid=small.y, value=ifelse(mean.x < min.mean.x, 0.0, v.disp * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
+}
+
+in.component <- function(posterior, k) {
+ # posterior is an Nxk matrix, each row is a data points, and each col is prob belonging to a component
+ p = posterior[,k]
+ n = length(p)
+ index <- rep(F,n)
+ for (i in 1:n) {
+ if (p[i] > runif(1)) {
+ index[i] = T
+ }
+ }
+ result <- index
+}
+
+####### Read data #########################################
+CORR_THRESHOLD <- 0.7
+agi <- read.table(AGINAME_FILE, sep='\t', header=FALSE, row.names=1, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
+#######################################################
+library(mixtools)
+library(rjson)
+name2 <- agi[id2,1]
+result <- ''
+for (i in 1:length(tfs)) {
+ if (recent.edge[i] == 1) {
+ next
+ }
+ curr.date <- gsub('-','',Sys.Date())
+ id1 <- tfs[i]
+ name1 <- agi[id1,1]
+ cond <- conditions[i]
+
+ file.x <- paste(jsonTPM.dir, paste(id1, '.json', sep=''), sep='/')
+ if (!file.exists(file.x)) { next }
+ x <- as.data.frame(fromJSON(file = file.x))
+ x <- log(x+1)
+ rcond.x <- names(x)
+ x <- as.vector(t(x)) # convert it to a vector
+
+ file.y <- paste(jsonTPM.dir, paste(id2, '.json', sep=''), sep='/')
+ if (!file.exists(file.y)) { break }
+ y <- as.data.frame(fromJSON(file = file.y))
+ y <- log(y+1)
+ rcond.y <- names(y)
+ y <- as.vector(t(y)) # convert it to a vector
+
+ rna.sample.id <- rcond.x
+ if (all(rcond.x == rcond.y) == FALSE | id1 == id2) { # if the IDs in two json files do not match, or target is the same as tf, then ignore
+ next
+ }
+
+ MIN_SIZE <- min(100, max(10, ceiling(0.5 * length(x))))
+
+ index <- x < 0.01 | y < 0.01 # don't include data that is too small
+ x <- x[!index]
+ y <- y[!index]
+
+ if (length(x) < MIN_SIZE) {
+ next
+ }
+ r <- cor(x, y)
+ if (abs(r) >= CORR_THRESHOLD) {
+ s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\n', id2, name2, id1, name1, r, 'all', '.', cond, '.', curr.date)
+ result <- paste(result, s, sep='')
+ next # a good correlation is found using all experiments, so not necessary to look further
+ }
+
+ rna.sample.id <- rna.sample.id[!index] # this step is important to make the following index work
+
+ pos_r_max <- -2
+ pos_r_N <- 0
+ pos_r_index <- c()
+ pos_r_loglik <- -100000000
+
+ neg_r_max <- 2
+ neg_r_N <- 0
+ neg_r_index <- c()
+ neg_r_loglik <- -100000000
+
+ for (k in k.lst) {
+ em.out <- regmixEM(y, x, maxit=150, epsilon=1e-04, k=k)
+ for (j in seq(1,k,1)) {
+ index <- in.component(em.out$posterior, j)
+ size <- sum(index)
+ r <- cor(em.out$x[index,2], em.out$y[index])
+ if (!is.na(r) && r >= CORR_THRESHOLD && size >= MIN_SIZE && r > pos_r_max && size > pos_r_N) {
+ pos_r_max <- r
+ pos_r_N <- size
+ pos_r_index <- index
+ pos_r_loglik <- em.out$loglik
+ }
+ if (!is.na(r) && r <= -CORR_THRESHOLD && size >= MIN_SIZE && r < neg_r_max && size > neg_r_N) {
+ neg_r_max <- r
+ neg_r_N <- size
+ neg_r_index <- index
+ neg_r_loglik <- em.out$loglik
+ }
+ }
+ }
+ hit <- 0
+ if (pos_r_max > 0) { # has a good positive correlation
+ sub.cond <- paste(rna.sample.id[pos_r_index], collapse=' ')
+ s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, pos_r_max, 'mix', sub.cond, cond, pos_r_loglik, curr.date)
+ result <- paste(result, s, sep='')
+ hit <- hit + 1
+ }
+ if (neg_r_max < 0) { # has a good negative correlation
+ sub.cond <- paste(rna.sample.id[neg_r_index], collapse=' ')
+ s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, neg_r_max, 'mix', sub.cond, cond, neg_r_loglik, curr.date)
+ result <- paste(result, s, sep='')
+ hit <- hit + 1
+ }
+ if (hit == 0) {
+ t <- post.translation.3(x, y)
+ post.r <- t$percent
+ if (t$valid < quantile(y,0.25) & t$value > 0.69 & post.r >= 0.70 & length(t$index) > MIN_SIZE) {
+ sub.cond <- paste(rna.sample.id[t$index], collapse=' ')
+ s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\n', id2, name2, id1, name1, post.r, 'mix', sub.cond, cond, '.', curr.date)
+ result <- paste(result, s, sep='')
+ }
+ }
+}
+
+output.file <- paste('../Data/history/edges/one_target/edges.txt', id2, format(Sys.time(), '%b.%d.%Y.%X'), 'k3', sep='.')
+if (result != '') cat(result, file=output.file, sep='')