From 97fdefab064f63642fa3ece05b807d29b459df31 Mon Sep 17 00:00:00 2001 From: Hui Lan Date: Wed, 4 Dec 2019 19:03:19 +0800 Subject: brain: add python and R code to local repository. --- Code/MixReg.R | 173 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 Code/MixReg.R (limited to 'Code/MixReg.R') diff --git a/Code/MixReg.R b/Code/MixReg.R new file mode 100644 index 0000000..a109430 --- /dev/null +++ b/Code/MixReg.R @@ -0,0 +1,173 @@ +k.lst <- c(3) +target <- 'AT2G28507' +id2 <- target +tfs <- c('AT5G54230','AT5G65310','AT5G17300','AT1G14687','AT4G00730','AT3G13810','AT1G52880','AT1G74480','AT1G30490','AT4G26840','AT5G03150','AT5G66730','AT2G02450','AT1G49480','AT1G69780','AT3G04070','AT2G02070','AT1G03840','AT5G01380','AT3G61150','AT2G22430','AT2G25930','AT5G47370','AT4G28500','AT1G01060','AT5G03790','AT5G13180','AT1G28470','AT1G69490','AT1G55110','AT3G60580','AT4G36740','AT1G51220','AT1G19850','AT3G15500','AT2G02080','AT1G75240') +conditions <- c('C0002000000374','C0002000000147 C0002000000148','C0002000000440','C0002000000220','C0002000000055','C0002000000221','C0002000000403 C0002000000404','C0002000000436','C0002000000421','C0001000011111 C0001000012119','C0002000000334','C0002000000217 C0002000000218','C0002000000019','C0002000000434','C0002000000126','C0002000000026','C0002000000332','C0002000000351','C0002000000313','C0002000000318','C0002000000149','C0001000007335','C0002000000316 C0002000000317','C0002000000448','C0002000000346','C0002000000348 C0002000000349','C0002000000044','C0002000000450','C0002000000405 C0002000000406','C0002000000333','C0002000000192','C0002000000143','C0002000000505','C0002000000352','C0002000000031','C0002000000331','C0002000000140') +recent.edge <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0) +jsonTPM.dir <- '/home/hui/network/v03/Data/history/expr/jsonTPM_20170424_154323' +AGINAME_FILE <- '/home/hui/network/v03/Data/information/AGI-to-gene-names_v2.txt' + +post.translation <- function(x, y) { + mean.x <- mean(x) + sd.x <- sd(x) + index <- x > mean.x - sd.x & x < mean.x + sd.x + sd.y <- sd(y[index]) + result <- list(value=ifelse(mean.x < 2.0, 0.0, (mean.x/max(x)) * sd.y * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index)) +} + +post.translation.2 <- function(x, y) { + # x is consititutively high while y varies a lot + mean.x <- mean(x) + sd.x <- max(sd(x), 1) # a number above 1 + index <- x > mean.x - sd.x & x < mean.x + sd.x # points within the window +/- sd.x + sd.y <- quantile(y[index],0.85)-quantile(y[index],0.15) # dispersion of y within the window + sd.y.2 <- quantile(y,0.85)-quantile(y,0.15) # dispersion of all y + v.disp <- sd.y/max(1, sd.y.2) # how disperse y is within the windown, a number between 0 and 1 + # value measure dispersion of y and percent of points within a window + result <- list(value=ifelse(mean.x < 2.0, 0.0, v.disp * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index)) +} + +post.translation.3 <- function(x, y) { + # x is consititutively high while y varies a lot + mean.x <- mean(x) + upper.percentile <- 0.85 # used for computing vertical dispersion + lowest.n <- 3 # number of points with lowest x values + min.mean.x <- max(2.0, quantile(x, 0.25)) # mean of x must be greater than this value + sd.x <- min(sd(x), 1) # a number between 0 and 1 + index <- x > mean.x - sd.x & x < mean.x + sd.x # points within the window +/- sd.x + sd.y <- quantile(y[index],upper.percentile)-quantile(y[index],1-upper.percentile) # dispersion of y within the window + sd.y.2 <- quantile(y,upper.percentile)-quantile(y,1-upper.percentile) # dispersion of all y + v.disp <- sd.y/max(1, sd.y.2) # how disperse y is within the window, a number between 0 and 1 + + rst <- sort(x, index.return=T) + top.n <- sum(rst$x < 1) + top.n <- max(1, min(top.n, lowest.n)) + small.y <- min(mean(y[rst$ix[1:top.n]]), mean(y[x<1])) # use the smaller value + small.y <- ifelse(is.nan(small.y)==T, 999, small.y) + # value measure dispersion of y and percent of points within a window + result <- list(valid=small.y, value=ifelse(mean.x < min.mean.x, 0.0, v.disp * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index)) +} + +in.component <- function(posterior, k) { + # posterior is an Nxk matrix, each row is a data points, and each col is prob belonging to a component + p = posterior[,k] + n = length(p) + index <- rep(F,n) + for (i in 1:n) { + if (p[i] > runif(1)) { + index[i] = T + } + } + result <- index +} + +####### Read data ######################################### +CORR_THRESHOLD <- 0.7 +agi <- read.table(AGINAME_FILE, sep='\t', header=FALSE, row.names=1, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes +####################################################### +library(mixtools) +library(rjson) +name2 <- agi[id2,1] +result <- '' +for (i in 1:length(tfs)) { + if (recent.edge[i] == 1) { + next + } + curr.date <- gsub('-','',Sys.Date()) + id1 <- tfs[i] + name1 <- agi[id1,1] + cond <- conditions[i] + + file.x <- paste(jsonTPM.dir, paste(id1, '.json', sep=''), sep='/') + if (!file.exists(file.x)) { next } + x <- as.data.frame(fromJSON(file = file.x)) + x <- log(x+1) + rcond.x <- names(x) + x <- as.vector(t(x)) # convert it to a vector + + file.y <- paste(jsonTPM.dir, paste(id2, '.json', sep=''), sep='/') + if (!file.exists(file.y)) { break } + y <- as.data.frame(fromJSON(file = file.y)) + y <- log(y+1) + rcond.y <- names(y) + y <- as.vector(t(y)) # convert it to a vector + + rna.sample.id <- rcond.x + if (all(rcond.x == rcond.y) == FALSE | id1 == id2) { # if the IDs in two json files do not match, or target is the same as tf, then ignore + next + } + + MIN_SIZE <- min(100, max(10, ceiling(0.5 * length(x)))) + + index <- x < 0.01 | y < 0.01 # don't include data that is too small + x <- x[!index] + y <- y[!index] + + if (length(x) < MIN_SIZE) { + next + } + r <- cor(x, y) + if (abs(r) >= CORR_THRESHOLD) { + s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\n', id2, name2, id1, name1, r, 'all', '.', cond, '.', curr.date) + result <- paste(result, s, sep='') + next # a good correlation is found using all experiments, so not necessary to look further + } + + rna.sample.id <- rna.sample.id[!index] # this step is important to make the following index work + + pos_r_max <- -2 + pos_r_N <- 0 + pos_r_index <- c() + pos_r_loglik <- -100000000 + + neg_r_max <- 2 + neg_r_N <- 0 + neg_r_index <- c() + neg_r_loglik <- -100000000 + + for (k in k.lst) { + em.out <- regmixEM(y, x, maxit=150, epsilon=1e-04, k=k) + for (j in seq(1,k,1)) { + index <- in.component(em.out$posterior, j) + size <- sum(index) + r <- cor(em.out$x[index,2], em.out$y[index]) + if (!is.na(r) && r >= CORR_THRESHOLD && size >= MIN_SIZE && r > pos_r_max && size > pos_r_N) { + pos_r_max <- r + pos_r_N <- size + pos_r_index <- index + pos_r_loglik <- em.out$loglik + } + if (!is.na(r) && r <= -CORR_THRESHOLD && size >= MIN_SIZE && r < neg_r_max && size > neg_r_N) { + neg_r_max <- r + neg_r_N <- size + neg_r_index <- index + neg_r_loglik <- em.out$loglik + } + } + } + hit <- 0 + if (pos_r_max > 0) { # has a good positive correlation + sub.cond <- paste(rna.sample.id[pos_r_index], collapse=' ') + s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, pos_r_max, 'mix', sub.cond, cond, pos_r_loglik, curr.date) + result <- paste(result, s, sep='') + hit <- hit + 1 + } + if (neg_r_max < 0) { # has a good negative correlation + sub.cond <- paste(rna.sample.id[neg_r_index], collapse=' ') + s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, neg_r_max, 'mix', sub.cond, cond, neg_r_loglik, curr.date) + result <- paste(result, s, sep='') + hit <- hit + 1 + } + if (hit == 0) { + t <- post.translation.3(x, y) + post.r <- t$percent + if (t$valid < quantile(y,0.25) & t$value > 0.69 & post.r >= 0.70 & length(t$index) > MIN_SIZE) { + sub.cond <- paste(rna.sample.id[t$index], collapse=' ') + s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\n', id2, name2, id1, name1, post.r, 'mix', sub.cond, cond, '.', curr.date) + result <- paste(result, s, sep='') + } + } +} + +output.file <- paste('../Data/history/edges/one_target/edges.txt', id2, format(Sys.time(), '%b.%d.%Y.%X'), 'k3', sep='.') +if (result != '') cat(result, file=output.file, sep='') -- cgit v1.2.1