k.lst <- c(3)
target <- 'AT2G28507'
id2 <- target
tfs <- c('AT5G54230','AT5G65310','AT5G17300','AT1G14687','AT4G00730','AT3G13810','AT1G52880',
         'AT1G74480','AT1G30490','AT4G26840','AT5G03150','AT5G66730','AT2G02450','AT1G49480',
         'AT1G69780','AT3G04070','AT2G02070','AT1G03840','AT5G01380','AT3G61150','AT2G22430',
         'AT2G25930','AT5G47370','AT4G28500','AT1G01060','AT5G03790','AT5G13180','AT1G28470',
         'AT1G69490','AT1G55110','AT3G60580','AT4G36740','AT1G51220','AT1G19850','AT3G15500',
         'AT2G02080','AT1G75240')
conditions <- c('C0002000000374','C0002000000147 C0002000000148','C0002000000440','C0002000000220',
                'C0002000000055','C0002000000221','C0002000000403 C0002000000404','C0002000000436',
                'C0002000000421','C0001000011111 C0001000012119','C0002000000334',
                'C0002000000217 C0002000000218','C0002000000019','C0002000000434','C0002000000126',
                'C0002000000026','C0002000000332','C0002000000351','C0002000000313','C0002000000318',
                'C0002000000149','C0001000007335','C0002000000316 C0002000000317','C0002000000448',
                'C0002000000346','C0002000000348 C0002000000349','C0002000000044','C0002000000450',
                'C0002000000405 C0002000000406','C0002000000333','C0002000000192','C0002000000143',
                'C0002000000505','C0002000000352','C0002000000031','C0002000000331','C0002000000140')
recent.edge <- rep(0, length(tfs))  # one flag per TF; 1 = edge found recently, skip it below
jsonTPM.dir <- '/home/hui/network/v03/Data/history/expr/jsonTPM_20170424_154323'
AGINAME_FILE <- '/home/hui/network/v03/Data/information/AGI-to-gene-names_v2.txt'

post.translation <- function(x, y) {
  # Score how much y varies while x stays within one standard deviation of its mean.
  mean.x <- mean(x)
  sd.x <- sd(x)
  index <- x > mean.x - sd.x & x < mean.x + sd.x
  sd.y <- sd(y[index])
  result <- list(value=ifelse(mean.x < 2.0, 0.0, (mean.x/max(x)) * sd.y * sum(index)/length(index)),
                 index=which(index),
                 percent=sum(index)/length(index))
  result
}

post.translation.2 <- function(x, y) {
  # x is constitutively high while y varies a lot
  mean.x <- mean(x)
  sd.x <- max(sd(x), 1)  # a number of at least 1
  index <- x > mean.x - sd.x & x < mean.x + sd.x  # points within the window mean.x +/- sd.x
  sd.y <- quantile(y[index], 0.85) - quantile(y[index], 0.15)  # dispersion of y within the window
  sd.y.2 <- quantile(y, 0.85) - quantile(y, 0.15)              # dispersion of all y
  v.disp <- sd.y / max(1, sd.y.2)  # how dispersed y is within the window, a number between 0 and 1
  # value combines the dispersion of y with the fraction of points inside the window
  result <- list(value=ifelse(mean.x < 2.0, 0.0, v.disp * sum(index)/length(index)),
                 index=which(index),
                 percent=sum(index)/length(index))
  result
}

post.translation.3 <- function(x, y) {
  # x is constitutively high while y varies a lot
  mean.x <- mean(x)
  upper.percentile <- 0.85  # used for computing vertical dispersion
  lowest.n <- 3             # number of points with the lowest x values
  min.mean.x <- max(2.0, quantile(x, 0.25))  # mean of x must be greater than this value
  sd.x <- min(sd(x), 1)                      # a number between 0 and 1
  index <- x > mean.x - sd.x & x < mean.x + sd.x  # points within the window mean.x +/- sd.x
  sd.y <- quantile(y[index], upper.percentile) - quantile(y[index], 1 - upper.percentile)  # dispersion of y within the window
  sd.y.2 <- quantile(y, upper.percentile) - quantile(y, 1 - upper.percentile)              # dispersion of all y
  v.disp <- sd.y / max(1, sd.y.2)  # how dispersed y is within the window, a number between 0 and 1
  rst <- sort(x, index.return=TRUE)
  top.n <- sum(rst$x < 1)
  top.n <- max(1, min(top.n, lowest.n))
  small.y <- min(mean(y[rst$ix[1:top.n]]), mean(y[x < 1]))  # use the smaller value
  small.y <- ifelse(is.nan(small.y), 999, small.y)          # no sample with x < 1: use a large sentinel value
  # value combines the dispersion of y with the fraction of points inside the window
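  # Returned fields, as consumed by the main loop below (a descriptive sketch, not part of the
  # original comments):
  #   valid   - mean target expression in samples where the TF is essentially off (x < 1);
  #             must fall below the 25th percentile of y for an edge to be reported
  #   value   - v.disp weighted by the fraction of points in the window; 0 if mean.x is too small
  #   index   - positions of the samples inside the TF expression window
  #   percent - fraction of samples inside the window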
  result <- list(valid=small.y,
                 value=ifelse(mean.x < min.mean.x, 0.0, v.disp * sum(index)/length(index)),
                 index=which(index),
                 percent=sum(index)/length(index))
  result
}

in.component <- function(posterior, k) {
  # posterior is an N x k matrix; each row is a data point and each column is the
  # probability of belonging to a component
  p <- posterior[, k]
  n <- length(p)
  index <- rep(FALSE, n)
  for (i in 1:n) {
    if (p[i] > runif(1)) {
      index[i] <- TRUE
    }
  }
  index
}

####### Read data #########################################
CORR_THRESHOLD <- 0.7
agi <- read.table(AGINAME_FILE, sep='\t', header=FALSE, row.names=1,
                  stringsAsFactors=FALSE)  # AGINAME_FILE cannot contain quotes
###########################################################

library(mixtools)
library(rjson)

name2 <- agi[id2, 1]
result <- ''
for (i in 1:length(tfs)) {
  if (recent.edge[i] == 1) {
    next
  }
  curr.date <- gsub('-', '', Sys.Date())
  id1 <- tfs[i]
  name1 <- agi[id1, 1]
  cond <- conditions[i]

  file.x <- paste(jsonTPM.dir, paste(id1, '.json', sep=''), sep='/')
  if (!file.exists(file.x)) {
    next
  }
  x <- as.data.frame(fromJSON(file = file.x))
  x <- log(x + 1)
  rcond.x <- names(x)
  x <- as.vector(t(x))  # convert it to a vector

  file.y <- paste(jsonTPM.dir, paste(id2, '.json', sep=''), sep='/')
  if (!file.exists(file.y)) {
    break  # the target file is the same for every TF, so stop the loop if it is missing
  }
  y <- as.data.frame(fromJSON(file = file.y))
  y <- log(y + 1)
  rcond.y <- names(y)
  y <- as.vector(t(y))  # convert it to a vector

  rna.sample.id <- rcond.x
  if (!all(rcond.x == rcond.y) || id1 == id2) {
    # if the sample IDs in the two JSON files do not match, or the target is the same as the TF, skip
    next
  }

  MIN_SIZE <- min(100, max(10, ceiling(0.5 * length(x))))
  index <- x < 0.01 | y < 0.01  # exclude samples where either value is too small
  x <- x[!index]
  y <- y[!index]
  if (length(x) < MIN_SIZE) {
    next
  }

  r <- cor(x, y)
  if (abs(r) >= CORR_THRESHOLD) {
    s <- sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\n',
                 id2, name2, id1, name1, r, 'all', '.', cond, '.', curr.date)
    result <- paste(result, s, sep='')
    next  # a good correlation is found using all experiments, so no need to look further
  }

  rna.sample.id <- rna.sample.id[!index]  # this step is important to make the following indexing work

  pos_r_max <- -2
  pos_r_N <- 0
  pos_r_index <- c()
  pos_r_loglik <- -100000000
  neg_r_max <- 2
  neg_r_N <- 0
  neg_r_index <- c()
  neg_r_loglik <- -100000000

  for (k in k.lst) {
    em.out <- regmixEM(y, x, maxit=150, epsilon=1e-04, k=k)
    for (j in seq(1, k, 1)) {
      index <- in.component(em.out$posterior, j)
      size <- sum(index)
      r <- cor(em.out$x[index, 2], em.out$y[index])
      if (!is.na(r) && r >= CORR_THRESHOLD && size >= MIN_SIZE && r > pos_r_max && size > pos_r_N) {
        pos_r_max <- r
        pos_r_N <- size
        pos_r_index <- index
        pos_r_loglik <- em.out$loglik
      }
      if (!is.na(r) && r <= -CORR_THRESHOLD && size >= MIN_SIZE && r < neg_r_max && size > neg_r_N) {
        neg_r_max <- r
        neg_r_N <- size
        neg_r_index <- index
        neg_r_loglik <- em.out$loglik
      }
    }
  }

  hit <- 0
  if (pos_r_max > 0) {  # has a good positive correlation
    sub.cond <- paste(rna.sample.id[pos_r_index], collapse=' ')
    s <- sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%4.2f\t%s\n',
                 id2, name2, id1, name1, pos_r_max, 'mix', sub.cond, cond, pos_r_loglik, curr.date)
    result <- paste(result, s, sep='')
    hit <- hit + 1
  }
  if (neg_r_max < 0) {  # has a good negative correlation
    sub.cond <- paste(rna.sample.id[neg_r_index], collapse=' ')
    s <- sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%4.2f\t%s\n',
                 id2, name2, id1, name1, neg_r_max, 'mix', sub.cond, cond, neg_r_loglik, curr.date)
    result <- paste(result, s, sep='')
    hit <- hit + 1
  }

  if (hit == 0) {
    t <- post.translation.3(x, y)
    post.r <- t$percent
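    # Report the pair only when the post.translation.3 score suggests the TF is constitutively
    # expressed while the target varies: the target is low in samples where the TF is
    # essentially off (t$valid), the target disperses widely within the TF window (t$value),
    # and most samples fall inside that window (post.r, t$index).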
    if (t$valid < quantile(y, 0.25) & t$value > 0.69 & post.r >= 0.70 & length(t$index) > MIN_SIZE) {
      sub.cond <- paste(rna.sample.id[t$index], collapse=' ')
      s <- sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\n',
                   id2, name2, id1, name1, post.r, 'mix', sub.cond, cond, '.', curr.date)
      result <- paste(result, s, sep='')
    }
  }
}

output.file <- paste('../Data/history/edges/one_target/edges.txt', id2,
                     format(Sys.time(), '%b.%d.%Y.%X'), 'k3', sep='.')
if (result != '') cat(result, file=output.file, sep='')
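# Output format, as produced by the sprintf calls above: each line is tab-separated with
# target AGI and gene name, TF AGI and gene name, correlation coefficient, method ('all' for
# all samples, 'mix' for a mixture component or the post-translation test), RNA-seq sample IDs
# supporting the edge (or '.'), binding condition IDs, log-likelihood of the mixture fit
# (or '.'), and the date the edge was computed.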