k.lst <- c(3)
target <- 'AT2G28507'
id2 <- target
tfs <- c('AT5G54230','AT5G65310','AT5G17300','AT1G14687','AT4G00730','AT3G13810','AT1G52880','AT1G74480','AT1G30490','AT4G26840','AT5G03150','AT5G66730','AT2G02450','AT1G49480','AT1G69780','AT3G04070','AT2G02070','AT1G03840','AT5G01380','AT3G61150','AT2G22430','AT2G25930','AT5G47370','AT4G28500','AT1G01060','AT5G03790','AT5G13180','AT1G28470','AT1G69490','AT1G55110','AT3G60580','AT4G36740','AT1G51220','AT1G19850','AT3G15500','AT2G02080','AT1G75240')
conditions <- c('C0002000000374','C0002000000147 C0002000000148','C0002000000440','C0002000000220','C0002000000055','C0002000000221','C0002000000403 C0002000000404','C0002000000436','C0002000000421','C0001000011111 C0001000012119','C0002000000334','C0002000000217 C0002000000218','C0002000000019','C0002000000434','C0002000000126','C0002000000026','C0002000000332','C0002000000351','C0002000000313','C0002000000318','C0002000000149','C0001000007335','C0002000000316 C0002000000317','C0002000000448','C0002000000346','C0002000000348 C0002000000349','C0002000000044','C0002000000450','C0002000000405 C0002000000406','C0002000000333','C0002000000192','C0002000000143','C0002000000505','C0002000000352','C0002000000031','C0002000000331','C0002000000140')
recent.edge <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
jsonTPM.dir <- '/home/hui/network/v03/Data/history/expr/jsonTPM_20170424_154323'
AGINAME_FILE   <- '/home/hui/network/v03/Data/information/AGI-to-gene-names_v2.txt'

post.translation <- function(x, y) {
  mean.x <- mean(x)
  sd.x  <- sd(x)
  index <- x > mean.x - sd.x & x < mean.x + sd.x
  sd.y <- sd(y[index])
  result <- list(value=ifelse(mean.x < 2.0, 0.0, (mean.x/max(x)) * sd.y * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
}

post.translation.2 <- function(x, y) {
  # x is consititutively high while y varies a lot
  mean.x <- mean(x)
  sd.x  <- max(sd(x), 1) # a number above 1
  index <- x > mean.x - sd.x & x < mean.x + sd.x # points within the window +/- sd.x
  sd.y <- quantile(y[index],0.85)-quantile(y[index],0.15) # dispersion of y within the window
  sd.y.2 <- quantile(y,0.85)-quantile(y,0.15) # dispersion of all y
  v.disp <- sd.y/max(1, sd.y.2) # how disperse y is within the windown, a number between 0 and 1
  # value measure dispersion of y and percent of points within a window
  result <- list(value=ifelse(mean.x < 2.0, 0.0, v.disp * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
}

post.translation.3 <- function(x, y) {
  # x is consititutively high while y varies a lot
  mean.x <- mean(x)
  upper.percentile <- 0.85 # used for computing vertical dispersion
  lowest.n <- 3 # number of points with lowest x values
  min.mean.x <- max(2.0, quantile(x, 0.25)) # mean of x must be greater than this value
  sd.x  <- min(sd(x), 1) # a number between 0 and 1
  index <- x > mean.x - sd.x & x < mean.x + sd.x # points within the window +/- sd.x
  sd.y <- quantile(y[index],upper.percentile)-quantile(y[index],1-upper.percentile) # dispersion of y within the window
  sd.y.2 <- quantile(y,upper.percentile)-quantile(y,1-upper.percentile) # dispersion of all y
  v.disp <- sd.y/max(1, sd.y.2) # how disperse y is within the window, a number between 0 and 1

  rst <- sort(x, index.return=T)
  top.n <- sum(rst$x < 1)
  top.n <- max(1, min(top.n, lowest.n))
  small.y <- min(mean(y[rst$ix[1:top.n]]), mean(y[x<1])) # use the smaller value
  small.y <- ifelse(is.nan(small.y)==T, 999, small.y)
  # value measure dispersion of y and percent of points within a window
  result <- list(valid=small.y, value=ifelse(mean.x < min.mean.x, 0.0, v.disp * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
}

in.component <- function(posterior, k) {
  # posterior is an Nxk matrix, each row is a data points, and each col is prob belonging to a component
  p = posterior[,k]
  n = length(p)
  index <- rep(F,n)
  for (i in 1:n) {
    if (p[i] > runif(1)) {
      index[i] = T
    }
  }
  result <- index
}

####### Read data #########################################
CORR_THRESHOLD <- 0.7
agi        <- read.table(AGINAME_FILE, sep='\t', header=FALSE, row.names=1, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
#######################################################
library(mixtools)
library(rjson)
name2 <- agi[id2,1]
result <- ''
for (i in 1:length(tfs)) {
    if (recent.edge[i] == 1) {
        next
    }
    curr.date <- gsub('-','',Sys.Date())
    id1 <- tfs[i]
    name1 <- agi[id1,1]
    cond <- conditions[i]

    file.x <- paste(jsonTPM.dir, paste(id1, '.json', sep=''), sep='/')
    if (!file.exists(file.x)) { next }
    x <- as.data.frame(fromJSON(file = file.x))
    x <- log(x+1)
    rcond.x <- names(x)
    x <- as.vector(t(x)) # convert it to a vector

    file.y <- paste(jsonTPM.dir, paste(id2, '.json', sep=''), sep='/')
    if (!file.exists(file.y)) { break }
    y <- as.data.frame(fromJSON(file = file.y))
    y <- log(y+1)
    rcond.y <- names(y)
    y <- as.vector(t(y)) # convert it to a vector

    rna.sample.id <- rcond.x
    if (all(rcond.x == rcond.y) == FALSE | id1 == id2) { # if the IDs in two json files do not match, or target is the same as tf, then ignore
       next
    }

    MIN_SIZE <- min(100, max(10, ceiling(0.5 * length(x))))

    index <- x < 0.01 | y < 0.01 # don't include data that is too small
    x <- x[!index]
    y <- y[!index]

    if (length(x) < MIN_SIZE) {
       next
    }
    r <- cor(x, y)
    if (abs(r) >= CORR_THRESHOLD) {
        s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\n', id2, name2, id1, name1, r, 'all', '.', cond, '.', curr.date)
        result <- paste(result, s, sep='')
        next  # a good correlation is found using all experiments, so not necessary to look further
    }

    rna.sample.id <- rna.sample.id[!index] # this step is important to make the following index work

    pos_r_max   <- -2
    pos_r_N     <- 0
    pos_r_index <- c()
    pos_r_loglik <- -100000000

    neg_r_max   <- 2
    neg_r_N     <- 0
    neg_r_index <- c()
    neg_r_loglik <- -100000000

    for (k in k.lst) {
        em.out <- regmixEM(y, x, maxit=150, epsilon=1e-04, k=k)
        for (j in seq(1,k,1)) {
            index <- in.component(em.out$posterior, j)
            size <- sum(index)
            r <- cor(em.out$x[index,2], em.out$y[index])
            if (!is.na(r) && r >= CORR_THRESHOLD && size >= MIN_SIZE && r > pos_r_max && size > pos_r_N) {
                pos_r_max <- r
                pos_r_N   <- size
                pos_r_index <- index
                pos_r_loglik <- em.out$loglik
            }
            if (!is.na(r) && r <= -CORR_THRESHOLD && size >= MIN_SIZE && r < neg_r_max && size > neg_r_N) {
                neg_r_max <- r
                neg_r_N   <- size
                neg_r_index <- index
                neg_r_loglik <- em.out$loglik
            }
        }
    }
    hit <- 0
    if (pos_r_max > 0) { # has a good positive correlation
        sub.cond <- paste(rna.sample.id[pos_r_index], collapse=' ')
        s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, pos_r_max, 'mix', sub.cond, cond, pos_r_loglik, curr.date)
        result <- paste(result, s, sep='')
        hit <- hit + 1
    }
    if (neg_r_max < 0) { # has a good negative correlation
        sub.cond <- paste(rna.sample.id[neg_r_index], collapse=' ')
        s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, neg_r_max, 'mix', sub.cond, cond, neg_r_loglik, curr.date)
        result <- paste(result, s, sep='')
        hit <- hit + 1
    }
    if (hit == 0) {
        t <- post.translation.3(x, y)
        post.r <- t$percent
        if (t$valid < quantile(y,0.25) & t$value > 0.69 & post.r >= 0.70 & length(t$index) > MIN_SIZE) {
          sub.cond <- paste(rna.sample.id[t$index], collapse=' ')
          s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\n', id2, name2, id1, name1, post.r, 'mix', sub.cond, cond, '.', curr.date)
          result <- paste(result, s, sep='')
        }
    }
}

output.file <- paste('../Data/history/edges/one_target/edges.txt', id2, format(Sys.time(), '%b.%d.%Y.%X'), 'k3', sep='.')
if (result != '') cat(result, file=output.file, sep='')