# Last modified on 7 Agu 2019 by Hui Lan @ Jinhua
#DATA.FILE      <- '../Data/history/expr/TPM.txt.3130'
DATA.FILE      <- '../Data/history/expr/TPM.txt'
TARGET.TF.FILE <- '../Data/information/target_tf.txt'
AGINAME.FILE   <- '../Data/information/AGI-to-gene-names_v2.txt'
ONE.TARGET.DIR <- '../Data/history/edges/one_target'

# Make sure we have required files and directory 
if (! file.exists(DATA.FILE)) {
   stop(sprintf('[wedge.R] Unable to find %s', DATA.FILE))
}

if (! file.exists(TARGET.TF.FILE)) {
   stop(sprintf('[wedge.R] Unable to find %s', TARGET.TF.FILE))
}

if (! file.exists(AGINAME.FILE)) {
   stop(sprintf('[wedge.R] Unable to find %s', AGINAME.FILE))
}


if (! dir.exists(ONE.TARGET.DIR)) {
   stop(sprintf('[wedge.R] Unable to find directory %s', ONE.TARGET.DIR))
}


r.tau        <- 0.60

cat(sprintf('Read %s\n', DATA.FILE))
X            <- read.table(DATA.FILE, header=TRUE, check.names=FALSE)
all.id        <- X$gene_id
X$gene_id     <- NULL   # remove column gene_id
row.names(X)  <- all.id # add row names
all.genes     <- rownames(X)

cat(sprintf('Read %s\n', AGINAME.FILE))
#agi        <- read.table(AGINAME.FILE, sep='\t', header=FALSE, row.names=1, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
agi        <- read.table(AGINAME.FILE, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes

cat(sprintf('Read %s\n', TARGET.TF.FILE))
target.tf <- read.table(TARGET.TF.FILE, header=FALSE, check.names=FALSE, sep='\t')
total.pair <- dim(target.tf)[1]

###########################################################################
post.translation.4 <- function(x, y) {
    mx = mean(x)
    index = (x > mx - 0.5) & (x < mx + 0.5)
    slope = max(y[index])/mx
    v = c(-slope, 1)
    xy = as.matrix(cbind(x,y))
    z = xy %*% v
    index0 = which(z <= 0) # points below the wedge
    index1 = which(z > 0)  # points above the wedge
    index2 = which(x <= 0.1) # x has low value, then y is expected to have low value too
    if (length(index2) > 0) {
        q = quantile(y[index2], 0.9)
        m = mean(y[index2])
    } else {
        q = 0.0
        m = 0.0
    }
    index3 = which(x < 1)
    if (length(index3) > 0) {
        m = mean(y[index3])
    } else {
        m = 0.0
    }
    # for a scatterplot to be considered a wedge shape, percent>0.90,q < 1 and
    # m < slope, disp.x < disp.y
    result <- list(below=index0, upper=index1, percent=length(index0)/length(x), q=q, m=m, slope=slope, disp.x=sd(x)/mean(x), disp.y=sd(y)/mean(y))
}


make.data <- function(slope, n) {
    x=abs(3.0 + 1*rnorm(n))
    y=abs(3.0 + 1*rnorm(n))
    v = c(-slope, 1)
    xy = as.matrix(cbind(x,y))
    z = xy %*% v
    index = which(z <= 0)
    result <- list(x=x[index], y=y[index])
}
###########################################################################

cat(sprintf('Go through pairs looking for wedge shapes ..\n'))

output.file <- paste('../Data/history/edges/one_target/edges.txt', 'wedge', format(Sys.time(), "%b.%d.%Y.%H%M%S"), sep='.')
f <- file(output.file, 'w')
	    
for (i in 1:total.pair) {
    id1 <- as.vector(target.tf[i,2]) # tf
    id2 <- as.vector(target.tf[i,1]) # target

    all.in <- id1 %in% all.genes & id2 %in% all.genes
    if (!all.in) {
        next
    }
    
    x <- X[id1,]
    y <- X[id2,]
    x <- log(x+1)
    y <- log(y+1)
    x <- t(x)
    y <- t(y)
    na.ratio <- max(sum(is.na(x))/length(x), sum(is.na(y))/length(y))
    index <- x < 0.01 | y < 0.01 | na.ratio > 0.5 # make sure very small values are not included
    x <- x[!index, 1, drop=FALSE]
    y <- y[!index, 1, drop=FALSE]

    if (dim(x)[1] < 50) {
        next
    }

    # We will not consider wedge shape if the correlation coefficient is large enough.
    if (abs(cor(x, y)) < r.tau) {
        result <- post.translation.4(x, y)
        if (result$percent > 0.95 & result$q < 0.25 & result$m < 1.0 & result$disp.y > 1.2 * result$disp.x) {
            #name1 <- agi[id1,1]
            #name2 <- agi[id2,1]
	    name1 <- agi$V2[which(agi$V1 == id1)]
	    name2 <- agi$V2[which(agi$V1 == id2)]	    
            max.r <- max(r.tau, result$percent * exp(-max(result$q, result$m)))
            curr.date <- gsub('-','',Sys.Date())
            loglik <- '-1001.0'
            rna.sample <- row.names(x)[result$below] # below the diagonal line
            #rna.sample.size <- length(rna.sample)
            #rna.sample.2 <- sample(rna.sample, ceiling(rna.sample.size^0.7)) # to save space, keep only a fraction of the rnaseq sample IDs
            sub.cond <- paste(rna.sample, collapse=' ')
	    sub.cond.length <- length(rna.sample)
            cond <- as.vector(target.tf[i,3])
            result2 <- sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, max.r, 'mix', sub.cond.length, cond, loglik, curr.date, max.r, 'wedge')

            cat(result2, file=f, sep='')
        }
    }
}

close(f)