# Last modified on 8 Aug 2019

TISSUE.FILE <- '../Data/information/experiment.and.tissue.txt'
DATA.FILE   <- '../Data/history/expr/TPM.txt'
TEMP.DIR    <- '../Data/temp'
TARGET.FILE <- '../Data/temp/all_targets.txt'
TF.FILE     <- '../Data/temp/all_tfs.txt'
tau         <- 0.60


# Make sure we have required files
if (! file.exists(TISSUE.FILE)) {
   stop(sprintf('[correlation_per_tissue.R] Unable to find %s', TISSUE.FILE))
}

if (! file.exists(DATA.FILE)) {
   stop(sprintf('[correlation_per_tissue.R] Unable to find %s', DATA.FILE))
}

if (! dir.exists(TEMP.DIR)) {
   stop(sprintf('[correlation_per_tissue.R] Unable to find directory %s', TEMP.DIR))
}

if (! file.exists(TARGET.FILE)) {
   stop(sprintf('[correlation_per_tissue.R] Unable to find %s', TARGET.FILE))
}

if (! file.exists(TF.FILE)) {
   stop(sprintf('[correlation_per_tissue.R] Unable to find %s', TF.FILE))
}


X0            <- read.table(DATA.FILE, header=TRUE, check.names=FALSE)
all.id        <- X0$gene_id
X0$gene_id    <- NULL   # remove column gene_id
row.names(X0) <- all.id # add row names
all_genes     <- rownames(X0)

tissue        <- read.table(TISSUE.FILE, header=TRUE, check.names=FALSE, sep='\t')
labels        <- as.vector(tissue$suggested.tissue)
labels        <- unlist(lapply(labels, function(x) {e<-regexpr("\\.", x)[1]; if (e > 0) {y<-substr(x, 1, e-1)} else {x} })) # remove subcategories
unique.label  <- unique(labels)

targets       <- read.table(TARGET.FILE, header=FALSE)
tfs           <- read.table(TF.FILE, header=FALSE)
targets       <- as.vector(targets$V1)
tfs           <- as.vector(tfs$V1)


##############################################################
get.index <- function(X, tissue.matrix, tissue.name) {
    labels <- as.vector(tissue.matrix$suggested.tissue)
    labels <- unlist(lapply(labels, function(x) {e<-regexpr("\\.", x)[1]; if (e > 0) {y<-substr(x, 1, e-1)} else {x} })) # remove subcategories
    index <- c()
    count <- 1
    for (rseqid in colnames(X)) {
        i <- which(as.vector(tissue.matrix$run.id) == rseqid)
	if (length(i) != 0) {
   	    suggested.tissue.name <- labels[i]
	    if (tissue.name == suggested.tissue.name) {
	        index <- c(index, count)
	    }
	}
	count <- count + 1
    }
    index
}
##############################################################


# for each tissue type, get a correlation matrix
for (ul in unique.label) {
    index.rnaseq <- get.index(X0, tissue, ul)
    if (length(index.rnaseq) >= 50) {
        OUTPUT.FILE <- paste('../Data/temp/edges.txt.simple.correlation.tissue', ul, 'txt', sep='.')
        
        X           <- as.matrix(X0[, index.rnaseq])
        sd.1        <- apply(X, 1, sd) # sd of each row
        s0          <- apply(X, 1, function(c) sum(c==0)) # number of zeros in each row
        sd.tau      <- (quantile(sd.1, na.rm=TRUE)[1] +  quantile(sd.1, na.rm=TRUE)[2]) / 2.0 # min SD
        good        <- sd.1 > max(sd.tau, 0.05)
        tf_good     <- which( good & (all_genes %in% tfs) == T )
        target_good <- which( good & (all_genes %in% targets) == T )

        # Compute correlation coefficient
        X <- log(X + 1)
        X[X<0.01] <- NA
        if (length(tf_good) > 1) {
            c <- cor(t(X[target_good,]), t(X[tf_good,]), use='pairwise.complete.obs')
	} else {
            c <- cor(t(X[target_good,]), t(X[c(tf_good, tf_good), ]), use='pairwise.complete.obs')            
        }
        index <- !is.na(c) & abs(c) >= tau &  abs(c) <= 0.99
        row_names <- rownames(c)
        col_names <- colnames(c)
        result <- data.frame(row = row_names[row(c)[index]], col = col_names[col(c)[index]], r = c[index], tissue=rep(ul, sum(index)), numrnaseqids=rep(length(index.rnaseq), sum(index)))

        # write results
        write.table(result, OUTPUT.FILE, col.names=F, row.names=F, sep='\t', quote=F)
    }
}