# Last modified on 9 Aug 2019
# Last modified on 11 Aug 2019
# Last modified on 9 Nov 2022
# Purpose: divide the samples into a fixed number of groups and compute
#   correlation on each group.  The optimal number of groups is
#   determined using tissue labels, to maximize the agreement between
#   the groups and the tissue labels.  More specifically, within each
#   group there should be as few distinct tissues as possible.

TISSUE.FILE    <- '../Data/information/experiment.and.tissue.2.txt'
DATA.FILE      <- '../Data/history/expr/TPM.txt'
TARGET.TF.FILE <- '../Data/information/target_tf.txt'
AGINAME.FILE   <- '../Data/information/AGI-to-gene-names_v2.txt'

r.tau       <- 0.50
min.cluster <- 3    # min number of clusters

if (!file.exists(TISSUE.FILE)) {
    stop(sprintf('The file %s does not exist, so I cannot compute a fixed number of sample groups.', TISSUE.FILE))
}
if (!file.exists(DATA.FILE)) {
    stop(sprintf('[correlation_per_group.R] Unable to find %s', DATA.FILE))
}
if (!file.exists(TARGET.TF.FILE)) {
    stop(sprintf('[correlation_per_group.R] Unable to find %s', TARGET.TF.FILE))
}
if (!file.exists(AGINAME.FILE)) {
    stop(sprintf('[correlation_per_group.R] Unable to find %s', AGINAME.FILE))
}

cat(sprintf('Read %s\n', DATA.FILE))
X <- read.table(DATA.FILE, header=TRUE, check.names=FALSE)
all.id <- X$gene_id
X$gene_id <- NULL        # remove column gene_id
row.names(X) <- all.id   # add row names
all.genes <- rownames(X)

# At least this many samples are needed for computing a correlation coefficient.
# r=0.6 on 50 samples has a two-tailed p-value of 0.000004 (http://vassarstats.net/tabs_r.html).
min.sample <- max(50, ceiling(sqrt(dim(X)[2])))
# Max number of clusters, depending on the total number of samples.
max.cluster <- min(100, max(min.cluster + 1, ceiling(dim(X)[2]^0.50)))

# Filter genes
rowsum.tau <- dim(X)[2]  # the gene's TPM value is at least 1 on average
sd.val <- apply(X, 1, sd)
lambda <- 0.3
#sd.tau <- lambda * summary(sd.val)[3] + (1-lambda) * summary(sd.val)[5]  # genes whose expression varies least are to be filtered
sd.tau <- 1
index.row <- rowSums(X) > rowsum.tau & sd.val > sd.tau & !is.na(sd.val)
X <- log(X[index.row, ] + 1.0)

# Normalize each row such that its mean is 0 and standard deviation is 1
normalize <- function(X) {
    d <- dim(X)
    num_row <- d[1]
    num_col <- d[2]
    s <- apply(X, 1, sd)
    S <- matrix(rep(s, num_col), nrow=num_row)
    m <- apply(X, 1, mean)
    M <- matrix(rep(m, num_col), nrow=num_row)
    (X - M) / S
}

# Choose the optimal number of clusters such that they have the best agreement with tissue labels
# Added on 28 June 2017, slcu, hui; last modified on 5 Nov 2022, hui
get.optimal.number.of.clusters <- function(X, clusters, tissue.matrix, min.cluster, max.cluster) {
    labels <- as.vector(tissue.matrix$suggested.tissue)
    labels <- unlist(lapply(labels, function(x) {
        e <- regexpr("\\.", x)[1]
        if (e > 0) { substr(x, 1, e - 1) } else { x }
    }))  # remove subcategories
    run.label <- c()  # which tissue does each run (experiment) come from?
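    # The loop below maps each RNA-seq run ID (a column of X) to its tissue label
    # via tissue.matrix$run.id.  A vectorized sketch of the same lookup, assuming
    # each run ID appears exactly once in tissue.matrix (not verified here):
    #   run.label <- labels[match(colnames(X), as.vector(tissue.matrix$run.id))]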
    for (rseqid in colnames(X)) {  # X is the gene expression matrix
        i <- which(as.vector(tissue.matrix$run.id) == rseqid)  # tissue.matrix contains tissue information for each RNA-seq ID
        suggested.tissue.name <- labels[i]
        run.label <- c(run.label, suggested.tissue.name)
    }

    best.cn <- min.cluster  # best cluster number
    best.mix.rate <- 0      # a perfect mix rate is 1.0

    # Find the cluster number that results in the largest tissue homogeneity
    for (cn in seq(min.cluster, max.cluster, 1)) {  # cn is the number of clusters
        cut <- cutree(clusters, cn)
        mix.sum <- 0
        mix.count <- 0
        for (c in unique(cut)) {  # each cluster
            sample.index <- (cut == c)
            t <- run.label[sample.index]  # t is a list of tissue names in this cluster
            max.freq <- max(as.data.frame(table(t))$Freq)  # frequency of the most frequent tissue name in this cluster
            sum.freq <- sum(as.data.frame(table(t))$Freq)  # total number of tissue names in this cluster
            mix.sum <- mix.sum + max.freq/sum.freq
            mix.count <- mix.count + 1
        }
        # Make sure high tissue homogeneity is much preferred, and that clusters are not too small.
        mix.rate <- log10(length(run.label)/mix.count) * (mix.sum/mix.count)^8
        cat(sprintf('In get.optimal.number.of.clusters: cluster number:%d\t%4.1f\tpercentage:%4.2f\tmix.rate:%4.2f\n',
                    cn, length(run.label)/mix.count, mix.sum/mix.count, mix.rate))
        if (mix.rate > best.mix.rate) {
            best.mix.rate <- mix.rate
            best.cn <- cn
        }
    }
    list(cn=best.cn, mix.rate=best.mix.rate)
}

cat(sprintf('Read %s\n', AGINAME.FILE))
agi <- read.table(AGINAME.FILE, stringsAsFactors=F)  # AGINAME.FILE cannot contain quotes

cat(sprintf('Read %s\n', TARGET.TF.FILE))
target.tf <- read.table(TARGET.TF.FILE, header=FALSE, check.names=FALSE, sep='\t')
total.pair <- dim(target.tf)[1]

cat(sprintf('Read %s\n', TISSUE.FILE))
tissue <- read.table(TISSUE.FILE, header=TRUE, check.names=FALSE, sep='\t')

cat(sprintf('min.cluster=%d, max.cluster=%d, min.sample=%d, r.tau=%4.2f\n', min.cluster, max.cluster, min.sample, r.tau))

cat('Hclust ...\n')
X2 <- normalize(X)  # each row of X2 has mean 0 and standard deviation 1
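# Optional sanity check (a minimal sketch, not part of the original pipeline):
# confirm that normalize() produced rows with mean ~0 and sd ~1, up to floating-point tolerance.
#   stopifnot(max(abs(apply(X2, 1, mean))) < 1e-8,
#             max(abs(apply(X2, 1, sd) - 1)) < 1e-8)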
clusters <- hclust(dist(t(X2)), method='average')
#saveRDS(clusters, file="clusters.rds")
#clusters <- readRDS(file="clusters.rds")

cat(sprintf('Determine optimal number of clusters ...\n'))
cn.result <- get.optimal.number.of.clusters(X, clusters, tissue, min.cluster, max.cluster)
cat(sprintf('Best number of clusters %d, best mix rate %4.2f..\n', cn.result$cn, cn.result$mix.rate))
optimal.cut <- cutree(clusters, cn.result$cn)
sample.names <- names(optimal.cut)

output.file <- paste('../Data/history/edges/one_target/edges.txt', 'fixed.group', format(Sys.time(), '%b.%d.%Y.%H%M%S'), sep='.')
f <- file(output.file, 'w')

cat(sprintf('Go through %d pairs...\n', total.pair))
for (i in 1:total.pair) {
    gene.tf <- as.vector(target.tf[i, 2])
    gene.target <- as.vector(target.tf[i, 1])
    all.in <- gene.tf %in% all.genes & gene.target %in% all.genes
    if (!all.in) {
        next
    }
    if (!gene.tf %in% rownames(X) || !gene.target %in% rownames(X)) {  # make sure both gene.tf and gene.target are in X
        next
    }

    # If there are too few RNA-seq samples, or the correlation on all RNA-seq samples
    # is already good, don't look for group correlation.
    x <- as.vector(t(X[gene.tf, ]))
    y <- as.vector(t(X[gene.target, ]))
    index <- x < 0.01 | y < 0.01  # don't include values that are too small
    x.1 <- x[!index]
    y.1 <- y[!index]
    if (length(x.1) < min.sample) {
        next
    } else if (cor(x.1, y.1) >= r.tau) {
        next
    }

    name1 <- agi$V2[which(agi$V1 == gene.tf)]
    name2 <- agi$V2[which(agi$V1 == gene.target)]

    # initial values
    max.pos.r <- 0.0
    max.pos.n <- 0
    max.pos.samples <- c()
    max.neg.r <- 0.0
    max.neg.n <- 0
    max.neg.samples <- c()

    # check each cluster of the optimal cut
    for (c in unique(optimal.cut)) {  # each cluster
        sample.index <- (optimal.cut == c)
        x <- as.vector(t(X[gene.tf, sample.index]))
        y <- as.vector(t(X[gene.target, sample.index]))
        n <- length(x)
        if (n > min.sample && sd(x) > 0.1 && sd(y) > 0.1) {  # both x and y should vary
            r <- cor(x, y)
        } else {
            r <- 0.0
        }
        # prefer larger n (number of samples)
        if (n > min.sample && r > r.tau && n > max.pos.n) {
            max.pos.r <- r
            max.pos.n <- n
            max.pos.samples <- sample.names[sample.index]
        }
        if (n > min.sample && r < -r.tau && n > max.neg.n) {
            max.neg.r <- r
            max.neg.n <- n
            max.neg.samples <- sample.names[sample.index]
        }
    }

    # save results
    curr.date <- gsub('-', '', Sys.Date())
    loglik <- '-991.1'
    cond <- as.vector(target.tf[i, 3])
    result.1 <- NULL
    result.2 <- NULL
    if (max.pos.n > 0) {
        sub.cond <- paste(max.pos.samples, collapse=' ')
        num.sub.cond <- length(max.pos.samples)
        result.1 <- sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\t%4.2f\t%s\n', gene.target, name2, gene.tf, name1, max.pos.r, 'mix', num.sub.cond, cond, loglik, curr.date, max.pos.r, 'hclust.fixed.group')
    }
    if (max.neg.n > 0) {
        sub.cond <- paste(max.neg.samples, collapse=' ')
        num.sub.cond <- length(max.neg.samples)
        result.2 <- sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\t%4.2f\t%s\n', gene.target, name2, gene.tf, name1, max.neg.r, 'mix', num.sub.cond, cond, loglik, curr.date, max.neg.r, 'hclust.fixed.group')
    }
    if (!is.null(result.1)) {
        cat(result.1, file=f, sep='')
    }
    if (!is.null(result.2)) {
        cat(result.2, file=f, sep='')
    }
}
close(f)
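
# Sanity-check sketch (not part of the original pipeline): verify the claim above that
# r=0.6 on 50 samples has a two-tailed p-value of about 0.000004, using the standard
# t-statistic for a Pearson correlation coefficient.  check.cor.pvalue is a hypothetical
# helper added here for illustration only.
check.cor.pvalue <- function(r, n) {
    t.stat <- r * sqrt(n - 2) / sqrt(1 - r^2)
    2 * pt(abs(t.stat), df = n - 2, lower.tail = FALSE)
}
cat(sprintf('Two-tailed p-value for r=0.6, n=50: %.1e\n', check.cor.pvalue(0.6, 50)))  # ~4e-06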