1 files changed, 18 insertions, 16 deletions
diff --git a/Code/correlation_per_group_fixed_number.R b/Code/correlation_per_group_fixed_number.R
index 3407973..dd28971 100644
--- a/Code/correlation_per_group_fixed_number.R
+++ b/Code/correlation_per_group_fixed_number.R
@@ -1,6 +1,6 @@
 # Last modified on 9 Aug 2019
 # Last modified on 11 Aug 2019
-
+# Last modified on  9 Nov 2022
 # Purpose: divide the samples into fixed number of groups and compute
 # correlation on each group.  The optimal number of groups is
 # determined using tissue labels, to maximize the agreement between
@@ -113,16 +113,20 @@ cat(sprintf('Read %s\n', TARGET.TF.FILE))
 target.tf <- read.table(TARGET.TF.FILE, header=FALSE, check.names=FALSE, sep='\t')
 total.pair <- dim(target.tf)[1]
 
+cat(sprintf('Read %s\n', TISSUE.FILE))
+tissue <- read.table(TISSUE.FILE, header=TRUE, check.names=FALSE, sep='\t')
+
 cat(sprintf('min.cluster=%d, max.cluster=%d, min.sample=%d, r.tau=%4.2f\n', min.cluster, max.cluster, min.sample, r.tau))
 cat('Hclust ...\n')
 X2 <- normalize(X) # each row of X2 has mean 0 and standard deviation 1.
 clusters <- hclust(dist(t(X2)), method = 'average')
+#saveRDS(clusters, file="clusters.rds")
+#clusters <- readRDS(file="clusters.rds")
 cat(sprintf('Determine optimal number of clusters ...\n'))
-tissue <- read.table(TISSUE.FILE, header=TRUE, check.names=FALSE, sep='\t')
 cn.result <- get.optimal.number.of.clusters(X, clusters, tissue, min.cluster, max.cluster)
 cat(sprintf('Best number of clusters %d, best mix rate %4.2f..\n', cn.result$cn, cn.result$mix.rate))
-cut <- cutree(clusters, cn.result$cn)
-sample.names <- names(cut)
+optimal.cut <- cutree(clusters, cn.result$cn)
+sample.names <- names(optimal.cut)
 
 output.file <- paste('../Data/history/edges/one_target/edges.txt', 'fixed.group', format(Sys.time(), '%b.%d.%Y.%H%M%S'), sep='.')
 f <- file(output.file, 'w')
@@ -132,6 +136,7 @@ for (i in 1:total.pair) {
     
     gene.tf <- as.vector(target.tf[i,2])
     gene.target <- as.vector(target.tf[i,1])
+
     all.in <- gene.tf %in% all.genes & gene.target %in% all.genes
     if (!all.in) {
         next
@@ -163,9 +168,9 @@ for (i in 1:total.pair) {
     max.neg.n <- 0
     max.neg.samples <- c()
 
-    # cut tree into different number of clusters
-    for (c in unique(cut)) { # each cluster
-        sample.index <- (cut == c)
+    # check each cluster with the optimal cut
+    for (c in unique(optimal.cut)) { # each cluster
+        sample.index <- (optimal.cut == c)
         x <- as.vector(t(X[gene.tf, sample.index]))
         y <- as.vector(t(X[gene.target, sample.index]))
         n <- length(x)
@@ -175,6 +180,7 @@ for (i in 1:total.pair) {
             r <- 0.0
         }
 
+        # prefer larger n (number of samples)
         if (n > min.sample & r > r.tau & n > max.pos.n) {
             max.pos.r <- r
             max.pos.n <- n
@@ -205,15 +211,11 @@ for (i in 1:total.pair) {
 	num.sub.cond <- length(max.neg.samples)	
         result.2 = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\t%4.2f\t%s\n', gene.target, name2, gene.tf, name1, max.neg.r, 'mix', num.sub.cond, cond, loglik, curr.date, max.neg.r, 'hclust.fixed.group')
     }
-    if (result.1 != '' | result.2 != '') {
-        if (result.1 != '' & result.2 != '') {
-            result <- paste(result.1, result.2, sep='')
-        } else if (result.1 != '') {
-            result <- result.1
-        } else if (result.2 != '') {
-            result <- result.2            
-        }
-        cat(result, file=f, sep='')	
+    if (result.1 != '') {
+        cat(result.1, file=f, sep='')
+    }
+    if (result.2 != '') {
+        cat(result.2, file=f, sep='')
     }
 }