diff options
| -rw-r--r-- | Code/correlation_per_group_fixed_number.R | 34 | 
1 files changed, 18 insertions, 16 deletions
| diff --git a/Code/correlation_per_group_fixed_number.R b/Code/correlation_per_group_fixed_number.R index 3407973..dd28971 100644 --- a/Code/correlation_per_group_fixed_number.R +++ b/Code/correlation_per_group_fixed_number.R @@ -1,6 +1,6 @@  # Last modified on 9 Aug 2019  # Last modified on 11 Aug 2019 - +# Last modified on  9 Nov 2022  # Purpose: divide the samples into fixed number of groups and compute  # correlation on each group.  The optimal number of groups is  # determined using tissue labels, to maximize the agreement between @@ -113,16 +113,20 @@ cat(sprintf('Read %s\n', TARGET.TF.FILE))  target.tf <- read.table(TARGET.TF.FILE, header=FALSE, check.names=FALSE, sep='\t')  total.pair <- dim(target.tf)[1] +cat(sprintf('Read %s\n', TISSUE.FILE)) +tissue <- read.table(TISSUE.FILE, header=TRUE, check.names=FALSE, sep='\t') +  cat(sprintf('min.cluster=%d, max.cluster=%d, min.sample=%d, r.tau=%4.2f\n', min.cluster, max.cluster, min.sample, r.tau))  cat('Hclust ...\n')  X2 <- normalize(X) # each row of X2 has mean 0 and standard deviation 1.  clusters <- hclust(dist(t(X2)), method = 'average') +#saveRDS(clusters, file="clusters.rds") +#clusters <- readRDS(file="clusters.rds")  cat(sprintf('Determine optimal number of clusters ...\n')) -tissue <- read.table(TISSUE.FILE, header=TRUE, check.names=FALSE, sep='\t')  cn.result <- get.optimal.number.of.clusters(X, clusters, tissue, min.cluster, max.cluster)  cat(sprintf('Best number of clusters %d, best mix rate %4.2f..\n', cn.result$cn, cn.result$mix.rate)) -cut <- cutree(clusters, cn.result$cn) -sample.names <- names(cut) +optimal.cut <- cutree(clusters, cn.result$cn) +sample.names <- names(optimal.cut)  output.file <- paste('../Data/history/edges/one_target/edges.txt', 'fixed.group', format(Sys.time(), '%b.%d.%Y.%H%M%S'), sep='.')  f <- file(output.file, 'w') @@ -132,6 +136,7 @@ for (i in 1:total.pair) {      gene.tf <- as.vector(target.tf[i,2])      gene.target <- as.vector(target.tf[i,1]) +      all.in <- gene.tf %in% all.genes & gene.target %in% all.genes      if (!all.in) {          next @@ -163,9 +168,9 @@ for (i in 1:total.pair) {      max.neg.n <- 0      max.neg.samples <- c() -    # cut tree into different number of clusters -    for (c in unique(cut)) { # each cluster -        sample.index <- (cut == c) +    # check each cluster with the optimal cut +    for (c in unique(optimal.cut)) { # each cluster +        sample.index <- (optimal.cut == c)          x <- as.vector(t(X[gene.tf, sample.index]))          y <- as.vector(t(X[gene.target, sample.index]))          n <- length(x) @@ -175,6 +180,7 @@ for (i in 1:total.pair) {              r <- 0.0          } +        # prefer larger n (number of samples)          if (n > min.sample & r > r.tau & n > max.pos.n) {              max.pos.r <- r              max.pos.n <- n @@ -205,15 +211,11 @@ for (i in 1:total.pair) {  	num.sub.cond <- length(max.neg.samples)	          result.2 = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\t%4.2f\t%s\n', gene.target, name2, gene.tf, name1, max.neg.r, 'mix', num.sub.cond, cond, loglik, curr.date, max.neg.r, 'hclust.fixed.group')      } -    if (result.1 != '' | result.2 != '') { -        if (result.1 != '' & result.2 != '') { -            result <- paste(result.1, result.2, sep='') -        } else if (result.1 != '') { -            result <- result.1 -        } else if (result.2 != '') { -            result <- result.2             -        } -        cat(result, file=f, sep='')	 +    if (result.1 != '') { +        cat(result.1, file=f, sep='') +    } +    if (result.2 != '') { +        cat(result.2, file=f, sep='')      }  } | 
