diff options
author | Hui Lan <lanhui@zjnu.edu.cn> | 2022-11-09 16:00:27 +0800 |
---|---|---|
committer | Hui Lan <lanhui@zjnu.edu.cn> | 2022-11-09 16:00:27 +0800 |
commit | 5b13b624473ce0c016272af1ef44aeb4991f1a33 (patch) | |
tree | 34eb7ad90009dd5656123ed23073ca83f64c5718 | |
parent | f9b584c029a88036df36088372aed1664a4a6c20 (diff) |
correlation_per_group_fixed_number.R: simplify
-rw-r--r-- | Code/correlation_per_group_fixed_number.R | 34 |
1 files changed, 18 insertions, 16 deletions
diff --git a/Code/correlation_per_group_fixed_number.R b/Code/correlation_per_group_fixed_number.R index 3407973..dd28971 100644 --- a/Code/correlation_per_group_fixed_number.R +++ b/Code/correlation_per_group_fixed_number.R @@ -1,6 +1,6 @@ # Last modified on 9 Aug 2019 # Last modified on 11 Aug 2019 - +# Last modified on 9 Nov 2022 # Purpose: divide the samples into fixed number of groups and compute # correlation on each group. The optimal number of groups is # determined using tissue labels, to maximize the agreement between @@ -113,16 +113,20 @@ cat(sprintf('Read %s\n', TARGET.TF.FILE)) target.tf <- read.table(TARGET.TF.FILE, header=FALSE, check.names=FALSE, sep='\t') total.pair <- dim(target.tf)[1] +cat(sprintf('Read %s\n', TISSUE.FILE)) +tissue <- read.table(TISSUE.FILE, header=TRUE, check.names=FALSE, sep='\t') + cat(sprintf('min.cluster=%d, max.cluster=%d, min.sample=%d, r.tau=%4.2f\n', min.cluster, max.cluster, min.sample, r.tau)) cat('Hclust ...\n') X2 <- normalize(X) # each row of X2 has mean 0 and standard deviation 1. clusters <- hclust(dist(t(X2)), method = 'average') +#saveRDS(clusters, file="clusters.rds") +#clusters <- readRDS(file="clusters.rds") cat(sprintf('Determine optimal number of clusters ...\n')) -tissue <- read.table(TISSUE.FILE, header=TRUE, check.names=FALSE, sep='\t') cn.result <- get.optimal.number.of.clusters(X, clusters, tissue, min.cluster, max.cluster) cat(sprintf('Best number of clusters %d, best mix rate %4.2f..\n', cn.result$cn, cn.result$mix.rate)) -cut <- cutree(clusters, cn.result$cn) -sample.names <- names(cut) +optimal.cut <- cutree(clusters, cn.result$cn) +sample.names <- names(optimal.cut) output.file <- paste('../Data/history/edges/one_target/edges.txt', 'fixed.group', format(Sys.time(), '%b.%d.%Y.%H%M%S'), sep='.') f <- file(output.file, 'w') @@ -132,6 +136,7 @@ for (i in 1:total.pair) { gene.tf <- as.vector(target.tf[i,2]) gene.target <- as.vector(target.tf[i,1]) + all.in <- gene.tf %in% all.genes & gene.target %in% all.genes if (!all.in) { next @@ -163,9 +168,9 @@ for (i in 1:total.pair) { max.neg.n <- 0 max.neg.samples <- c() - # cut tree into different number of clusters - for (c in unique(cut)) { # each cluster - sample.index <- (cut == c) + # check each cluster with the optimal cut + for (c in unique(optimal.cut)) { # each cluster + sample.index <- (optimal.cut == c) x <- as.vector(t(X[gene.tf, sample.index])) y <- as.vector(t(X[gene.target, sample.index])) n <- length(x) @@ -175,6 +180,7 @@ for (i in 1:total.pair) { r <- 0.0 } + # prefer larger n (number of samples) if (n > min.sample & r > r.tau & n > max.pos.n) { max.pos.r <- r max.pos.n <- n @@ -205,15 +211,11 @@ for (i in 1:total.pair) { num.sub.cond <- length(max.neg.samples) result.2 = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\t%4.2f\t%s\n', gene.target, name2, gene.tf, name1, max.neg.r, 'mix', num.sub.cond, cond, loglik, curr.date, max.neg.r, 'hclust.fixed.group') } - if (result.1 != '' | result.2 != '') { - if (result.1 != '' & result.2 != '') { - result <- paste(result.1, result.2, sep='') - } else if (result.1 != '') { - result <- result.1 - } else if (result.2 != '') { - result <- result.2 - } - cat(result, file=f, sep='') + if (result.1 != '') { + cat(result.1, file=f, sep='') + } + if (result.2 != '') { + cat(result.2, file=f, sep='') } } |