summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHui Lan <lanhui@zjnu.edu.cn>2022-11-09 16:00:27 +0800
committerHui Lan <lanhui@zjnu.edu.cn>2022-11-09 16:00:27 +0800
commit5b13b624473ce0c016272af1ef44aeb4991f1a33 (patch)
tree34eb7ad90009dd5656123ed23073ca83f64c5718
parentf9b584c029a88036df36088372aed1664a4a6c20 (diff)
correlation_per_group_fixed_number.R: simplify
-rw-r--r--Code/correlation_per_group_fixed_number.R34
1 files changed, 18 insertions, 16 deletions
diff --git a/Code/correlation_per_group_fixed_number.R b/Code/correlation_per_group_fixed_number.R
index 3407973..dd28971 100644
--- a/Code/correlation_per_group_fixed_number.R
+++ b/Code/correlation_per_group_fixed_number.R
@@ -1,6 +1,6 @@
# Last modified on 9 Aug 2019
# Last modified on 11 Aug 2019
-
+# Last modified on 9 Nov 2022
# Purpose: divide the samples into fixed number of groups and compute
# correlation on each group. The optimal number of groups is
# determined using tissue labels, to maximize the agreement between
@@ -113,16 +113,20 @@ cat(sprintf('Read %s\n', TARGET.TF.FILE))
target.tf <- read.table(TARGET.TF.FILE, header=FALSE, check.names=FALSE, sep='\t')
total.pair <- dim(target.tf)[1]
+cat(sprintf('Read %s\n', TISSUE.FILE))
+tissue <- read.table(TISSUE.FILE, header=TRUE, check.names=FALSE, sep='\t')
+
cat(sprintf('min.cluster=%d, max.cluster=%d, min.sample=%d, r.tau=%4.2f\n', min.cluster, max.cluster, min.sample, r.tau))
cat('Hclust ...\n')
X2 <- normalize(X) # each row of X2 has mean 0 and standard deviation 1.
clusters <- hclust(dist(t(X2)), method = 'average')
+#saveRDS(clusters, file="clusters.rds")
+#clusters <- readRDS(file="clusters.rds")
cat(sprintf('Determine optimal number of clusters ...\n'))
-tissue <- read.table(TISSUE.FILE, header=TRUE, check.names=FALSE, sep='\t')
cn.result <- get.optimal.number.of.clusters(X, clusters, tissue, min.cluster, max.cluster)
cat(sprintf('Best number of clusters %d, best mix rate %4.2f..\n', cn.result$cn, cn.result$mix.rate))
-cut <- cutree(clusters, cn.result$cn)
-sample.names <- names(cut)
+optimal.cut <- cutree(clusters, cn.result$cn)
+sample.names <- names(optimal.cut)
output.file <- paste('../Data/history/edges/one_target/edges.txt', 'fixed.group', format(Sys.time(), '%b.%d.%Y.%H%M%S'), sep='.')
f <- file(output.file, 'w')
@@ -132,6 +136,7 @@ for (i in 1:total.pair) {
gene.tf <- as.vector(target.tf[i,2])
gene.target <- as.vector(target.tf[i,1])
+
all.in <- gene.tf %in% all.genes & gene.target %in% all.genes
if (!all.in) {
next
@@ -163,9 +168,9 @@ for (i in 1:total.pair) {
max.neg.n <- 0
max.neg.samples <- c()
- # cut tree into different number of clusters
- for (c in unique(cut)) { # each cluster
- sample.index <- (cut == c)
+ # check each cluster with the optimal cut
+ for (c in unique(optimal.cut)) { # each cluster
+ sample.index <- (optimal.cut == c)
x <- as.vector(t(X[gene.tf, sample.index]))
y <- as.vector(t(X[gene.target, sample.index]))
n <- length(x)
@@ -175,6 +180,7 @@ for (i in 1:total.pair) {
r <- 0.0
}
+ # prefer larger n (number of samples)
if (n > min.sample & r > r.tau & n > max.pos.n) {
max.pos.r <- r
max.pos.n <- n
@@ -205,15 +211,11 @@ for (i in 1:total.pair) {
num.sub.cond <- length(max.neg.samples)
result.2 = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\t%4.2f\t%s\n', gene.target, name2, gene.tf, name1, max.neg.r, 'mix', num.sub.cond, cond, loglik, curr.date, max.neg.r, 'hclust.fixed.group')
}
- if (result.1 != '' | result.2 != '') {
- if (result.1 != '' & result.2 != '') {
- result <- paste(result.1, result.2, sep='')
- } else if (result.1 != '') {
- result <- result.1
- } else if (result.2 != '') {
- result <- result.2
- }
- cat(result, file=f, sep='')
+ if (result.1 != '') {
+ cat(result.1, file=f, sep='')
+ }
+ if (result.2 != '') {
+ cat(result.2, file=f, sep='')
}
}