brain: add python and R code to local repository.

author: Hui Lan <lanhui@zjnu.edu.cn> 2019-12-04 19:03:19 +0800
committer: Hui Lan <lanhui@zjnu.edu.cn> 2019-12-04 19:03:19 +0800
commit: 97fdefab064f63642fa3ece05b807d29b459df31 (patch)
tree: a058530023224f3e35b1783996f3530c80c04bc5 /Code/correlation_per_tissue.R
1 files changed, 101 insertions, 0 deletions
diff --git a/Code/correlation_per_tissue.R b/Code/correlation_per_tissue.R
new file mode 100644
index 0000000..d9aadf9
--- /dev/null
+++ b/Code/correlation_per_tissue.R
@@ -0,0 +1,101 @@
+# Last modified on 8 Aug 2019
+
+TISSUE.FILE <- '../Data/information/experiment.and.tissue.txt'
+DATA.FILE   <- '../Data/history/expr/TPM.txt'
+TEMP.DIR    <- '../Data/temp'
+TARGET.FILE <- '../Data/temp/all_targets.txt'
+TF.FILE     <- '../Data/temp/all_tfs.txt'
+tau         <- 0.60
+
+
+# Make sure we have required files
+if (! file.exists(TISSUE.FILE)) {
+   stop(sprintf('[correlation_per_tissue.R] Unable to find %s', TISSUE.FILE))
+}
+
+if (! file.exists(DATA.FILE)) {
+   stop(sprintf('[correlation_per_tissue.R] Unable to find %s', DATA.FILE))
+}
+
+if (! dir.exists(TEMP.DIR)) {
+   stop(sprintf('[correlation_per_tissue.R] Unable to find directory %s', TEMP.DIR))
+}
+
+if (! file.exists(TARGET.FILE)) {
+   stop(sprintf('[correlation_per_tissue.R] Unable to find %s', TARGET.FILE))
+}
+
+if (! file.exists(TF.FILE)) {
+   stop(sprintf('[correlation_per_tissue.R] Unable to find %s', TF.FILE))
+}
+
+
+X0            <- read.table(DATA.FILE, header=TRUE, check.names=FALSE)
+all.id        <- X0$gene_id
+X0$gene_id    <- NULL   # remove column gene_id
+row.names(X0) <- all.id # add row names
+all_genes     <- rownames(X0)
+
+tissue        <- read.table(TISSUE.FILE, header=TRUE, check.names=FALSE, sep='\t')
+labels        <- as.vector(tissue$suggested.tissue)
+labels        <- unlist(lapply(labels, function(x) {e<-regexpr("\\.", x)[1]; if (e > 0) {y<-substr(x, 1, e-1)} else {x} })) # remove subcategories
+unique.label  <- unique(labels)
+
+targets       <- read.table(TARGET.FILE, header=FALSE)
+tfs           <- read.table(TF.FILE, header=FALSE)
+targets       <- as.vector(targets$V1)
+tfs           <- as.vector(tfs$V1)
+
+
+##############################################################
+get.index <- function(X, tissue.matrix, tissue.name) {
+    labels <- as.vector(tissue.matrix$suggested.tissue)
+    labels <- unlist(lapply(labels, function(x) {e<-regexpr("\\.", x)[1]; if (e > 0) {y<-substr(x, 1, e-1)} else {x} })) # remove subcategories
+    index <- c()
+    count <- 1
+    for (rseqid in colnames(X)) {
+        i <- which(as.vector(tissue.matrix$run.id) == rseqid)
+	if (length(i) != 0) {
+   	    suggested.tissue.name <- labels[i]
+	    if (tissue.name == suggested.tissue.name) {
+	        index <- c(index, count)
+	    }
+	}
+	count <- count + 1
+    }
+    index
+}
+##############################################################
+
+
+# for each tissue type, get a correlation matrix
+for (ul in unique.label) {
+    index.rnaseq <- get.index(X0, tissue, ul)
+    if (length(index.rnaseq) >= 50) {
+        OUTPUT.FILE <- paste('../Data/temp/edges.txt.simple.correlation.tissue', ul, 'txt', sep='.')
+        
+        X           <- as.matrix(X0[, index.rnaseq])
+        sd.1        <- apply(X, 1, sd) # sd of each row
+        s0          <- apply(X, 1, function(c) sum(c==0)) # number of zeros in each row
+        sd.tau      <- (quantile(sd.1)[1] +  quantile(sd.1)[2]) / 2.0 # min SD
+        good        <- sd.1 > max(sd.tau, 0.05)
+        tf_good     <- which( good & (all_genes %in% tfs) == T )
+        target_good <- which( good & (all_genes %in% targets) == T )
+
+        # Compute correlation coefficient
+        X <- log(X + 1)
+        X[X<0.01] <- NA
+        if (length(tf_good) > 1) {
+            c <- cor(t(X[target_good,]), t(X[tf_good,]), use='pairwise.complete.obs')
+	} else {
+            c <- cor(t(X[target_good,]), t(X[c(tf_good, tf_good), ]), use='pairwise.complete.obs')            
+        }
+        index <- !is.na(c) & abs(c) >= tau &  abs(c) <= 0.99
+        row_names <- rownames(c)
+        col_names <- colnames(c)
+        result <- data.frame(row = row_names[row(c)[index]], col = col_names[col(c)[index]], r = c[index], tissue=rep(ul, sum(index)), numrnaseqids=rep(length(index.rnaseq), sum(index)))
+
+        # write results
+        write.table(result, OUTPUT.FILE, col.names=F, row.names=F, sep='\t', quote=F)
+    }
+}
author	Hui Lan <lanhui@zjnu.edu.cn>	2019-12-04 19:03:19 +0800
committer	Hui Lan <lanhui@zjnu.edu.cn>	2019-12-04 19:03:19 +0800
commit	97fdefab064f63642fa3ece05b807d29b459df31 (patch)
tree	a058530023224f3e35b1783996f3530c80c04bc5 /Code/correlation_per_tissue.R