summaryrefslogtreecommitdiff
path: root/Code/correlation_per_tissue.R
diff options
context:
space:
mode:
authorHui Lan <lanhui@zjnu.edu.cn>2019-12-04 19:03:19 +0800
committerHui Lan <lanhui@zjnu.edu.cn>2019-12-04 19:03:19 +0800
commit97fdefab064f63642fa3ece05b807d29b459df31 (patch)
treea058530023224f3e35b1783996f3530c80c04bc5 /Code/correlation_per_tissue.R
brain: add python and R code to local repository.
Diffstat (limited to 'Code/correlation_per_tissue.R')
-rw-r--r--Code/correlation_per_tissue.R101
1 files changed, 101 insertions, 0 deletions
diff --git a/Code/correlation_per_tissue.R b/Code/correlation_per_tissue.R
new file mode 100644
index 0000000..d9aadf9
--- /dev/null
+++ b/Code/correlation_per_tissue.R
@@ -0,0 +1,101 @@
+# Last modified on 8 Aug 2019
+
+TISSUE.FILE <- '../Data/information/experiment.and.tissue.txt'
+DATA.FILE <- '../Data/history/expr/TPM.txt'
+TEMP.DIR <- '../Data/temp'
+TARGET.FILE <- '../Data/temp/all_targets.txt'
+TF.FILE <- '../Data/temp/all_tfs.txt'
+tau <- 0.60
+
+
+# Make sure we have required files
+if (! file.exists(TISSUE.FILE)) {
+ stop(sprintf('[correlation_per_tissue.R] Unable to find %s', TISSUE.FILE))
+}
+
+if (! file.exists(DATA.FILE)) {
+ stop(sprintf('[correlation_per_tissue.R] Unable to find %s', DATA.FILE))
+}
+
+if (! dir.exists(TEMP.DIR)) {
+ stop(sprintf('[correlation_per_tissue.R] Unable to find directory %s', TEMP.DIR))
+}
+
+if (! file.exists(TARGET.FILE)) {
+ stop(sprintf('[correlation_per_tissue.R] Unable to find %s', TARGET.FILE))
+}
+
+if (! file.exists(TF.FILE)) {
+ stop(sprintf('[correlation_per_tissue.R] Unable to find %s', TF.FILE))
+}
+
+
+X0 <- read.table(DATA.FILE, header=TRUE, check.names=FALSE)
+all.id <- X0$gene_id
+X0$gene_id <- NULL # remove column gene_id
+row.names(X0) <- all.id # add row names
+all_genes <- rownames(X0)
+
+tissue <- read.table(TISSUE.FILE, header=TRUE, check.names=FALSE, sep='\t')
+labels <- as.vector(tissue$suggested.tissue)
+labels <- unlist(lapply(labels, function(x) {e<-regexpr("\\.", x)[1]; if (e > 0) {y<-substr(x, 1, e-1)} else {x} })) # remove subcategories
+unique.label <- unique(labels)
+
+targets <- read.table(TARGET.FILE, header=FALSE)
+tfs <- read.table(TF.FILE, header=FALSE)
+targets <- as.vector(targets$V1)
+tfs <- as.vector(tfs$V1)
+
+
+##############################################################
+get.index <- function(X, tissue.matrix, tissue.name) {
+ labels <- as.vector(tissue.matrix$suggested.tissue)
+ labels <- unlist(lapply(labels, function(x) {e<-regexpr("\\.", x)[1]; if (e > 0) {y<-substr(x, 1, e-1)} else {x} })) # remove subcategories
+ index <- c()
+ count <- 1
+ for (rseqid in colnames(X)) {
+ i <- which(as.vector(tissue.matrix$run.id) == rseqid)
+ if (length(i) != 0) {
+ suggested.tissue.name <- labels[i]
+ if (tissue.name == suggested.tissue.name) {
+ index <- c(index, count)
+ }
+ }
+ count <- count + 1
+ }
+ index
+}
+##############################################################
+
+
+# for each tissue type, get a correlation matrix
+for (ul in unique.label) {
+ index.rnaseq <- get.index(X0, tissue, ul)
+ if (length(index.rnaseq) >= 50) {
+ OUTPUT.FILE <- paste('../Data/temp/edges.txt.simple.correlation.tissue', ul, 'txt', sep='.')
+
+ X <- as.matrix(X0[, index.rnaseq])
+ sd.1 <- apply(X, 1, sd) # sd of each row
+ s0 <- apply(X, 1, function(c) sum(c==0)) # number of zeros in each row
+ sd.tau <- (quantile(sd.1)[1] + quantile(sd.1)[2]) / 2.0 # min SD
+ good <- sd.1 > max(sd.tau, 0.05)
+ tf_good <- which( good & (all_genes %in% tfs) == T )
+ target_good <- which( good & (all_genes %in% targets) == T )
+
+ # Compute correlation coefficient
+ X <- log(X + 1)
+ X[X<0.01] <- NA
+ if (length(tf_good) > 1) {
+ c <- cor(t(X[target_good,]), t(X[tf_good,]), use='pairwise.complete.obs')
+ } else {
+ c <- cor(t(X[target_good,]), t(X[c(tf_good, tf_good), ]), use='pairwise.complete.obs')
+ }
+ index <- !is.na(c) & abs(c) >= tau & abs(c) <= 0.99
+ row_names <- rownames(c)
+ col_names <- colnames(c)
+ result <- data.frame(row = row_names[row(c)[index]], col = col_names[col(c)[index]], r = c[index], tissue=rep(ul, sum(index)), numrnaseqids=rep(length(index.rnaseq), sum(index)))
+
+ # write results
+ write.table(result, OUTPUT.FILE, col.names=F, row.names=F, sep='\t', quote=F)
+ }
+}