# Last modified on 8 Aug 2019 TISSUE.FILE <- '../Data/information/experiment.and.tissue.txt' DATA.FILE <- '../Data/history/expr/TPM.txt' TEMP.DIR <- '../Data/temp' TARGET.FILE <- '../Data/temp/all_targets.txt' TF.FILE <- '../Data/temp/all_tfs.txt' tau <- 0.60 # Make sure we have required files if (! file.exists(TISSUE.FILE)) { stop(sprintf('[correlation_per_tissue.R] Unable to find %s', TISSUE.FILE)) } if (! file.exists(DATA.FILE)) { stop(sprintf('[correlation_per_tissue.R] Unable to find %s', DATA.FILE)) } if (! dir.exists(TEMP.DIR)) { stop(sprintf('[correlation_per_tissue.R] Unable to find directory %s', TEMP.DIR)) } if (! file.exists(TARGET.FILE)) { stop(sprintf('[correlation_per_tissue.R] Unable to find %s', TARGET.FILE)) } if (! file.exists(TF.FILE)) { stop(sprintf('[correlation_per_tissue.R] Unable to find %s', TF.FILE)) } X0 <- read.table(DATA.FILE, header=TRUE, check.names=FALSE) all.id <- X0$gene_id X0$gene_id <- NULL # remove column gene_id row.names(X0) <- all.id # add row names all_genes <- rownames(X0) tissue <- read.table(TISSUE.FILE, header=TRUE, check.names=FALSE, sep='\t') labels <- as.vector(tissue$suggested.tissue) labels <- unlist(lapply(labels, function(x) {e<-regexpr("\\.", x)[1]; if (e > 0) {y<-substr(x, 1, e-1)} else {x} })) # remove subcategories unique.label <- unique(labels) targets <- read.table(TARGET.FILE, header=FALSE) tfs <- read.table(TF.FILE, header=FALSE) targets <- as.vector(targets$V1) tfs <- as.vector(tfs$V1) ############################################################## get.index <- function(X, tissue.matrix, tissue.name) { labels <- as.vector(tissue.matrix$suggested.tissue) labels <- unlist(lapply(labels, function(x) {e<-regexpr("\\.", x)[1]; if (e > 0) {y<-substr(x, 1, e-1)} else {x} })) # remove subcategories index <- c() count <- 1 for (rseqid in colnames(X)) { i <- which(as.vector(tissue.matrix$run.id) == rseqid) if (length(i) != 0) { suggested.tissue.name <- labels[i] if (tissue.name == suggested.tissue.name) { index <- c(index, count) } } count <- count + 1 } index } ############################################################## # for each tissue type, get a correlation matrix for (ul in unique.label) { index.rnaseq <- get.index(X0, tissue, ul) if (length(index.rnaseq) >= 50) { OUTPUT.FILE <- paste('../Data/temp/edges.txt.simple.correlation.tissue', ul, 'txt', sep='.') X <- as.matrix(X0[, index.rnaseq]) sd.1 <- apply(X, 1, sd) # sd of each row s0 <- apply(X, 1, function(c) sum(c==0)) # number of zeros in each row sd.tau <- (quantile(sd.1, na.rm=TRUE)[1] + quantile(sd.1, na.rm=TRUE)[2]) / 2.0 # min SD good <- sd.1 > max(sd.tau, 0.05) tf_good <- which( good & (all_genes %in% tfs) == T ) target_good <- which( good & (all_genes %in% targets) == T ) # Compute correlation coefficient X <- log(X + 1) X[X<0.01] <- NA if (length(tf_good) > 1) { c <- cor(t(X[target_good,]), t(X[tf_good,]), use='pairwise.complete.obs') } else { c <- cor(t(X[target_good,]), t(X[c(tf_good, tf_good), ]), use='pairwise.complete.obs') } index <- !is.na(c) & abs(c) >= tau & abs(c) <= 0.99 row_names <- rownames(c) col_names <- colnames(c) result <- data.frame(row = row_names[row(c)[index]], col = col_names[col(c)[index]], r = c[index], tissue=rep(ul, sum(index)), numrnaseqids=rep(length(index.rnaseq), sum(index))) # write results write.table(result, OUTPUT.FILE, col.names=F, row.names=F, sep='\t', quote=F) } }