From 97fdefab064f63642fa3ece05b807d29b459df31 Mon Sep 17 00:00:00 2001 From: Hui Lan Date: Wed, 4 Dec 2019 19:03:19 +0800 Subject: brain: add python and R code to local repository. --- Code/create_edges_k2.R | 136 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 Code/create_edges_k2.R (limited to 'Code/create_edges_k2.R') diff --git a/Code/create_edges_k2.R b/Code/create_edges_k2.R new file mode 100644 index 0000000..a044ed0 --- /dev/null +++ b/Code/create_edges_k2.R @@ -0,0 +1,136 @@ +# Last modified 13 August 2019 + +TARGET_TF_FILE <- "../Data/information/target_tf.txt" +DATA_FILE <- "../Data/history/expr/TPM.txt" # A TPM table +AGINAME_FILE <- "../Data/information/AGI-to-gene-names_v2.txt" +CORR_THRESHOLD <- 0.5 +MIN_SIZE <- 100 + + +# Make sure we have required files +if (! file.exists(TARGET_TF_FILE)) { + stop(sprintf('[create_edges2_k2.R] Unable to find %s', TARGET_TF_FILE)) +} + +if (! file.exists(DATA_FILE)) { + stop(sprintf('[create_edges_k2.R] Unable to find %s', DATA_FILE)) +} + +if (! file.exists(AGINAME_FILE)) { + stop(sprintf('[create_edges_k2.R] Unable to find %s', AGINAME_FILE)) +} + + +####### Read data ######################################### +X <- read.table(DATA_FILE, header=TRUE, check.names=FALSE) +gene_id <- X$gene_id +X$gene_id <- NULL +row.names(X) <- gene_id +X <- as.matrix(X) +rna.sample.id <- colnames(X) + +target_tf <- read.table(TARGET_TF_FILE, sep='\t', header=FALSE) +target_tf <- as.matrix(target_tf) +targets <- target_tf[,1] +tfs <- target_tf[,2] +conditions <- target_tf[,3] + +agi <- read.table(AGINAME_FILE, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes +####################################################### + +library(mixtools) +options(max.print=999999999) +output.file <- paste('../Data/history/edges/one_target/edges.txt', 'mixtools', format(Sys.time(), '%b.%d.%Y.%H%M%S'), sep='.') +f <- file(output.file, 'w') + +for (i in 1:length(targets)) { + curr.date <- gsub('-','',Sys.Date()) + id1 <- tfs[i] + id2 <- targets[i] + if (id1 %in% gene_id == F || id2 %in% gene_id == F) { + next + } + + name1 <- agi$V2[which(agi$V1 == id1)] + name2 <- agi$V2[which(agi$V1 == id2)] + + cond <- conditions[i] + x <- X[id1,] + y <- X[id2,] + x <- log(x+1) + y <- log(y+1) + index <- x < 0.01 | y < 0.01 + x <- x[!index] + y <- y[!index] + if (length(x) < 3 | sd(x) < 0.1 | sd(y) < 0.1 ) { + next + } + r <- cor(x, y) + if (abs(r) >= CORR_THRESHOLD) { + s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\n', id2, name2,id1,name1, r, 'all', '.', cond, '.', curr.date) + #cat(s, file=result.file, sep='\n', append=T) + #cat(s, sep='\n') + #flush.console() + #write.table(s, file.name, quote=F, sep='', row.names=F, append=T, col.names=F) + next + } + + k <- 2 + N <- length(x) + tryCatch( em.out <- regmixEM(y, x, maxit=100, epsilon=1e-03, k=k), error=function(e) NULL ) + if (length(em.out) == 0) { # if there is an error when running regmixEM, we skip. + next + } + + pos_r_max <- -2 + pos_r_N <- 0 + pos_r_index <- c() + pos_r_loglik <- -100000000 + + neg_r_max <- 2 + neg_r_N <- 0 + neg_r_index <- c() + neg_r_loglik <- -100000000 + + for (j in seq(1,k,1)) { + + index <- which(max.col(em.out$posterior) == j) + size <- length(index) + r <- cor(em.out$x[index,2], em.out$y[index]) + + if (!is.na(r) && r >= CORR_THRESHOLD && size >= MIN_SIZE && r > pos_r_max && size > pos_r_N) { + pos_r_max <- r + pos_r_N <- size + pos_r_index <- index + pos_r_loglik <- em.out$loglik + } + if (!is.na(r) && r <= -CORR_THRESHOLD && size >= MIN_SIZE && r < neg_r_max && size > neg_r_N) { + neg_r_max <- r + neg_r_N <- size + neg_r_index <- index + neg_r_loglik <- em.out$loglik + } + } + + if (pos_r_max > 0) { + sub.cond <- paste(rna.sample.id[pos_r_index], collapse=' ') + num.sub.cond <- length(rna.sample.id[pos_r_index]) + s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%d\t%s\t%4.2f\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, pos_r_max, 'mix', num.sub.cond, cond, pos_r_loglik, curr.date, pos_r_max, 'mixtool') + #cat(s, file=result.file, sep='\n', append=T) + #cat(s, sep='\n') + #write.table(s, file.name, quote=F, sep='', row.names=F, append=T, col.names=F) + cat(s, file=f, sep='') + } + + if (neg_r_max < 0) { + sub.cond <- paste(rna.sample.id[neg_r_index], collapse=' ') + num.sub.cond <- length(rna.sample.id[neg_r_index]) + s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%d\t%s\t%4.2f\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, neg_r_max, 'mix', num.sub.cond, cond, neg_r_loglik, curr.date, neg_r_max, 'mixtool') + #cat(s, file=result.file, sep='\n', append=T) + #cat(s, sep='\n') + #write.table(s, file.name, quote=F, sep='', row.names=F, append=T, col.names=F) + cat(s, file=f, sep='') + } +} + +close(f) \ No newline at end of file -- cgit v1.2.1