# Last modified 13 August 2019 TARGET_TF_FILE <- "../Data/information/target_tf.txt" DATA_FILE <- "../Data/history/expr/TPM.txt" # A TPM table AGINAME_FILE <- "../Data/information/AGI-to-gene-names_v2.txt" CORR_THRESHOLD <- 0.5 MIN_SIZE <- 100 # Make sure we have required files if (! file.exists(TARGET_TF_FILE)) { stop(sprintf('[create_edges2_k2.R] Unable to find %s', TARGET_TF_FILE)) } if (! file.exists(DATA_FILE)) { stop(sprintf('[create_edges_k2.R] Unable to find %s', DATA_FILE)) } if (! file.exists(AGINAME_FILE)) { stop(sprintf('[create_edges_k2.R] Unable to find %s', AGINAME_FILE)) } ####### Read data ######################################### X <- read.table(DATA_FILE, header=TRUE, check.names=FALSE) gene_id <- X$gene_id X$gene_id <- NULL row.names(X) <- gene_id X <- as.matrix(X) rna.sample.id <- colnames(X) target_tf <- read.table(TARGET_TF_FILE, sep='\t', header=FALSE) target_tf <- as.matrix(target_tf) targets <- target_tf[,1] tfs <- target_tf[,2] conditions <- target_tf[,3] agi <- read.table(AGINAME_FILE, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes ####################################################### library(mixtools) options(max.print=999999999) output.file <- paste('../Data/history/edges/one_target/edges.txt', 'mixtools', format(Sys.time(), '%b.%d.%Y.%H%M%S'), sep='.') f <- file(output.file, 'w') for (i in 1:length(targets)) { curr.date <- gsub('-','',Sys.Date()) id1 <- tfs[i] id2 <- targets[i] if (id1 %in% gene_id == F || id2 %in% gene_id == F) { next } name1 <- agi$V2[which(agi$V1 == id1)] name2 <- agi$V2[which(agi$V1 == id2)] cond <- conditions[i] x <- X[id1,] y <- X[id2,] x <- log(x+1) y <- log(y+1) index <- x < 0.01 | y < 0.01 x <- x[!index] y <- y[!index] if (length(x) < 3 | sd(x) < 0.1 | sd(y) < 0.1 ) { next } r <- cor(x, y) if (abs(r) >= CORR_THRESHOLD) { s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\n', id2, name2,id1,name1, r, 'all', '.', cond, '.', curr.date) #cat(s, file=result.file, sep='\n', append=T) #cat(s, sep='\n') #flush.console() #write.table(s, file.name, quote=F, sep='', row.names=F, append=T, col.names=F) next } k <- 2 N <- length(x) tryCatch( em.out <- regmixEM(y, x, maxit=100, epsilon=1e-03, k=k), error=function(e) NULL ) if (length(em.out) == 0) { # if there is an error when running regmixEM, we skip. next } pos_r_max <- -2 pos_r_N <- 0 pos_r_index <- c() pos_r_loglik <- -100000000 neg_r_max <- 2 neg_r_N <- 0 neg_r_index <- c() neg_r_loglik <- -100000000 for (j in seq(1,k,1)) { index <- which(max.col(em.out$posterior) == j) size <- length(index) r <- cor(em.out$x[index,2], em.out$y[index]) if (!is.na(r) && r >= CORR_THRESHOLD && size >= MIN_SIZE && r > pos_r_max && size > pos_r_N) { pos_r_max <- r pos_r_N <- size pos_r_index <- index pos_r_loglik <- em.out$loglik } if (!is.na(r) && r <= -CORR_THRESHOLD && size >= MIN_SIZE && r < neg_r_max && size > neg_r_N) { neg_r_max <- r neg_r_N <- size neg_r_index <- index neg_r_loglik <- em.out$loglik } } if (pos_r_max > 0) { sub.cond <- paste(rna.sample.id[pos_r_index], collapse=' ') num.sub.cond <- length(rna.sample.id[pos_r_index]) s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%d\t%s\t%4.2f\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, pos_r_max, 'mix', num.sub.cond, cond, pos_r_loglik, curr.date, pos_r_max, 'mixtool') #cat(s, file=result.file, sep='\n', append=T) #cat(s, sep='\n') #write.table(s, file.name, quote=F, sep='', row.names=F, append=T, col.names=F) cat(s, file=f, sep='') } if (neg_r_max < 0) { sub.cond <- paste(rna.sample.id[neg_r_index], collapse=' ') num.sub.cond <- length(rna.sample.id[neg_r_index]) s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%d\t%s\t%4.2f\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, neg_r_max, 'mix', num.sub.cond, cond, neg_r_loglik, curr.date, neg_r_max, 'mixtool') #cat(s, file=result.file, sep='\n', append=T) #cat(s, sep='\n') #write.table(s, file.name, quote=F, sep='', row.names=F, append=T, col.names=F) cat(s, file=f, sep='') } } close(f)