summaryrefslogtreecommitdiff
path: root/Code/create_edges_k2.R
diff options
context:
space:
mode:
authorHui Lan <lanhui@zjnu.edu.cn>2019-12-04 19:03:19 +0800
committerHui Lan <lanhui@zjnu.edu.cn>2019-12-04 19:03:19 +0800
commit97fdefab064f63642fa3ece05b807d29b459df31 (patch)
treea058530023224f3e35b1783996f3530c80c04bc5 /Code/create_edges_k2.R
brain: add python and R code to local repository.
Diffstat (limited to 'Code/create_edges_k2.R')
-rw-r--r--Code/create_edges_k2.R136
1 files changed, 136 insertions, 0 deletions
diff --git a/Code/create_edges_k2.R b/Code/create_edges_k2.R
new file mode 100644
index 0000000..a044ed0
--- /dev/null
+++ b/Code/create_edges_k2.R
@@ -0,0 +1,136 @@
+# Last modified 13 August 2019
+
+TARGET_TF_FILE <- "../Data/information/target_tf.txt"
+DATA_FILE <- "../Data/history/expr/TPM.txt" # A TPM table
+AGINAME_FILE <- "../Data/information/AGI-to-gene-names_v2.txt"
+CORR_THRESHOLD <- 0.5
+MIN_SIZE <- 100
+
+
+# Make sure we have required files
+if (! file.exists(TARGET_TF_FILE)) {
+ stop(sprintf('[create_edges2_k2.R] Unable to find %s', TARGET_TF_FILE))
+}
+
+if (! file.exists(DATA_FILE)) {
+ stop(sprintf('[create_edges_k2.R] Unable to find %s', DATA_FILE))
+}
+
+if (! file.exists(AGINAME_FILE)) {
+ stop(sprintf('[create_edges_k2.R] Unable to find %s', AGINAME_FILE))
+}
+
+
+####### Read data #########################################
+X <- read.table(DATA_FILE, header=TRUE, check.names=FALSE)
+gene_id <- X$gene_id
+X$gene_id <- NULL
+row.names(X) <- gene_id
+X <- as.matrix(X)
+rna.sample.id <- colnames(X)
+
+target_tf <- read.table(TARGET_TF_FILE, sep='\t', header=FALSE)
+target_tf <- as.matrix(target_tf)
+targets <- target_tf[,1]
+tfs <- target_tf[,2]
+conditions <- target_tf[,3]
+
+agi <- read.table(AGINAME_FILE, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
+#######################################################
+
+library(mixtools)
+options(max.print=999999999)
+output.file <- paste('../Data/history/edges/one_target/edges.txt', 'mixtools', format(Sys.time(), '%b.%d.%Y.%H%M%S'), sep='.')
+f <- file(output.file, 'w')
+
+for (i in 1:length(targets)) {
+ curr.date <- gsub('-','',Sys.Date())
+ id1 <- tfs[i]
+ id2 <- targets[i]
+ if (id1 %in% gene_id == F || id2 %in% gene_id == F) {
+ next
+ }
+
+ name1 <- agi$V2[which(agi$V1 == id1)]
+ name2 <- agi$V2[which(agi$V1 == id2)]
+
+ cond <- conditions[i]
+ x <- X[id1,]
+ y <- X[id2,]
+ x <- log(x+1)
+ y <- log(y+1)
+ index <- x < 0.01 | y < 0.01
+ x <- x[!index]
+ y <- y[!index]
+ if (length(x) < 3 | sd(x) < 0.1 | sd(y) < 0.1 ) {
+ next
+ }
+ r <- cor(x, y)
+ if (abs(r) >= CORR_THRESHOLD) {
+ s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%s\t%s\t%s\t%s\n', id2, name2,id1,name1, r, 'all', '.', cond, '.', curr.date)
+ #cat(s, file=result.file, sep='\n', append=T)
+ #cat(s, sep='\n')
+ #flush.console()
+ #write.table(s, file.name, quote=F, sep='', row.names=F, append=T, col.names=F)
+ next
+ }
+
+ k <- 2
+ N <- length(x)
+ tryCatch( em.out <- regmixEM(y, x, maxit=100, epsilon=1e-03, k=k), error=function(e) NULL )
+ if (length(em.out) == 0) { # if there is an error when running regmixEM, we skip.
+ next
+ }
+
+ pos_r_max <- -2
+ pos_r_N <- 0
+ pos_r_index <- c()
+ pos_r_loglik <- -100000000
+
+ neg_r_max <- 2
+ neg_r_N <- 0
+ neg_r_index <- c()
+ neg_r_loglik <- -100000000
+
+ for (j in seq(1,k,1)) {
+
+ index <- which(max.col(em.out$posterior) == j)
+ size <- length(index)
+ r <- cor(em.out$x[index,2], em.out$y[index])
+
+ if (!is.na(r) && r >= CORR_THRESHOLD && size >= MIN_SIZE && r > pos_r_max && size > pos_r_N) {
+ pos_r_max <- r
+ pos_r_N <- size
+ pos_r_index <- index
+ pos_r_loglik <- em.out$loglik
+ }
+ if (!is.na(r) && r <= -CORR_THRESHOLD && size >= MIN_SIZE && r < neg_r_max && size > neg_r_N) {
+ neg_r_max <- r
+ neg_r_N <- size
+ neg_r_index <- index
+ neg_r_loglik <- em.out$loglik
+ }
+ }
+
+ if (pos_r_max > 0) {
+ sub.cond <- paste(rna.sample.id[pos_r_index], collapse=' ')
+ num.sub.cond <- length(rna.sample.id[pos_r_index])
+ s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%d\t%s\t%4.2f\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, pos_r_max, 'mix', num.sub.cond, cond, pos_r_loglik, curr.date, pos_r_max, 'mixtool')
+ #cat(s, file=result.file, sep='\n', append=T)
+ #cat(s, sep='\n')
+ #write.table(s, file.name, quote=F, sep='', row.names=F, append=T, col.names=F)
+ cat(s, file=f, sep='')
+ }
+
+ if (neg_r_max < 0) {
+ sub.cond <- paste(rna.sample.id[neg_r_index], collapse=' ')
+ num.sub.cond <- length(rna.sample.id[neg_r_index])
+ s = sprintf('%s %s\t%s %s\t%4.2f\t%s\t%d\t%s\t%4.2f\t%s\t%4.2f\t%s\n', id2, name2, id1, name1, neg_r_max, 'mix', num.sub.cond, cond, neg_r_loglik, curr.date, neg_r_max, 'mixtool')
+ #cat(s, file=result.file, sep='\n', append=T)
+ #cat(s, sep='\n')
+ #write.table(s, file.name, quote=F, sep='', row.names=F, append=T, col.names=F)
+ cat(s, file=f, sep='')
+ }
+}
+
+close(f) \ No newline at end of file