summaryrefslogtreecommitdiff
path: root/Code/create_edges4_k2.py
diff options
context:
space:
mode:
authorHui Lan <lanhui@zjnu.edu.cn>2019-12-04 19:03:19 +0800
committerHui Lan <lanhui@zjnu.edu.cn>2019-12-04 19:03:19 +0800
commit97fdefab064f63642fa3ece05b807d29b459df31 (patch)
treea058530023224f3e35b1783996f3530c80c04bc5 /Code/create_edges4_k2.py
brain: add python and R code to local repository.
Diffstat (limited to 'Code/create_edges4_k2.py')
-rw-r--r--Code/create_edges4_k2.py253
1 files changed, 253 insertions, 0 deletions
diff --git a/Code/create_edges4_k2.py b/Code/create_edges4_k2.py
new file mode 100644
index 0000000..cffd160
--- /dev/null
+++ b/Code/create_edges4_k2.py
@@ -0,0 +1,253 @@
+# Usage: python create_edges4.py parameter_for_net.txt
+# Purpose:
+# This script will generate MAX_PROCESS work_on_AGI#.R scripts, each for a target gene. So treat each target separately.
+# The results will be saved as edges.txt.AT3G12580.20170308, where AT3G12580 is target gene id, and 20170308 is date.
+# The edges.txt files will be merged together later.
+# Hopefully it will be faster.
+# Make it faster by handling each target separately.
+# Make memory footprint smaller by splitting TPM.txt into small json files, and converting binding.txt to target_tf.txt first.
+# So we don't need to load the big matrices, TPM.txt and binding.txt.
+#
+# 7 Mar 2017, slcu, hui
+
+import sys, os, operator, itertools
+from datetime import datetime
+import time
+import json
+import subprocess
+from geneid2name import make_gene_name_AGI_map_dict
+from param4net import make_global_param_dict
+
+EDGE_FILE = '../Data/history/edges/edges.txt'
+EDGE_DIR = '../Data/history/edges/one_target' # a directory storing all edge files, one for each target gene
+GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt' # for gene names
+TIME_INTERVAL = 10 # wait this long in seconds between before launching a R Rscript
+MAX_PROCESS = 5 # CHANGE
+K = 2 # CHANGE number of components to use
+
+####################################
+DATA_SYMBOL = '@'
+####################################
+
+def get_gene_list(fname):
+ result = []
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split()
+ result.append(lst[0])
+ f.close()
+ return result
+
+def get_ordered_gene_list(fname):
+ gene_list = get_gene_list(fname)
+ d = {}
+ f = open(EDGE_FILE)
+ lines = f.readlines()
+ f.close()
+ for line in lines:
+ line = line.strip()
+ lst = line.split('\t')
+ target = lst[0].split()[0]
+ tf = lst[1].split()[0]
+ if not target in d:
+ d[target] = 1
+ else:
+ d[target] += 1
+
+ result_gene_lst = []
+ for t in sorted(d.items(), key=operator.itemgetter(1)): # targets with fewer edges will be on the top
+ g = t[0]
+ if g in gene_list:
+ result_gene_lst.append(g)
+ return result_gene_lst
+
+
+def make_tf_dict(fname):
+ d = {}
+ f = open(fname)
+ for line in f:
+ line = line.strip()
+ lst = line.split('\t')
+ target = lst[0]
+ tf = lst[1]
+ cond = lst[2]
+ if not target in d:
+ d[target] = {tf:cond}
+ else:
+ d[target][tf] = cond
+ f.close()
+ return d
+
+def make_r_script(fname, target, tf_dict, abs_jsonTPM_dir, num_component):
+ head = 'k.lst <- c(%d)\n' % (num_component)
+ head += 'target <- \'%s\'\n' % (target)
+ head += 'id2 <- target\n'
+ tfs = ''
+ conds = ''
+ for k in tf_dict.keys():
+ tfs += '\'%s\',' % (k)
+ conds += '\'%s\',' % (tf_dict[k])
+ head += 'tfs <- c(' + tfs.rstrip(',') + ')\n'
+ head += 'conditions <- c(' + conds.rstrip(',') + ')\n'
+ head += 'jsonTPM.dir <- \'%s\'\n' % (abs_jsonTPM_dir)
+ head += 'AGINAME_FILE <- \'%s\'\n' % (os.path.abspath(GENE_ID_TO_GENE_NAME))
+ head += 'output.file <- paste(\'%s/edges.txt\', id2, gsub(\'-\',\'\',Sys.Date()), \'k%d\', sep=\'.\')\n' % (EDGE_DIR, num_component)
+ body = '''
+ ####### Read data #########################################
+ CORR_THRESHOLD <- 0.7
+ MIN_SIZE <- 100
+ agi <- read.table(AGINAME_FILE, sep='\\t', header=FALSE, row.names=1, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
+ #######################################################
+ library(mixtools)
+ library(rjson)
+ name2 <- agi[id2,1]
+ result <- ''
+ for (i in 1:length(tfs)) {
+ curr.date <- gsub('-','',Sys.Date())
+ id1 <- tfs[i]
+ name1 <- agi[id1,1]
+ cond <- conditions[i]
+
+ file.x <- paste(jsonTPM.dir, paste(id1, '.json', sep=''), sep='/')
+ if (!file.exists(file.x)) { next }
+ x <- as.data.frame(fromJSON(file = file.x))
+ x <- log(x+1)
+ rcond.x <- names(x)
+ x <- as.vector(t(x)) # convert it to a vector
+
+ file.y <- paste(jsonTPM.dir, paste(id2, '.json', sep=''), sep='/')
+ if (!file.exists(file.y)) { break }
+ y <- as.data.frame(fromJSON(file = file.y))
+ y <- log(y+1)
+ rcond.y <- names(y)
+ y <- as.vector(t(y)) # convert it to a vector
+
+ rna.sample.id <- rcond.x
+ if (all(rcond.x == rcond.y) == FALSE | id1 == id2) { # if the IDs in two json files do not match, or target is the same as tf, then ignore
+ next
+ }
+
+ index <- x < 0.01 | y < 0.01 # don't include data that is too small
+ x <- x[!index]
+ y <- y[!index]
+ r <- cor(x, y)
+ if (abs(r) >= CORR_THRESHOLD) {
+ s = sprintf('%s %s\\t%s %s\\t%4.2f\\t%s\\t%s\\t%s\\t%s\\t%s\\n', id2, name2, id1, name1, r, 'all', '.', cond, '.', curr.date)
+ result <- paste(result, s, sep='')
+ next # a good correlation is found using all experiments, so not necessary to look further
+ }
+
+ rna.sample.id <- rna.sample.id[!index] # important to make the index work
+
+ pos_r_max <- -2
+ pos_r_N <- 0
+ pos_r_index <- c()
+ pos_r_loglik <- -100000000
+
+ neg_r_max <- 2
+ neg_r_N <- 0
+ neg_r_index <- c()
+ neg_r_loglik <- -100000000
+
+ for (k in k.lst) {
+ em.out <- regmixEM(y, x, maxit=150, epsilon=1e-04, k=k)
+ for (j in seq(1,k,1)) {
+ index <- which(max.col(em.out$posterior) == j)
+ size <- length(index)
+ r <- cor(em.out$x[index,2], em.out$y[index])
+ if (!is.na(r) && r >= CORR_THRESHOLD && size >= MIN_SIZE && r > pos_r_max && size > pos_r_N) {
+ pos_r_max <- r
+ pos_r_N <- size
+ pos_r_index <- index
+ pos_r_loglik <- em.out$loglik
+ }
+ if (!is.na(r) && r <= -CORR_THRESHOLD && size >= MIN_SIZE && r < neg_r_max && size > neg_r_N) {
+ neg_r_max <- r
+ neg_r_N <- size
+ neg_r_index <- index
+ neg_r_loglik <- em.out$loglik
+ }
+ }
+ }
+
+ if (pos_r_max > 0) { # has a good positive correlation
+ sub.cond <- paste(rna.sample.id[pos_r_index], collapse=' ')
+ s = sprintf('%s %s\\t%s %s\\t%4.2f\\t%s\\t%s\\t%s\\t%4.2f\\t%s\\n', id2, name2, id1, name1, pos_r_max, 'mix', sub.cond, cond, pos_r_loglik, curr.date)
+ result <- paste(result, s, sep='')
+ }
+ if (neg_r_max < 0) { # has a good negative correlation
+ sub.cond <- paste(rna.sample.id[neg_r_index], collapse=' ')
+ s = sprintf('%s %s\\t%s %s\\t%4.2f\\t%s\\t%s\\t%s\\t%4.2f\\t%s\\n', id2, name2, id1, name1, neg_r_max, 'mix', sub.cond, cond, neg_r_loglik, curr.date)
+ result <- paste(result, s, sep='')
+ }
+ }
+ cat(result, file=output.file, sep='')
+ '''
+ f = open(fname, 'w')
+ content = head + body
+ f.write('\n'.join([line.lstrip('\t').rstrip() for line in content.split('\n')]))
+ f.close()
+ return fname
+
+def wait_a_moment(n, prefix):
+ ''' if there are more than n work_on...R scripts running, wait... '''
+ time.sleep(TIME_INTERVAL)
+ ps = subprocess.Popen('ps aux | grep %s' % (prefix), shell=True, stdout=subprocess.PIPE) # CHANGE
+ process_lst = ps.communicate()[0].split('\n')
+ while (len(process_lst) > n):
+ #print('number of running processes %d' % (len(process_lst)))
+ time.sleep(TIME_INTERVAL)
+ ps = subprocess.Popen('ps aux | grep %s' % (prefix), shell=True, stdout=subprocess.PIPE)
+ process_lst = ps.communicate()[0].split('\n')
+
+def establish_edges(jsonTPM_dir, d, glb_param_dict, rprefix):
+ ''' d - binding dictionary {target:{tf1:c1, tf2:c2}, ... }, c1 c2 are strings of conditions '''
+
+ gene_lst = get_ordered_gene_list(glb_param_dict['GENE_LIST']) # targets with fewer edges will get higher priority. For example, those targets never having an edge will be treated first
+ high_gene_lst = glb_param_dict['HIGH_PRIORITY_GENE'].split() # high priority genes CHANGE
+
+ if not os.path.isdir(EDGE_DIR):
+ os.makedirs(EDGE_DIR)
+
+ # make a list of targets, putting high-priority target in the beginning
+ final_gene_lst = high_gene_lst
+ for x in gene_lst:
+ if not x in high_gene_lst:
+ final_gene_lst.append(x)
+
+ # process each target
+ for target in final_gene_lst: # high priority genes are processed first
+ if target in d: # target g is in binding dictionary d
+ tf_dict = d[target] # in the form of {tf1:c1, tf2:c2}
+ if len(tf_dict) > 0: # it has TFs, usually it is the case
+ r_file = '../Data/temp/%s_%s_K%d.R' % (rprefix, target, K)
+ fname = make_r_script(r_file, target, tf_dict, jsonTPM_dir, K)
+ cmd = 'Rscript %s &' % (r_file) # run the Rscript in background
+ os.system(cmd) # UNCOMMENT ME
+ wait_a_moment(MAX_PROCESS, rprefix) # make sure there are not too many R process running in the same time. If too many, wait. MAX_PROCESS sets the limit.
+
+## main
+param_file = sys.argv[1] # a single prameter file for building network, parameter_for_net.txt
+glb_param_dict = make_global_param_dict(param_file)
+agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME) # for gene names
+
+#print('Make jsonTPM ...') # CHANGE
+cmd = 'python TPM2JSON.py %s' % (param_file) # make jsonTPM directory. The TPM values are not log-transformed.
+os.system(cmd)
+curr_time = datetime.now().strftime('%Y%m%d_%H%M%S')
+JSON_DIR = '../Data/history/expr/jsonTPM_%s' % (curr_time) # for each TPM.txt, there should be a unique jsonTPM directory.
+cmd = 'mv ../Data/history/expr/jsonTPM %s' % (JSON_DIR)
+os.system(cmd)
+
+#print('Make target tf using binding.txt')
+target_tf_fname = '../Data/information/target_tf.txt.' + curr_time
+cmd = 'python make_target_tf.py %s > %s' % (param_file, target_tf_fname) # make target_tf.txt CHANGE better to make a temperory copy for this program
+os.system(cmd)
+
+#JSON_DIR = '../Data/history/expr/jsonTPM_20170310_1153'
+#target_tf_fname = '../Data/information/target_tf.txt.20170310_1153'
+#print('Establish edges')
+big_tf_dict = make_tf_dict(target_tf_fname)
+rscript_prefix = 'WORK%s' % (datetime.now().strftime('%Y%m%d_%H%M'))
+establish_edges(os.path.abspath(JSON_DIR), big_tf_dict, glb_param_dict, rscript_prefix)