Diffstat (limited to 'Code/create_edges4.py')
-rw-r--r--  Code/create_edges4.py  450
1 file changed, 450 insertions(+), 0 deletions(-)
diff --git a/Code/create_edges4.py b/Code/create_edges4.py
new file mode 100644
index 0000000..29c87ff
--- /dev/null
+++ b/Code/create_edges4.py
@@ -0,0 +1,450 @@
+# Usage: python create_edges4.py parameter_for_net.txt
+# Purpose:
+# This script generates a number of WORK20170328_1026_AGI_one_K2/3.R scripts, one for each target gene. Each target is treated separately and in parallel, to speed things up.
+# The results are saved as, e.g., edges.txt.AT2G40300.Apr.04.2017.11:45:30.k3, where AT2G40300 is the gene id and k3 means K=3 in the Mixture of Regressions.
+# The edges.txt files will be merged together later by update_network.py.
+# The memory footprint is kept small by splitting TPM.txt into small JSON files (in JSON_DIR) and converting binding.txt to target_tf.txt (in target_tf_fname) first.
+# This way only the necessary gene expression vectors are loaded each time, instead of the big matrices TPM.txt and binding.txt.
+#
+# 7 Mar 2017, slcu, hui
+# Last modified 23 Mar 2017, slcu, hui. Check edges.txt to determine whether an edge needs updating.
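+#
+# For orientation only (not produced by this file): each per-gene JSON file in
+# JSON_DIR is assumed to map RNA-seq sample ids to TPM values for one gene,
+# e.g. AT2G40300.json could look like {"SRR1042754": 3.2, "SRR1042755": 0.0, ...}
+# (sample ids hypothetical). The generated R scripts read these files with
+# rjson::fromJSON and log-transform the values.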
+
+import sys, os, operator
+from datetime import datetime
+import time
+import subprocess
+from geneid2name import make_gene_name_AGI_map_dict
+from param4net import make_global_param_dict
+
+EDGE_FILE = '../Data/history/edges/edges.txt' # recent, merged edges from various sources
+EDGE_DIR = '../Data/history/edges/one_target' # directory for storing all edge files generated by this script, one per target gene
+TIME_INTERVAL = 2 # wait 2 seconds before launching another Rscript
+MAX_PROCESS = 10 # CHANGE this to a larger number if you have many CPUs
+AVERAGE_LIKELIHOOD_TAU = -0.5 # a value between -0.1 and -998; must be negative. Lowering it spends less effort on creating brand-new edges.
+EDGE_AGE = 30 # an edge less than EDGE_AGE days old does not need updating
+
+####################################
+DATA_SYMBOL = '@'
+####################################
+
+def get_gene_list(fname):
+    result = []
+    f = open(fname)
+    for line in f:
+        line = line.strip()
+        if line == '': # skip blank lines
+            continue
+        lst = line.split()
+        result.append(lst[0])
+    f.close()
+    return result
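+# The GENE_LIST file is assumed to hold one gene per line, with the AGI id as
+# the first whitespace-separated token; anything after it is ignored here,
+# e.g. (hypothetical line) "AT2G40300  some-annotation".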
+
+def get_ordered_gene_list(fname):
+    ''' order genes so that targets with fewer existing edges come first '''
+    gene_list = get_gene_list(fname)
+    d = {}
+    if not os.path.exists(EDGE_FILE):
+        return gene_list
+    f = open(EDGE_FILE)
+    lines = f.readlines()
+    f.close()
+    for line in lines:
+        line = line.strip()
+        if line == '':
+            continue
+        lst = line.split('\t')
+        target = lst[0].split()[0]
+        if not target in d:
+            d[target] = 1
+        else:
+            d[target] += 1
+
+    result_gene_lst = [g for g in gene_list if not g in d] # targets with no edges yet get the highest priority
+    for t in sorted(d.items(), key=operator.itemgetter(1)): # targets with fewer edges will be near the top
+        g = t[0]
+        if g in gene_list:
+            result_gene_lst.append(g)
+    return result_gene_lst
+
+
+def make_tf_dict(fname):
+    d = {}
+    f = open(fname)
+    for line in f:
+        line = line.strip()
+        lst = line.split('\t')
+        target = lst[0]
+        tf = lst[1]
+        cond = lst[2]
+        if not target in d:
+            d[target] = {tf:cond}
+        else:
+            d[target][tf] = cond
+    f.close()
+    return d
+
+def make_target_dict(fname):
+    d = {}
+    f = open(fname)
+    for line in f:
+        line = line.strip()
+        lst = line.split('\t')
+        target = lst[0]
+        tf = lst[1]
+        cond = lst[2]
+        if not tf in d:
+            d[tf] = {target:cond}
+        else:
+            d[tf][target] = cond
+    f.close()
+    return d
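+# Both functions above parse target_tf.txt, which is tab-separated with three
+# fields per line: target id, TF id, and a space-separated string of binding
+# conditions, e.g. (hypothetical values) "AT2G40300\tAT1G01060\tC0001 C0002".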
+
+def not_bad_line(s):
+    ''' keep only lines that look like real edges (starting with an AGI id);
+        drop blanks and R warning/error text that may leak into edge files '''
+    if s.strip() == '':
+        return False
+    for w in ('WARNING', 'number', 'Need', 'Error', 'Too'):
+        if w in s:
+            return False
+    return s.startswith('AT')
+
+def make_edge_dict(fname):
+    d = {}
+    if not os.path.exists(fname):
+        return d
+    f = open(fname)
+    for line in f:
+        line = line.strip()
+        if not_bad_line(line):
+            lst = line.split('\t')
+            target_id = lst[0].split()[0]
+            tf_id = lst[1].split()[0]
+            date = '20161201' # default date for edges that lack one
+            num_rcond = max(1, len(lst[4].split())) # number of RNA-seq conditions; guard against division by zero
+            avg_loglik = -999 # very low likelihood by default
+            loglik = lst[6]
+            if loglik != '.':
+                if '=' in loglik:
+                    avg_loglik = float(loglik.split('=')[1])/num_rcond
+                else:
+                    avg_loglik = float(loglik)/num_rcond
+            if len(lst) == 8:
+                date = lst[7]
+            if not tf_id in d:
+                d[tf_id] = {target_id:{'date':date, 'loglikelihood':avg_loglik}}
+            else:
+                d[tf_id][target_id] = {'date':date, 'loglikelihood':avg_loglik}
+    f.close()
+    return d
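+# An edges.txt line, as written by the generated R scripts below, is
+# tab-separated: "target_id name", "tf_id name", correlation, method, the
+# supporting RNA-seq sample ids, binding conditions, log-likelihood (or '.'),
+# and date, e.g. (gene names and values hypothetical):
+# "AT2G40300 GENEA\tAT1G01060 GENEB\t0.85\tmix\tSRR1042754 SRR1042755\tC0001\t-123.45\t20170404"
+# Some older files write the log-likelihood as 'loglik=-123.45', which the
+# '=' branch above handles.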
+
+def make_r_script(fname, target, tf_dict, abs_jsonTPM_dir, num_component, edge_dict):
+    head = 'k.lst <- c(%d)\n' % (num_component)
+    head += 'target <- \'%s\'\n' % (target)
+    head += 'id2 <- target\n'
+    tfs = ''
+    conds = ''
+    recent_edge = ''
+    curr_time = datetime.now()
+    for k in tf_dict.keys(): # k is tf
+        tfs += '\'%s\',' % (k)
+        conds += '\'%s\',' % (tf_dict[k])
+        recent_or_good = False
+        if k in edge_dict and target in edge_dict[k]:
+            e = edge_dict[k][target]
+            age_days = (curr_time - datetime.strptime(e['date'], '%Y%m%d')).days # compute the age in days; subtracting YYYYMMDD integers would be wrong across month boundaries
+            recent_or_good = age_days <= EDGE_AGE or e['loglikelihood'] >= AVERAGE_LIKELIHOOD_TAU # recent, or already good
+        recent_edge += '1,' if recent_or_good else '0,'
+
+    head += 'tfs <- c(' + tfs.rstrip(',') + ')\n'
+    head += 'conditions <- c(' + conds.rstrip(',') + ')\n'
+    head += 'recent.edge <- c(' + recent_edge.rstrip(',') + ')\n'
+    head += 'jsonTPM.dir <- \'%s\'\n' % (abs_jsonTPM_dir)
+    head += 'AGINAME_FILE <- \'%s\'\n' % (os.path.abspath(GENE_ID_TO_GENE_NAME))
+    body = '''
+    post.translation <- function(x, y) {
+        mean.x <- mean(x)
+        sd.x <- sd(x)
+        index <- x > mean.x - sd.x & x < mean.x + sd.x
+        sd.y <- sd(y[index])
+        result <- list(value=ifelse(mean.x < 2.0, 0.0, (mean.x/max(x)) * sd.y * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
+    }
+
+    post.translation.2 <- function(x, y) {
+        # x is constitutively high while y varies a lot
+        mean.x <- mean(x)
+        sd.x <- max(sd(x), 1) # a number of at least 1
+        index <- x > mean.x - sd.x & x < mean.x + sd.x # points within the window +/- sd.x
+        sd.y <- quantile(y[index],0.85)-quantile(y[index],0.15) # dispersion of y within the window
+        sd.y.2 <- quantile(y,0.85)-quantile(y,0.15) # dispersion of all y
+        v.disp <- sd.y/max(1, sd.y.2) # how dispersed y is within the window, a number between 0 and 1
+        # value measures the dispersion of y and the percentage of points within the window
+        result <- list(value=ifelse(mean.x < 2.0, 0.0, v.disp * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
+    }
+
+    post.translation.3 <- function(x, y) {
+        # x is constitutively high while y varies a lot
+        mean.x <- mean(x)
+        upper.percentile <- 0.85 # used for computing vertical dispersion
+        lowest.n <- 3 # number of points with the lowest x values
+        min.mean.x <- max(2.0, quantile(x, 0.25)) # mean of x must be greater than this value
+        sd.x <- min(sd(x), 1) # a number between 0 and 1
+        index <- x > mean.x - sd.x & x < mean.x + sd.x # points within the window +/- sd.x
+        sd.y <- quantile(y[index],upper.percentile)-quantile(y[index],1-upper.percentile) # dispersion of y within the window
+        sd.y.2 <- quantile(y,upper.percentile)-quantile(y,1-upper.percentile) # dispersion of all y
+        v.disp <- sd.y/max(1, sd.y.2) # how dispersed y is within the window, a number between 0 and 1
+
+        rst <- sort(x, index.return=T)
+        top.n <- sum(rst$x < 1)
+        top.n <- max(1, min(top.n, lowest.n))
+        small.y <- min(mean(y[rst$ix[1:top.n]]), mean(y[x<1])) # use the smaller value
+        small.y <- ifelse(is.nan(small.y)==T, 999, small.y)
+        # value measures the dispersion of y and the percentage of points within the window
+        result <- list(valid=small.y, value=ifelse(mean.x < min.mean.x, 0.0, v.disp * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
+    }
+
+    in.component <- function(posterior, k) {
+        # posterior is an N x k matrix; each row is a data point, and each column is the probability of belonging to a component
+        p = posterior[,k]
+        n = length(p)
+        index <- rep(F,n)
+        for (i in 1:n) {
+            if (p[i] > runif(1)) {
+                index[i] = T
+            }
+        }
+        result <- index
+    }
+
+    ####### Read data #########################################
+    CORR_THRESHOLD <- 0.7
+    agi <- read.table(AGINAME_FILE, sep='\\t', header=FALSE, row.names=1, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
+    #######################################################
+    library(mixtools)
+    library(rjson)
+    name2 <- agi[id2,1]
+    result <- ''
+    for (i in 1:length(tfs)) {
+        if (recent.edge[i] == 1) {
+            next
+        }
+        curr.date <- gsub('-','',Sys.Date())
+        id1 <- tfs[i]
+        name1 <- agi[id1,1]
+        cond <- conditions[i]
+
+        file.x <- paste(jsonTPM.dir, paste(id1, '.json', sep=''), sep='/')
+        if (!file.exists(file.x)) { next }
+        x <- as.data.frame(fromJSON(file = file.x))
+        x <- log(x+1)
+        rcond.x <- names(x)
+        x <- as.vector(t(x)) # convert it to a vector
+
+        file.y <- paste(jsonTPM.dir, paste(id2, '.json', sep=''), sep='/')
+        if (!file.exists(file.y)) { break } # the target's expression file is missing, so stop; no TF can be tested
+        y <- as.data.frame(fromJSON(file = file.y))
+        y <- log(y+1)
+        rcond.y <- names(y)
+        y <- as.vector(t(y)) # convert it to a vector
+
+        rna.sample.id <- rcond.x
+        if (all(rcond.x == rcond.y) == FALSE | id1 == id2) { # ignore if the sample ids in the two json files do not match, or if the target is the same as the tf
+            next
+        }
+
+        MIN_SIZE <- min(100, max(10, ceiling(0.5 * length(x))))
+
+        index <- x < 0.01 | y < 0.01 # exclude data points that are too small
+        x <- x[!index]
+        y <- y[!index]
+
+        if (length(x) < MIN_SIZE) {
+            next
+        }
+        r <- cor(x, y)
+        if (abs(r) >= CORR_THRESHOLD) {
+            #s = sprintf('%s %s\\t%s %s\\t%4.2f\\t%s\\t%s\\t%s\\t%s\\t%s\\n', id2, name2, id1, name1, r, 'all', '.', cond, '.', curr.date)
+            #result <- paste(result, s, sep='')
+            next # a good correlation is found using all experiments, so it is not necessary to look further
+        }
+
+        rna.sample.id <- rna.sample.id[!index] # this step is important to make the following indexing work
+
+        pos_r_max <- -2
+        pos_r_N <- 0
+        pos_r_index <- c()
+        pos_r_loglik <- -100000000
+
+        neg_r_max <- 2
+        neg_r_N <- 0
+        neg_r_index <- c()
+        neg_r_loglik <- -100000000
+
+        for (k in k.lst) {
+            em.out <- regmixEM(y, x, maxit=150, epsilon=1e-04, k=k)
+            for (j in seq(1,k,1)) {
+                index <- in.component(em.out$posterior, j)
+                size <- sum(index)
+                r <- cor(em.out$x[index,2], em.out$y[index])
+                if (!is.na(r) && r >= CORR_THRESHOLD && size >= MIN_SIZE && r > pos_r_max && size > pos_r_N) {
+                    pos_r_max <- r
+                    pos_r_N <- size
+                    pos_r_index <- index
+                    pos_r_loglik <- em.out$loglik
+                }
+                if (!is.na(r) && r <= -CORR_THRESHOLD && size >= MIN_SIZE && r < neg_r_max && size > neg_r_N) {
+                    neg_r_max <- r
+                    neg_r_N <- size
+                    neg_r_index <- index
+                    neg_r_loglik <- em.out$loglik
+                }
+            }
+        }
+        hit <- 0
+        if (pos_r_max > 0) { # has a good positive correlation
+            sub.cond <- paste(rna.sample.id[pos_r_index], collapse=' ')
+            s = sprintf('%s %s\\t%s %s\\t%4.2f\\t%s\\t%s\\t%s\\t%4.2f\\t%s\\n', id2, name2, id1, name1, pos_r_max, 'mix', sub.cond, cond, pos_r_loglik, curr.date)
+            result <- paste(result, s, sep='')
+            hit <- hit + 1
+        }
+        if (neg_r_max < 0) { # has a good negative correlation
+            sub.cond <- paste(rna.sample.id[neg_r_index], collapse=' ')
+            s = sprintf('%s %s\\t%s %s\\t%4.2f\\t%s\\t%s\\t%s\\t%4.2f\\t%s\\n', id2, name2, id1, name1, neg_r_max, 'mix', sub.cond, cond, neg_r_loglik, curr.date)
+            result <- paste(result, s, sep='')
+            hit <- hit + 1
+        }
+        if (hit == 0) {
+            t <- post.translation.3(x, y)
+            post.r <- t$percent
+            if (t$valid < quantile(y,0.25) & t$value > 0.69 & post.r >= 0.70 & length(t$index) > MIN_SIZE) {
+                sub.cond <- paste(rna.sample.id[t$index], collapse=' ')
+                s = sprintf('%s %s\\t%s %s\\t%4.2f\\t%s\\t%s\\t%s\\t%s\\t%s\\n', id2, name2, id1, name1, post.r, 'mix', sub.cond, cond, '.', curr.date)
+                result <- paste(result, s, sep='')
+            }
+        }
+    }
+    '''
+    tail = '\n'
+    tail += 'output.file <- paste(\'%s/edges.txt\', id2, format(Sys.time(), \'%%b.%%d.%%Y.%%X\'), \'k%d\', sep=\'.\')\n' % (EDGE_DIR, num_component)
+    tail += 'if (result != \'\') cat(result, file=output.file, sep=\'\')\n'
+    f = open(fname, 'w')
+    content = head + body + tail
+    f.write('\n'.join([line.rstrip() for line in content.split('\n')])) # R does not care about leading whitespace, so only trailing whitespace is stripped
+    f.close()
+    return fname
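+# For illustration, the head of a generated R script might read as follows
+# (gene ids, conditions, and paths hypothetical):
+#   k.lst <- c(2)
+#   target <- 'AT2G40300'
+#   id2 <- target
+#   tfs <- c('AT1G01060')
+#   conditions <- c('C0001 C0002')
+#   recent.edge <- c(0)
+#   jsonTPM.dir <- '/abs/path/to/jsonTPM_20170424_154323'
+#   AGINAME_FILE <- '/abs/path/to/gene_names.txt'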
+
+def number_of_running_process(lst):
+    ''' get the number of running processes (those with CPU usage greater than 0) '''
+    count = 0
+    for x in lst:
+        x = x.strip()
+        if x != '':
+            count += 1 if float(x.split()[2]) > 0.0 else 0 # column 3 of ps aux is %CPU; compare numerically, not as strings
+    return count
+
+def wait_a_moment(n, prefix):
+    ''' if more than n R scripts whose names start with prefix are running, wait '''
+    time.sleep(TIME_INTERVAL)
+    ps = subprocess.Popen('ps aux | grep %s' % (prefix), shell=True, stdout=subprocess.PIPE) # CHANGE
+    num_proc = number_of_running_process(ps.communicate()[0].split('\n'))
+    while (num_proc > n):
+        time.sleep(TIME_INTERVAL)
+        ps = subprocess.Popen('ps aux | grep %s' % (prefix), shell=True, stdout=subprocess.PIPE)
+        process_lst = ps.communicate()[0].split('\n')
+        num_proc = number_of_running_process(process_lst)
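+# For reference, a ps aux line has the columns USER PID %CPU %MEM ... COMMAND,
+# so split()[2] above is the %CPU field. A matching line might look like
+# (values and paths hypothetical):
+#   hui  12345  97.0  1.2 ... Rscript ../Data/temp/Work201703281026_AT2G40300_K2.R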
+
+def establish_edges(jsonTPM_dir, d, d2, glb_param_dict, rprefix, edge_dict):
+    '''
+    jsonTPM_dir -- contains gene expression json files, one for each gene
+    d  -- binding dictionary {target:{tf1:c1, tf2:c2}, ...}, where c1 and c2 are strings of conditions
+    d2 -- binding dictionary {tf:{target1:c1, target2:c2}, ...}, where c1 and c2 are strings of conditions
+    '''
+
+    gene_lst = get_ordered_gene_list(glb_param_dict['GENE_LIST']) # targets with fewer edges get higher priority; for example, targets with no edges yet are treated first
+    high_gene_lst = glb_param_dict['HIGH_PRIORITY_GENE'].split() # high-priority genes CHANGE
+
+    if not os.path.isdir(EDGE_DIR):
+        os.makedirs(EDGE_DIR)
+
+    # make a list of targets, putting high-priority targets at the beginning
+    final_gene_lst = high_gene_lst[:] # copy, so appending below does not mutate high_gene_lst
+    for x in gene_lst:
+        if not x in high_gene_lst:
+            final_gene_lst.append(x)
+
+    # process each target: first consider all TFs of the target (if any), then consider the target's own targets (if any)
+    for target in final_gene_lst: # high-priority genes are processed first
+        if target in d: # target is in binding dictionary d
+            tf_dict = d[target] # a dictionary of upstream genes, in the form of {tf1:c1, tf2:c2}
+            if target in d2:
+                target_dict = d2[target] # a dictionary of downstream genes, in the form of {target1:c1, target2:c2}
+            else:
+                target_dict = {}
+
+            if len(tf_dict) > 0: # the target has TFs (upstream genes)
+                for k in (2, 3): # build one script for K=2 and one for K=3
+                    r_file = '../Data/temp/%s_%s_K%d.R' % (rprefix, target, k)
+                    make_r_script(r_file, target, tf_dict, jsonTPM_dir, k, edge_dict)
+                    cmd = 'Rscript %s &' % (r_file) # run the Rscript in the background
+                    os.system(cmd)
+                wait_a_moment(MAX_PROCESS, rprefix) # if too many R processes are running at the same time, wait; MAX_PROCESS sets the limit
+
+            if len(target_dict) > 0: # the target has targets
+                count = 0
+                for successor in target_dict: # successor is the target's target
+                    tf_dict2 = {target:target_dict[successor]} # now target becomes the TF, and its successor becomes the target
+                    for k in (2, 3): # 'one' in the file name means one edge is considered each time
+                        r_file = '../Data/temp/%s_%s_one_K%d.R' % (rprefix, successor, k)
+                        make_r_script(r_file, successor, tf_dict2, jsonTPM_dir, k, edge_dict)
+                        cmd = 'Rscript %s &' % (r_file) # run the Rscript in the background
+                        os.system(cmd)
+                    count = count + 1
+                    if count % MAX_PROCESS == 0:
+                        wait_a_moment(MAX_PROCESS, rprefix) # if too many R processes are running at the same time, wait; MAX_PROCESS sets the limit
+
+
+## main
+param_file = sys.argv[1] # a single parameter file for building the network, parameter_for_net.txt, in Data/parameter/
+glb_param_dict = make_global_param_dict(param_file)
+GENE_ID_TO_GENE_NAME = glb_param_dict['GENE_ID_AND_GENE_NAME']
+agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME) # for gene names
+curr_time = datetime.now().strftime('%Y%m%d_%H%M%S')
+
+#print('Make target tf using binding.txt')
+#if os.path.exists('../Data/information/target_tf.txt'):
+#    cmd = 'cp ../Data/information/target_tf.txt ../Data/information/target_tf.txt.%s' % (curr_time)
+#    os.system(cmd)
+
+target_tf_fname = '../Data/information/target_tf.txt.%s' % (curr_time)
+cmd = 'python make_target_tf.py %s > %s' % (param_file, target_tf_fname) # make target_tf.txt; CHANGE: better to make a temporary copy for this program
+os.system(cmd)
+
+#print('Make jsonTPM ...') # CHANGE
+cmd = 'python TPM2JSON.py %s' % (param_file) # make the jsonTPM directory; the TPM values are not log-transformed
+os.system(cmd)
+JSON_DIR = '../Data/history/expr/jsonTPM_%s' % (curr_time) # for each TPM.txt, there should be a unique jsonTPM directory
+cmd = 'mv ../Data/history/expr/jsonTPM %s' % (JSON_DIR)
+os.system(cmd)
+
+#JSON_DIR = '../Data/history/expr/jsonTPM_20170424_154323'
+#target_tf_fname = '../Data/information/target_tf.txt.20170424_154323'
+#print('Establish edges')
+big_tf_dict = make_tf_dict(target_tf_fname) # key is target
+big_target_dict = make_target_dict(target_tf_fname) # key is tf
+rscript_prefix = 'Work' + datetime.now().strftime('%Y%m%d%H%M') # each R script's name starts with Work followed by a timestamp
+edge_dict = make_edge_dict(EDGE_FILE)
+establish_edges(os.path.abspath(JSON_DIR), big_tf_dict, big_target_dict, glb_param_dict, rscript_prefix, edge_dict)
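+# Illustrative parameter_for_net.txt entries consumed above (the key names are
+# the ones looked up in glb_param_dict; paths and values are hypothetical, and
+# the real format is whatever param4net.make_global_param_dict expects):
+#   GENE_LIST              ../Data/information/gene_list.txt
+#   HIGH_PRIORITY_GENE     AT2G40300 AT3G12580
+#   GENE_ID_AND_GENE_NAME  ../Data/information/gene_id_to_name.txt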