Diffstat (limited to 'Code/create_edges4.py')
-rw-r--r--  Code/create_edges4.py  450
1 file changed, 450 insertions(+), 0 deletions(-)
diff --git a/Code/create_edges4.py b/Code/create_edges4.py
new file mode 100644
index 0000000..29c87ff
--- /dev/null
+++ b/Code/create_edges4.py
@@ -0,0 +1,450 @@
+# Usage: python create_edges4.py parameter_for_net.txt
+# Purpose:
+# This script generates a number of WORK20170328_1026_AGI_one_K2/3.R scripts, one for each target gene. Each target is treated separately and in parallel, to speed things up.
+# The results are saved as, e.g., edges.txt.AT2G40300.Apr.04.2017.11:45:30.k3, where AT2G40300 is the gene id and k3 means K=3 in the Mixture of Regressions.
+# The edges.txt files will be merged together later by update_network.py.
+# The memory footprint is kept small by splitting TPM.txt into small JSON files (in JSON_DIR) and converting binding.txt to target_tf.txt (in target_tf_fname) first.
+# This way only the necessary gene expression vectors are loaded each time, instead of the big matrices TPM.txt and binding.txt.
+#
+# 7 Mar 2017, slcu, hui
+# Last modified 23 Mar 2017, slcu, hui. Check edges.txt to determine whether an edge needs updating.
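+#
+# For orientation only (not produced by this file): each per-gene JSON file in
+# JSON_DIR is assumed to map RNA-seq sample ids to TPM values for one gene,
+# e.g. AT2G40300.json could look like {"SRR1042754": 3.2, "SRR1042755": 0.0, ...}
+# (sample ids hypothetical). The generated R scripts read these files with
+# rjson::fromJSON and log-transform the values.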
+
+import sys, os, operator
+from datetime import datetime
+import time
+import subprocess
+from geneid2name import make_gene_name_AGI_map_dict
+from param4net import make_global_param_dict
+
+EDGE_FILE = '../Data/history/edges/edges.txt' # recent, merged edges from various sources
+EDGE_DIR = '../Data/history/edges/one_target' # directory for storing all edge files generated by this script, one per target gene
+TIME_INTERVAL = 2 # wait 2 seconds before launching another Rscript
+MAX_PROCESS = 10 # CHANGE this to a larger number if you have many CPUs
+AVERAGE_LIKELIHOOD_TAU = -0.5 # a value between -0.1 and -998; must be negative. Lowering it spends less effort on creating brand-new edges.
+EDGE_AGE = 30 # an edge less than EDGE_AGE days old does not need updating
+
+####################################
+DATA_SYMBOL = '@'
+####################################
+
+def get_gene_list(fname):
+    result = []
+    f = open(fname)
+    for line in f:
+        line = line.strip()
+        if line == '': # skip blank lines
+            continue
+        lst = line.split()
+        result.append(lst[0])
+    f.close()
+    return result
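+# The GENE_LIST file is assumed to hold one gene per line, with the AGI id as
+# the first whitespace-separated token; anything after it is ignored here,
+# e.g. (hypothetical line) "AT2G40300  some-annotation".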
+
+def get_ordered_gene_list(fname):
+    ''' order genes so that targets with fewer existing edges come first '''
+    gene_list = get_gene_list(fname)
+    d = {}
+    if not os.path.exists(EDGE_FILE):
+        return gene_list
+    f = open(EDGE_FILE)
+    lines = f.readlines()
+    f.close()
+    for line in lines:
+        line = line.strip()
+        if line == '':
+            continue
+        lst = line.split('\t')
+        target = lst[0].split()[0]
+        if not target in d:
+            d[target] = 1
+        else:
+            d[target] += 1
+
+    result_gene_lst = [g for g in gene_list if not g in d] # targets with no edges yet get the highest priority
+    for t in sorted(d.items(), key=operator.itemgetter(1)): # targets with fewer edges will be near the top
+        g = t[0]
+        if g in gene_list:
+            result_gene_lst.append(g)
+    return result_gene_lst
+
+
+def make_tf_dict(fname):
+    d = {}
+    f = open(fname)
+    for line in f:
+        line = line.strip()
+        lst = line.split('\t')
+        target = lst[0]
+        tf = lst[1]
+        cond = lst[2]
+        if not target in d:
+            d[target] = {tf:cond}
+        else:
+            d[target][tf] = cond
+    f.close()
+    return d
+
+def make_target_dict(fname):
+    d = {}
+    f = open(fname)
+    for line in f:
+        line = line.strip()
+        lst = line.split('\t')
+        target = lst[0]
+        tf = lst[1]
+        cond = lst[2]
+        if not tf in d:
+            d[tf] = {target:cond}
+        else:
+            d[tf][target] = cond
+    f.close()
+    return d
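+# Both functions above parse target_tf.txt, which is tab-separated with three
+# fields per line: target id, TF id, and a space-separated string of binding
+# conditions, e.g. (hypothetical values) "AT2G40300\tAT1G01060\tC0001 C0002".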
+
+def not_bad_line(s):
+    ''' keep only lines that look like real edges (starting with an AGI id);
+        drop blanks and R warning/error text that may leak into edge files '''
+    if s.strip() == '':
+        return False
+    for w in ('WARNING', 'number', 'Need', 'Error', 'Too'):
+        if w in s:
+            return False
+    return s.startswith('AT')
+
+def make_edge_dict(fname):
+    d = {}
+    if not os.path.exists(fname):
+        return d
+    f = open(fname)
+    for line in f:
+        line = line.strip()
+        if not_bad_line(line):
+            lst = line.split('\t')
+            target_id = lst[0].split()[0]
+            tf_id = lst[1].split()[0]
+            date = '20161201' # default date for edges that lack one
+            num_rcond = max(1, len(lst[4].split())) # number of RNA-seq conditions; guard against division by zero
+            avg_loglik = -999 # very low likelihood by default
+            loglik = lst[6]
+            if loglik != '.':
+                if '=' in loglik:
+                    avg_loglik = float(loglik.split('=')[1])/num_rcond
+                else:
+                    avg_loglik = float(loglik)/num_rcond
+            if len(lst) == 8:
+                date = lst[7]
+            if not tf_id in d:
+                d[tf_id] = {target_id:{'date':date, 'loglikelihood':avg_loglik}}
+            else:
+                d[tf_id][target_id] = {'date':date, 'loglikelihood':avg_loglik}
+    f.close()
+    return d
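+# An edges.txt line, as written by the generated R scripts below, is
+# tab-separated: "target_id name", "tf_id name", correlation, method, the
+# supporting RNA-seq sample ids, binding conditions, log-likelihood (or '.'),
+# and date, e.g. (gene names and values hypothetical):
+# "AT2G40300 GENEA\tAT1G01060 GENEB\t0.85\tmix\tSRR1042754 SRR1042755\tC0001\t-123.45\t20170404"
+# Some older files write the log-likelihood as 'loglik=-123.45', which the
+# '=' branch above handles.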
+
+def make_r_script(fname, target, tf_dict, abs_jsonTPM_dir, num_component, edge_dict):
+    head = 'k.lst <- c(%d)\n' % (num_component)
+    head += 'target <- \'%s\'\n' % (target)
+    head += 'id2 <- target\n'
+    tfs = ''
+    conds = ''
+    recent_edge = ''
+    curr_time = datetime.now()
+    for k in tf_dict.keys(): # k is tf
+        tfs += '\'%s\',' % (k)
+        conds += '\'%s\',' % (tf_dict[k])
+        recent_or_good = False
+        if k in edge_dict and target in edge_dict[k]:
+            e = edge_dict[k][target]
+            age_days = (curr_time - datetime.strptime(e['date'], '%Y%m%d')).days # compute the age in days; subtracting YYYYMMDD integers would be wrong across month boundaries
+            recent_or_good = age_days <= EDGE_AGE or e['loglikelihood'] >= AVERAGE_LIKELIHOOD_TAU # recent, or already good
+        recent_edge += '1,' if recent_or_good else '0,'
+
+    head += 'tfs <- c(' + tfs.rstrip(',') + ')\n'
+    head += 'conditions <- c(' + conds.rstrip(',') + ')\n'
+    head += 'recent.edge <- c(' + recent_edge.rstrip(',') + ')\n'
+    head += 'jsonTPM.dir <- \'%s\'\n' % (abs_jsonTPM_dir)
+    head += 'AGINAME_FILE <- \'%s\'\n' % (os.path.abspath(GENE_ID_TO_GENE_NAME))
+    body = '''
+    post.translation <- function(x, y) {
+        mean.x <- mean(x)
+        sd.x <- sd(x)
+        index <- x > mean.x - sd.x & x < mean.x + sd.x
+        sd.y <- sd(y[index])
+        result <- list(value=ifelse(mean.x < 2.0, 0.0, (mean.x/max(x)) * sd.y * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
+    }
+
+    post.translation.2 <- function(x, y) {
+        # x is constitutively high while y varies a lot
+        mean.x <- mean(x)
+        sd.x <- max(sd(x), 1) # a number of at least 1
+        index <- x > mean.x - sd.x & x < mean.x + sd.x # points within the window +/- sd.x
+        sd.y <- quantile(y[index],0.85)-quantile(y[index],0.15) # dispersion of y within the window
+        sd.y.2 <- quantile(y,0.85)-quantile(y,0.15) # dispersion of all y
+        v.disp <- sd.y/max(1, sd.y.2) # how dispersed y is within the window, a number between 0 and 1
+        # value measures the dispersion of y and the percentage of points within the window
+        result <- list(value=ifelse(mean.x < 2.0, 0.0, v.disp * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
+    }
+
+    post.translation.3 <- function(x, y) {
+        # x is constitutively high while y varies a lot
+        mean.x <- mean(x)
+        upper.percentile <- 0.85 # used for computing vertical dispersion
+        lowest.n <- 3 # number of points with the lowest x values
+        min.mean.x <- max(2.0, quantile(x, 0.25)) # mean of x must be greater than this value
+        sd.x <- min(sd(x), 1) # a number between 0 and 1
+        index <- x > mean.x - sd.x & x < mean.x + sd.x # points within the window +/- sd.x
+        sd.y <- quantile(y[index],upper.percentile)-quantile(y[index],1-upper.percentile) # dispersion of y within the window
+        sd.y.2 <- quantile(y,upper.percentile)-quantile(y,1-upper.percentile) # dispersion of all y
+        v.disp <- sd.y/max(1, sd.y.2) # how dispersed y is within the window, a number between 0 and 1
+
+        rst <- sort(x, index.return=T)
+        top.n <- sum(rst$x < 1)
+        top.n <- max(1, min(top.n, lowest.n))
+        small.y <- min(mean(y[rst$ix[1:top.n]]), mean(y[x<1])) # use the smaller value
+        small.y <- ifelse(is.nan(small.y)==T, 999, small.y)
+        # value measures the dispersion of y and the percentage of points within the window
+        result <- list(valid=small.y, value=ifelse(mean.x < min.mean.x, 0.0, v.disp * sum(index)/length(index)), index=which(index==T), percent=sum(index)/length(index))
+    }
+
+    in.component <- function(posterior, k) {
+        # posterior is an N x k matrix; each row is a data point, and each column is the probability of belonging to a component
+        p = posterior[,k]
+        n = length(p)
+        index <- rep(F,n)
+        for (i in 1:n) {
+            if (p[i] > runif(1)) {
+                index[i] = T
+            }
+        }
+        result <- index
+    }
+
+    ####### Read data #########################################
+    CORR_THRESHOLD <- 0.7
+    agi <- read.table(AGINAME_FILE, sep='\\t', header=FALSE, row.names=1, stringsAsFactors=F) # AGINAME_FILE cannot contain quotes
+    #######################################################
+    library(mixtools)
+    library(rjson)
+    name2 <- agi[id2,1]
+    result <- ''
+    for (i in 1:length(tfs)) {
+        if (recent.edge[i] == 1) {
+            next
+        }
+        curr.date <- gsub('-','',Sys.Date())
+        id1 <- tfs[i]
+        name1 <- agi[id1,1]
+        cond <- conditions[i]
+
+        file.x <- paste(jsonTPM.dir, paste(id1, '.json', sep=''), sep='/')
+        if (!file.exists(file.x)) { next }
+        x <- as.data.frame(fromJSON(file = file.x))
+        x <- log(x+1)
+        rcond.x <- names(x)
+        x <- as.vector(t(x)) # convert it to a vector
+
+        file.y <- paste(jsonTPM.dir, paste(id2, '.json', sep=''), sep='/')
+        if (!file.exists(file.y)) { break } # the target's expression file is missing, so stop; no TF can be tested
+        y <- as.data.frame(fromJSON(file = file.y))
+        y <- log(y+1)
+        rcond.y <- names(y)
+        y <- as.vector(t(y)) # convert it to a vector
+
+        rna.sample.id <- rcond.x
+        if (all(rcond.x == rcond.y) == FALSE | id1 == id2) { # ignore if the sample ids in the two json files do not match, or if the target is the same as the tf
+            next
+        }
+
+        MIN_SIZE <- min(100, max(10, ceiling(0.5 * length(x))))
+
+        index <- x < 0.01 | y < 0.01 # exclude data points that are too small
+        x <- x[!index]
+        y <- y[!index]
+
+        if (length(x) < MIN_SIZE) {
+            next
+        }
+        r <- cor(x, y)
+        if (abs(r) >= CORR_THRESHOLD) {
+            #s = sprintf('%s %s\\t%s %s\\t%4.2f\\t%s\\t%s\\t%s\\t%s\\t%s\\n', id2, name2, id1, name1, r, 'all', '.', cond, '.', curr.date)
+            #result <- paste(result, s, sep='')
+            next # a good correlation is found using all experiments, so it is not necessary to look further
+        }
+
+        rna.sample.id <- rna.sample.id[!index] # this step is important to make the following indexing work
+
+        pos_r_max <- -2
+        pos_r_N <- 0
+        pos_r_index <- c()
+        pos_r_loglik <- -100000000
+
+        neg_r_max <- 2
+        neg_r_N <- 0
+        neg_r_index <- c()
+        neg_r_loglik <- -100000000
+
+        for (k in k.lst) {
+            em.out <- regmixEM(y, x, maxit=150, epsilon=1e-04, k=k)
+            for (j in seq(1,k,1)) {
+                index <- in.component(em.out$posterior, j)
+                size <- sum(index)
+                r <- cor(em.out$x[index,2], em.out$y[index])
+                if (!is.na(r) && r >= CORR_THRESHOLD && size >= MIN_SIZE && r > pos_r_max && size > pos_r_N) {
+                    pos_r_max <- r
+                    pos_r_N <- size
+                    pos_r_index <- index
+                    pos_r_loglik <- em.out$loglik
+                }
+                if (!is.na(r) && r <= -CORR_THRESHOLD && size >= MIN_SIZE && r < neg_r_max && size > neg_r_N) {
+                    neg_r_max <- r
+                    neg_r_N <- size
+                    neg_r_index <- index
+                    neg_r_loglik <- em.out$loglik
+                }
+            }
+        }
+        hit <- 0
+        if (pos_r_max > 0) { # has a good positive correlation
+            sub.cond <- paste(rna.sample.id[pos_r_index], collapse=' ')
+            s = sprintf('%s %s\\t%s %s\\t%4.2f\\t%s\\t%s\\t%s\\t%4.2f\\t%s\\n', id2, name2, id1, name1, pos_r_max, 'mix', sub.cond, cond, pos_r_loglik, curr.date)
+            result <- paste(result, s, sep='')
+            hit <- hit + 1
+        }
+        if (neg_r_max < 0) { # has a good negative correlation
+            sub.cond <- paste(rna.sample.id[neg_r_index], collapse=' ')
+            s = sprintf('%s %s\\t%s %s\\t%4.2f\\t%s\\t%s\\t%s\\t%4.2f\\t%s\\n', id2, name2, id1, name1, neg_r_max, 'mix', sub.cond, cond, neg_r_loglik, curr.date)
+            result <- paste(result, s, sep='')
+            hit <- hit + 1
+        }
+        if (hit == 0) {
+            t <- post.translation.3(x, y)
+            post.r <- t$percent
+            if (t$valid < quantile(y,0.25) & t$value > 0.69 & post.r >= 0.70 & length(t$index) > MIN_SIZE) {
+                sub.cond <- paste(rna.sample.id[t$index], collapse=' ')
+                s = sprintf('%s %s\\t%s %s\\t%4.2f\\t%s\\t%s\\t%s\\t%s\\t%s\\n', id2, name2, id1, name1, post.r, 'mix', sub.cond, cond, '.', curr.date)
+                result <- paste(result, s, sep='')
+            }
+        }
+    }
+    '''
+    tail = '\n'
+    tail += 'output.file <- paste(\'%s/edges.txt\', id2, format(Sys.time(), \'%%b.%%d.%%Y.%%X\'), \'k%d\', sep=\'.\')\n' % (EDGE_DIR, num_component)
+    tail += 'if (result != \'\') cat(result, file=output.file, sep=\'\')\n'
+    f = open(fname, 'w')
+    content = head + body + tail
+    f.write('\n'.join([line.rstrip() for line in content.split('\n')])) # R does not care about leading whitespace, so only trailing whitespace is stripped
+    f.close()
+    return fname
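+# For illustration, the head of a generated R script might read as follows
+# (gene ids, conditions, and paths hypothetical):
+#   k.lst <- c(2)
+#   target <- 'AT2G40300'
+#   id2 <- target
+#   tfs <- c('AT1G01060')
+#   conditions <- c('C0001 C0002')
+#   recent.edge <- c(0)
+#   jsonTPM.dir <- '/abs/path/to/jsonTPM_20170424_154323'
+#   AGINAME_FILE <- '/abs/path/to/gene_names.txt'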
+
+def number_of_running_process(lst):
+    ''' get the number of running processes (those with CPU usage greater than 0) '''
+    count = 0
+    for x in lst:
+        x = x.strip()
+        if x != '':
+            count += 1 if float(x.split()[2]) > 0.0 else 0 # column 3 of ps aux is %CPU; compare numerically, not as strings
+    return count
+
+def wait_a_moment(n, prefix):
+    ''' if more than n R scripts whose names start with prefix are running, wait '''
+    time.sleep(TIME_INTERVAL)
+    ps = subprocess.Popen('ps aux | grep %s' % (prefix), shell=True, stdout=subprocess.PIPE) # CHANGE
+    num_proc = number_of_running_process(ps.communicate()[0].split('\n'))
+    while (num_proc > n):
+        time.sleep(TIME_INTERVAL)
+        ps = subprocess.Popen('ps aux | grep %s' % (prefix), shell=True, stdout=subprocess.PIPE)
+        process_lst = ps.communicate()[0].split('\n')
+        num_proc = number_of_running_process(process_lst)
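+# For reference, a ps aux line has the columns USER PID %CPU %MEM ... COMMAND,
+# so split()[2] above is the %CPU field. A matching line might look like
+# (values and paths hypothetical):
+#   hui  12345  97.0  1.2 ... Rscript ../Data/temp/Work201703281026_AT2G40300_K2.R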
+
+def establish_edges(jsonTPM_dir, d, d2, glb_param_dict, rprefix, edge_dict):
+    '''
+    jsonTPM_dir -- contains gene expression json files, one for each gene
+    d  -- binding dictionary {target:{tf1:c1, tf2:c2}, ...}, where c1 and c2 are strings of conditions
+    d2 -- binding dictionary {tf:{target1:c1, target2:c2}, ...}, where c1 and c2 are strings of conditions
+    '''
+
+    gene_lst = get_ordered_gene_list(glb_param_dict['GENE_LIST']) # targets with fewer edges get higher priority; for example, targets with no edges yet are treated first
+    high_gene_lst = glb_param_dict['HIGH_PRIORITY_GENE'].split() # high-priority genes CHANGE
+
+    if not os.path.isdir(EDGE_DIR):
+        os.makedirs(EDGE_DIR)
+
+    # make a list of targets, putting high-priority targets at the beginning
+    final_gene_lst = high_gene_lst[:] # copy, so appending below does not mutate high_gene_lst
+    for x in gene_lst:
+        if not x in high_gene_lst:
+            final_gene_lst.append(x)
+
+    # process each target: first consider all TFs of the target (if any), then consider the target's own targets (if any)
+    for target in final_gene_lst: # high-priority genes are processed first
+        if target in d: # target is in binding dictionary d
+            tf_dict = d[target] # a dictionary of upstream genes, in the form of {tf1:c1, tf2:c2}
+            if target in d2:
+                target_dict = d2[target] # a dictionary of downstream genes, in the form of {target1:c1, target2:c2}
+            else:
+                target_dict = {}
+
+            if len(tf_dict) > 0: # the target has TFs (upstream genes)
+                for k in (2, 3): # build one script for K=2 and one for K=3
+                    r_file = '../Data/temp/%s_%s_K%d.R' % (rprefix, target, k)
+                    make_r_script(r_file, target, tf_dict, jsonTPM_dir, k, edge_dict)
+                    cmd = 'Rscript %s &' % (r_file) # run the Rscript in the background
+                    os.system(cmd)
+                wait_a_moment(MAX_PROCESS, rprefix) # if too many R processes are running at the same time, wait; MAX_PROCESS sets the limit
+
+            if len(target_dict) > 0: # the target has targets
+                count = 0
+                for successor in target_dict: # successor is the target's target
+                    tf_dict2 = {target:target_dict[successor]} # now target becomes the TF, and its successor becomes the target
+                    for k in (2, 3): # 'one' in the file name means one edge is considered each time
+                        r_file = '../Data/temp/%s_%s_one_K%d.R' % (rprefix, successor, k)
+                        make_r_script(r_file, successor, tf_dict2, jsonTPM_dir, k, edge_dict)
+                        cmd = 'Rscript %s &' % (r_file) # run the Rscript in the background
+                        os.system(cmd)
+                    count = count + 1
+                    if count % MAX_PROCESS == 0:
+                        wait_a_moment(MAX_PROCESS, rprefix) # if too many R processes are running at the same time, wait; MAX_PROCESS sets the limit
+
+
+## main
+param_file = sys.argv[1] # a single parameter file for building the network, parameter_for_net.txt, in Data/parameter/
+glb_param_dict = make_global_param_dict(param_file)
+GENE_ID_TO_GENE_NAME = glb_param_dict['GENE_ID_AND_GENE_NAME']
+agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME) # for gene names
+curr_time = datetime.now().strftime('%Y%m%d_%H%M%S')
+
+#print('Make target tf using binding.txt')
+#if os.path.exists('../Data/information/target_tf.txt'):
+#    cmd = 'cp ../Data/information/target_tf.txt ../Data/information/target_tf.txt.%s' % (curr_time)
+#    os.system(cmd)
+
+target_tf_fname = '../Data/information/target_tf.txt.%s' % (curr_time)
+cmd = 'python make_target_tf.py %s > %s' % (param_file, target_tf_fname) # make target_tf.txt; CHANGE: better to make a temporary copy for this program
+os.system(cmd)
+
+#print('Make jsonTPM ...') # CHANGE
+cmd = 'python TPM2JSON.py %s' % (param_file) # make the jsonTPM directory; the TPM values are not log-transformed
+os.system(cmd)
+JSON_DIR = '../Data/history/expr/jsonTPM_%s' % (curr_time) # for each TPM.txt, there should be a unique jsonTPM directory
+cmd = 'mv ../Data/history/expr/jsonTPM %s' % (JSON_DIR)
+os.system(cmd)
+
+#JSON_DIR = '../Data/history/expr/jsonTPM_20170424_154323'
+#target_tf_fname = '../Data/information/target_tf.txt.20170424_154323'
+#print('Establish edges')
+big_tf_dict = make_tf_dict(target_tf_fname) # key is target
+big_target_dict = make_target_dict(target_tf_fname) # key is tf
+rscript_prefix = 'Work' + datetime.now().strftime('%Y%m%d%H%M') # each R script's name starts with Work followed by a timestamp
+edge_dict = make_edge_dict(EDGE_FILE)
+establish_edges(os.path.abspath(JSON_DIR), big_tf_dict, big_target_dict, glb_param_dict, rscript_prefix, edge_dict)
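+# Illustrative parameter_for_net.txt entries consumed above (the key names are
+# the ones looked up in glb_param_dict; paths and values are hypothetical, and
+# the real format is whatever param4net.make_global_param_dict expects):
+#   GENE_LIST              ../Data/information/gene_list.txt
+#   HIGH_PRIORITY_GENE     AT2G40300 AT3G12580
+#   GENE_ID_AND_GENE_NAME  ../Data/information/gene_id_to_name.txt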