From 97fdefab064f63642fa3ece05b807d29b459df31 Mon Sep 17 00:00:00 2001
From: Hui Lan
Date: Wed, 4 Dec 2019 19:03:19 +0800
Subject: brain: add python and R code to local repository.

---
 Code/create_edges3.py | 615 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 615 insertions(+)
 create mode 100644 Code/create_edges3.py

(limited to 'Code/create_edges3.py')

diff --git a/Code/create_edges3.py b/Code/create_edges3.py
new file mode 100644
index 0000000..3cd5ba4
--- /dev/null
+++ b/Code/create_edges3.py
@@ -0,0 +1,615 @@
+# Usage: python create_edges3.py parameter_for_net.txt
+#
+# Make it faster by spawning subprocesses.
+#
+# Make it smaller by splitting TPM.txt into small JSON files, and converting binding.txt to target_tf.txt first.
+# This way we don't need to load the big matrices, TPM.txt and binding.txt.
+#
+# 19 JAN 2017, hui, slcu
+
+import sys, os, operator, itertools
+import numpy as np
+import scipy.stats as stat
+from datetime import datetime
+
+import rpy2.robjects as r
+from rpy2.robjects.packages import importr
+from rpy2.robjects import FloatVector
+
+import warnings
+
+import multiprocessing
+import time
+
+import json
+from geneid2name import make_gene_name_AGI_map_dict
+from param4net import make_global_param_dict, get_gene_name
+
+EDGE_DIR = '../Data/history/edges/one_target' # a directory storing all edge files, one for each target gene
+GENE_ID_TO_GENE_NAME = '../Data/information/AGI-to-gene-names_v2.txt'
+MAX_NUM_PROCESS = 20
+
+
+
+####################################
+DATA_SYMBOL = '@'
+TOP_N_TF = 50
+MIN_NUM_CONDITION = 20
+SIGNAL_INPUT_RATIO_TAU = 1.5
+REMOVE_HORIZONTAL_STRIP_TAU = 0.01
+REMOVE_VERTICAL_STRIP_TAU = 0.01
+MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN = 30
+####################################
+
+def get_three_components_mixtools(y, x, cond_lst):
+
+    # Remove NaNs or Infs
+    warn_msg = ''
+    sz = len(x)
+    if sz < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN: # too few points, ignore.
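+        # Too few finite points to fit a three-component mixture; return nine None values plus the
+        # message 'IGNORE' so that the caller skips this target-TF pair.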
+ return None, None, None, None, None, None, None, None, None, 'IGNORE' + + index = np.isfinite(x) & np.isfinite(y) + if sum(index) < sz: + warn_msg = 'HAS_NAN_OR_INIFNITE' + if sum(index) < MIN_NUMBER_OF_POINTS_FOR_MIXTURE_OF_GAUSSIAN: + return None, None, None, None, None, None, None, None, None, 'IGNORE' + + xx = np.array(x[index]) + yy = np.array(y[index]) + cond_lst2 = np.array(cond_lst) + cond_lst2 = cond_lst2[index] + + # Train the model + mixtools = importr('mixtools') + try: + result = mixtools.regmixEM(FloatVector(yy), FloatVector(xx), epsilon = 1e-04, k=3, maxit=100) + except: + return None, None, None, None, None, None, None, None, None, 'IGNORE' + posterior = result[result.names.index('posterior')] + posterior = np.array(posterior) + l = np.argmax(posterior, axis=1) # class information + idx1 = l == 0 + idx2 = l == 1 + idx3 = l == 2 + warn_msg = 'loglik=%4.2f' % (np.array(result[result.names.index('loglik')])[0]) + + return xx[idx1], yy[idx1], xx[idx2], yy[idx2], xx[idx3], yy[idx3], list(cond_lst2[idx1]), list(cond_lst2[idx2]), list(cond_lst2[idx3]), warn_msg + + +def read_experiment_id(fname): + ''' Read column names (RNA-seq ids) and row names (gene ids) from TPM.txt ''' + + colid = [] + rowid = [] + + f = open(fname) + lines = f.readlines() + f.close() + + head_line = lines[0].strip() + lst = head_line.split() + colid = lst[1:] + rowid = [] + + for line in lines[1:]: + line = line.strip() + lst = line.split() + g = lst[0] + rowid.append(g) + + return (colid, rowid) + + +def read_matrix_data(fname): + ''' + fname - a file, first line is head, first column is row name. + ''' + + lineno = 0 + colid = [] + rowid = [] + d = {} # {gene1:{cond1:val1, cond2:val2, ...}, gene2: {...}, ...} + d2 = {} # {cond1:{gene1:val1, gene2:val2, ...}, cond2: {...}, ...} + d3 = {} # {gene1: [], gene2: [], ...} + d4 = {} # {cond1:[], cond2:[], ...} + + f = open(fname) + lines = f.readlines() + f.close() + + head_line = lines[0].strip() + lst = head_line.split() + colid = lst[1:] + + for c in colid: + d2[c] = {} + d4[c] = [] + + for line in lines[1:]: + line = line.strip() + lst = line.split() + g = lst[0] + rowid.append(g) + d[g] = {} + levels = lst[1:] + if len(levels) != len(colid): + print('Incomplete columns at row %s' % (g)) + sys.exit() + + d3[g] = [] + for i in range(len(colid)): + c = colid[i] + d[g][c] = float(levels[i]) + d2[c][g] = float(levels[i]) + d3[g].append(float(levels[i])) + d4[c].append(float(levels[i])) + lineno += 1 + + d_return = {} + d_return['xy'] = d # first gene, then condition + d_return['yx'] = d2 # first condition, then gene + d_return['xx'] = d3 # each item is an array of gene expression levels, i.e., each item is a row + d_return['yy'] = d4 # each item is an array of gene expression levels, i.e., each item is a column + d_return['nrow'] = lineno - 1 + d_return['ncol'] = len(colid) + d_return['rowid'] = rowid + d_return['colid'] = colid + + d4_sorted = {} + for k in d4: + d4_sorted[k] = sorted(d4[k], reverse=True) + d_return['yy_sorted'] = d4_sorted + + return d_return + + +def get_value(s, delimit): + lst = s.split(delimit) + return lst[1].strip() + + +def read_info_data(fname): + ''' Read chip-seq data information ''' + + if not os.path.exists(fname): + print('%s not exists.' 
% (fname) ) + sys.exit() + + d = {'ID_LIST':[]} + f = open(fname) + lines = f.readlines() + f.close() + for line in lines: + line = line.strip() + if line == '' or line.startswith('#') or line.startswith('%'): + continue + if line.startswith(DATA_SYMBOL): + s = line[line.rfind(DATA_SYMBOL[-1])+1:] + s = s.strip() + if s in d: + print('ID %s duplicate' % (s)) + sys.exit() + d[s] = {'PROTEIN_ID':'', 'PROTEN_NAME':'', 'DATA_NAME':'', 'DATA_FORMAT':'', 'DESCRIPTION':'', 'LOCATION':'', 'NOTE':''} + d['ID_LIST'].append(s) + if line.startswith('DESCRIPTION:'): + d[s]['DESCRIPTION'] = get_value(line, ':') + elif line.startswith('PROTEN_NAME:'): + d[s]['PROTEN_NAME'] = get_value(line, ':') + elif line.startswith('PROTEIN_ID:'): + d[s]['PROTEIN_ID'] = get_value(line, ':') + elif line.startswith('DATA_NAME:'): + d[s]['DATA_NAME'] = get_value(line, ':') + elif line.startswith('DATA_FORMAT:'): + d[s]['DATA_FORMAT'] = get_value(line, ':') + elif line.startswith('LOCATION:'): + d[s]['LOCATION'] = get_value(line, ':') + elif line.startswith('NOTE:'): + d[s]['NOTE'] = get_value(line, ':') + + return d + + +def get_gene_list(fname): + result = [] + f = open(fname) + for line in f: + line = line.strip() + lst = line.split() + result.append(lst[0]) + f.close() + return result + + +def get_related_condition(s, info_dict): + lst = s.split(';') + result = [] # a list of sample IDs + result = info_dict['ID_LIST'] # TBD + return result + +def update_global_param_dict(glb_param_dict, info_dict): + if glb_param_dict['RESEARCH_KEYWORDS'] == '': + glb_param_dict['USER_CONDITION_LIST'] = info_dict['ID_LIST'] + glb_param_dict['USER_CONDITION_LIST'] = get_related_condition(glb_param_dict['RESEARCH_KEYWORDS'], info_dict) + + +def float_equal(x, y): + return np.abs(x-y) < 0.001 + + +def get_gene_expression2(gene_id1, gene_id2, cond_lst, expr_dict, takelog=False): + ''' get gene expression for two genes. Conditions in which two genes have zero TPM values are ignored. ''' + num_cond = len(cond_lst) + elst1 = [None]*num_cond + elst2 = [None]*num_cond + clst = [None]*num_cond + d = expr_dict['xy'] + j = 0 + for i in range(num_cond): + c = cond_lst[i] + x = expr_dict['xy'][gene_id1][c] + y = expr_dict['xy'][gene_id2][c] + #print('DEBUG %s %s %g %g c=%s' % (gene_id1, gene_id2, x, y, c)) + #print('DEBUG at2g07745 at R0000SRR1802166XX %g' % (expr_dict['xy']['AT2G07754']['R0000SRR1802166XX'])) + if not float_equal(x,0.0) or not float_equal(y,0.0): # at least one is not zero + if takelog == True: # increase gene expression uniformly by 1 for taking logarithm + elst1[j] = np.log(x+1) + elst2[j] = np.log(y+1) + else: + elst1[j] = x + elst2[j] = y + clst[j] = c + j += 1 + return ( np.array(elst1[0:j]), np.array(elst2[0:j]), clst[0:j] ) + + +def get_gene_expression3(gene_id1, gene_id2, cond_lst, expr_dict, takelog=False): + ''' + get gene expression for two genes. Conditions in which two genes have zero TPM values are ignored. + In addition, vertical strip (as seen in their scatter plot) and horizontal strip are removed. 
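+    Here a "strip" is any condition in which either gene's (optionally log-transformed) value falls below
+    REMOVE_VERTICAL_STRIP_TAU or REMOVE_HORIZONTAL_STRIP_TAU (both 0.01 by default); such conditions are dropped.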
+ ''' + num_cond = len(cond_lst) + elst1 = [None]*num_cond + elst2 = [None]*num_cond + mark_cond = [True]*num_cond # indicate if a condition should be included + clst = [] + d = expr_dict['xy'] + + for i in range(num_cond): + c = cond_lst[i] + x = expr_dict['xy'][gene_id1][c] + y = expr_dict['xy'][gene_id2][c] + if not float_equal(x,0.0) or not float_equal(y,0.0): # at least one is not zero + if takelog == True: # increase gene expression uniformly by 1 for taking logarithm + elst1[i] = np.log(x+1) + elst2[i] = np.log(y+1) + else: + elst1[i] = x + elst2[i] = y + if elst1[i] < REMOVE_VERTICAL_STRIP_TAU or elst2[i] < REMOVE_HORIZONTAL_STRIP_TAU: # don't include this condition if its values are in the strip + mark_cond[i] = False + else: + clst.append(c) + else: + mark_cond[i] = False + + a = np.array(elst1) + a = a[mark_cond] + b = np.array(elst2) + b = b[mark_cond] + return (a.astype(np.float64), b.astype(np.float64), clst) + + +#################### select condition stuff ############### + +def common_elements(list1, list2): + return sorted(list(set(list1).intersection(list2))) + +def correlation_is_significant(r, p, glb_param_dict): + return (not np.isnan(r)) and np.abs(r) > float(glb_param_dict['TWO_WAY_CORRELATION_CUTOFF']) and p < float(glb_param_dict['TWO_WAY_CORRELATION_PVALUE_CUTOFF']) + +def subset_list(lst, bool_index): + if len(lst) != len(bool_index): + print('subset_list: list size not equal (%d %s)', len(lst), len(bool_index)) + sys.exit() + + n = len(lst) + result = [] + for i in range(n): + if bool_index[i] == True: + result.append(lst[i]) + return result + + +def write_dict(d_lst, glb_param_dict, fname): + ''' write a list of dictionary content to fname. Each dictionary contains a target-TF edge. ''' + agi2name_dict = glb_param_dict['name_conversion_dict'] + curr_date = datetime.now().strftime('%Y%m%d_%H%M%S') # add date to the end of each line, for future reference or filtering + fname = os.path.join(EDGE_DIR, fname) + f = open(fname, 'w') + for d in d_lst: + if d['type'] == 'two-way': + gene_id = d['target'] + head = '%s %s\t' % (gene_id, get_gene_name(gene_id, agi2name_dict)) + gene_id = d['TF'] + head += '%s %s\t' % (gene_id, get_gene_name(gene_id, agi2name_dict)) + d2 = d['significant'] + if 'all' in d2: + s = '%4.2f\t%s\t%s\t%s\t%s' % (d2['all']['score'], 'all', '.', ' '.join(d2['all']['chip_id']), '.') + s += '\t' + curr_date + f.write(head + s + '\n') + if 'mix' in d2: + n = len(d2['mix']['score']) + for i in range(n): + s = '%4.2f\t%s\t%s\t%s\t%s' % (d2['mix']['score'][i], 'mix', ' '.join(d2['mix']['signal_set'][i]), ' '.join(d2['mix']['chip_id']), d2['mix']['message'][i]) + s += '\t' + curr_date + f.write(head + s + '\n') + f.close() + + +def two_way(target, tf_dict, expr_dict, expr_info_dict, glb_param_dict): + ''' + + Check if target has relationship with each of TFs. + + tf_dict: a dictionary of TFs, {tf_name:ChIP_ID_LIST) + + Return a list of dictionaries. Each dictionary has the following format: + + 'type' :'two-way' + 'target':'' + 'TF' :'' + 'significant': { + 'all': {'signal_set':[], score=2, chip_id:''} + 'pos_direction':{'signal_set':[], score:.0, chip_id:''} + 'neg_direction':{'signal_set':[], score:.1, chip_id:''} + 'user_defined': {'signal_set':[], score:.3, chip_id:''} + 'mix': {'signal_set':[], score:[.7,-.5], chip_id:''} + } + + ''' + + result_dict_lst = [] # a list of dictionaries, one for each TF, each dict contains info for a Target-TF pair + + target_gene_id = target + all_cond_lst = expr_dict['colid'] # Use all RNA-seq samples. 
TBD, can be glb_param_dict['USER_CONDITION_LIST'] + logrithmize = glb_param_dict['LOGRITHMIZE'].upper() == 'YES' # take logarithmic of TPM values + + if not target_gene_id in expr_dict['rowid']: # target gene not in expression table, cannot do anything + return None + + for tf_gene_id in sorted(tf_dict.keys()): # y is a TF gene id + + chip_id = tf_dict[tf_gene_id] # a list of chip experiment IDs, e.g., C00000000000 + + if not tf_gene_id in expr_dict['rowid']: # tf gene not in expression table, cannot do anything + continue + + # get gene expression profiles for target and TF. If in a RNA-seq sample, both target and TF is 0, then this sample is ignored. + target_elst, tf_elst, clist = get_gene_expression3(target_gene_id, tf_gene_id, all_cond_lst, expr_dict, takelog=logrithmize) + + r, p = stat.pearsonr(target_elst, tf_elst) + + d = {} + d['target'] = target_gene_id + d['TF'] = tf_gene_id + d['type'] = 'two-way' + d['significant'] = {} + + all_good = False + if correlation_is_significant(r, p, glb_param_dict): + d['significant']['all'] = {} + d['significant']['all']['signal_set'] = clist # a list of sample IDs, returned by get_gene_expression3 + d['significant']['all']['score'] = r + d['significant']['all']['chip_id'] = chip_id + all_good = True + + user_cond_lst = glb_param_dict['USER_CONDITION_LIST'] + if glb_param_dict['RESEARCH_KEYWORDS'] != '' and user_cond_lst != []: + target_elst_user, tf_elst_user, clist_user = get_gene_expression3(target_gene_id, tf_gene_id, user_cond_lst, expr_dict, takelog=logrithmize) + + r, p = stat.pearsonr(target_elst_user, tf_elst_user) + if correlation_is_significant(r, p, glb_param_dict): + d['significant']['user'] = {} + d['significant']['user']['signal_set'] = user_cond_lst + d['significant']['user']['score'] = r + d['significant']['user']['chip_id'] = chip_id + + # obsolete + max_diff = glb_param_dict['SELECT_POINTS_DIAGONAL_MAX_DIFF'] + if glb_param_dict['LOOK_FOR_POS_CORRELATION'] == 'YES': + aa, bb, index_pos = select_points_diagonal(target_elst, tf_elst, max_diff, 'pos') + r_pos, p_pos = stat.pearsonr(aa, bb) + if correlation_is_significant(r_pos, p_pos, glb_param_dict) and sum(index_pos) >= MIN_NUM_CONDITION: + d['significant']['pos'] = {} + d['significant']['pos']['signal_set'] = subset_list(all_cond_lst, index_pos) + d['significant']['pos']['score'] = r_pos + d['significant']['pos']['chip_id'] = chip_id + + # obsolete + if glb_param_dict['LOOK_FOR_NEG_CORRELATION'] == 'YES': + aa, bb, index_neg = select_points_diagonal(target_elst, tf_elst, max_diff, 'neg') + r_neg, p_neg = stat.pearsonr(aa, bb) + if correlation_is_significant(r_neg, p_neg, glb_param_dict) and sum(index_neg) >= MIN_NUM_CONDITION: + d['significant']['neg'] = {} + d['significant']['neg']['signal_set'] = subset_list(all_cond_lst, index_neg) + d['significant']['neg']['score'] = r_neg + d['significant']['neg']['chip_id'] = chip_id + + K = int(glb_param_dict['NUMBER_OF_COMPONENTS']) + if glb_param_dict['MIXTURE_OF_REGRESSION'] == 'YES' and not all_good: # look hard only when using all RNA-seq data does not produce good results + if K == 2: # for now consider two components + #print('DEBUG len1=%d, len=%d' % (len(target_elst), len(tf_elst))) + #print('DEBUG %s, %s, %s' % (target_gene_id, tf_gene_id, ' '.join(clist))) + index1, index2, msg = get_two_components(target_elst, tf_elst) # get two Gaussian Mixture Model components + if msg != 'IGNORE': + aa = target_elst[index1] + bb = tf_elst[index1] + r_mix1, p_mix1 = stat.pearsonr(aa, bb) + aa = target_elst[index2] + bb = tf_elst[index2] + 
r_mix2, p_mix2 = stat.pearsonr(aa, bb) + #print('DEBUG %s %s r_mix1:%g r_mix2:%g' % (target_gene_id, tf_gene_id, r_mix1, r_mix2)) + flag1 = correlation_is_significant(r_mix1, p_mix1, glb_param_dict) + flag2 = correlation_is_significant(r_mix2, p_mix2, glb_param_dict) + if flag1 or flag2: + d['significant']['mix'] = {} + d['significant']['mix']['signal_set'] = [] + d['significant']['mix']['score'] = [] + d['significant']['mix']['chip_id'] = chip_id + if flag1: + d['significant']['mix']['signal_set'].append(subset_list(clist, index1)) + d['significant']['mix']['score'].append(r_mix1) + if flag2: + d['significant']['mix']['signal_set'].append(subset_list(clist, index2)) + d['significant']['mix']['score'].append(r_mix2) + + if K == 3: # three components + aa1, bb1, aa2, bb2, aa3, bb3, cond1, cond2, cond3, msg = get_three_components_mixtools(target_elst, tf_elst, clist) # get two Gaussian Mixture Model components + if msg != 'IGNORE': + r_mix1, p_mix1 = stat.pearsonr(aa1, bb1) + r_mix2, p_mix2 = stat.pearsonr(aa2, bb2) + r_mix3, p_mix3 = stat.pearsonr(aa3, bb3) + #print('DEBUG %s, %s' % (target_gene_id, tf_gene_id)) + #print('DEBUG rmix1=%g, pmix1=%g' % (r_mix1, p_mix1)) + #print('DEBUG rmix2=%g, pmix2=%g' % (r_mix2, p_mix2)) + #print('DEBUG rmix3=%g, pmix3=%g' % (r_mix3, p_mix3)) + #print('DEBUG %d %d %d' %(len(aa1), len(aa2), len(aa3))) + min_num_points = int(glb_param_dict['CORRELATION_BASED_ON_AT_LEAST_N_POINTS']) + flag1 = correlation_is_significant(r_mix1, p_mix1, glb_param_dict) and len(aa1) > min_num_points + flag2 = correlation_is_significant(r_mix2, p_mix2, glb_param_dict) and len(aa2) > min_num_points + flag3 = correlation_is_significant(r_mix3, p_mix3, glb_param_dict) and len(aa3) > min_num_points + if flag1 or flag2 or flag3: + d['significant']['mix'] = {} + d['significant']['mix']['signal_set'] = [] + d['significant']['mix']['score'] = [] + d['significant']['mix']['chip_id'] = chip_id + d['significant']['mix']['message'] = [] + if flag1: + d['significant']['mix']['signal_set'].append(cond1) + d['significant']['mix']['score'].append(r_mix1) + d['significant']['mix']['message'].append(msg) + if flag2: + d['significant']['mix']['signal_set'].append(cond2) + d['significant']['mix']['score'].append(r_mix2) + d['significant']['mix']['message'].append(msg) + if flag3: + d['significant']['mix']['signal_set'].append(cond3) + d['significant']['mix']['score'].append(r_mix3) + d['significant']['mix']['message'].append(msg) + + if len(d['significant']) > 0: # significant edges exist + result_dict_lst.append(d) + + + curr_time = datetime.now().strftime('%Y%m%d') + fname = 'edges.txt' + '.' + curr_time + '.' 
+ target_gene_id + if result_dict_lst != []: + write_dict(result_dict_lst, glb_param_dict, fname) + + +def three_way(target, tf_lst, expr_dict, expr_info_dict, glb_param_dict): + ''' TBD ''' + return [] + + +def make_small_expr_dict(lst, colid, rowid, dirname): + d_return = {} + d = {} + for g in lst: + fname = os.path.join(dirname, g + '.json') + if os.path.exists(fname): + with open(fname) as f: + d[g] = json.load(f) + + d_return['colid'] = colid + d_return['rowid'] = rowid + d_return['xy'] = d + + return d_return + +def make_tf_dict(fname): + d = {} + f = open(fname) + for line in f: + line = line.strip() + lst = line.split('\t') + target = lst[0] + tf = lst[1] + cond = lst[2].split() + if not target in d: + d[target] = {tf:cond} + else: + d[target][tf] = cond + f.close() + return d + + +def establish_edges(jsonTPM_dir, expr_info_dict, rna_experiment_ids, rna_gene_ids, d, bind_info_dict, glb_param_dict): + ''' d - binding dictionary {target:{tf:[c1,c2], tf:[c2,c3]}, ... } ''' + + gene_lst = get_gene_list(glb_param_dict['GENE_LIST']) + high_gene_lst = glb_param_dict['HIGH_PRIORITY_GENE'].split() # high priority genes + update_global_param_dict(glb_param_dict, expr_info_dict) + + if not os.path.isdir(EDGE_DIR): + os.makedirs(EDGE_DIR) + + process_lst = [] # a list of processes + final_gene_lst = high_gene_lst + for x in gene_lst: + if not x in high_gene_lst: + final_gene_lst.append(x) + + for g in final_gene_lst: # high priority genes are in the front + if g in d: # target g is in binding dictionary d + tf_dict = d[g] + if len(tf_dict) > 0: # it has TFs, usually it is the case + if glb_param_dict['TWO_WAY'] == 'YES': + small_gene_lst = tf_dict.keys() + small_gene_lst.append(g) + expr_dict = make_small_expr_dict(small_gene_lst, rna_experiment_ids, rna_gene_ids, jsonTPM_dir) + p = multiprocessing.Process(target=two_way, args=(g, tf_dict, expr_dict, expr_info_dict, glb_param_dict)) + process_lst.append(p) + p.start() + if len(process_lst) > MAX_NUM_PROCESS: + temp_lst = [] + for p in process_lst: + p.join(timeout=60) # wait for subprocess to finish, wait for 60 secs, if it has not finished yet, check next subprocess + if p.is_alive(): + temp_lst.append(p) + process_lst = temp_lst + + +########## main ################################################## +r.r['options'](warn=-1) # supress warning message from rpy2 +warnings.filterwarnings("ignore") + +param_file = sys.argv[1] # a single prameter file +glb_param_dict = make_global_param_dict(param_file) +agi2name_dict = make_gene_name_AGI_map_dict(GENE_ID_TO_GENE_NAME) +glb_param_dict['name_conversion_dict'] = agi2name_dict + +#print('Read expression data') +cmd = 'python TPM2JSON.py %s' % (param_file) # make jsonTPM directory +os.system(cmd) +curr_time = datetime.now().strftime('%Y%m%d_%H%M') +JSON_DIR = '../Data/history/expr/jsonTPM_%s' % (curr_time) +cmd = 'mv ../Data/history/expr/jsonTPM %s' % (JSON_DIR) +os.system(cmd) + +expr_info_dict = read_info_data(glb_param_dict['EXPRESSION_INFO']) +colid, rowid = read_experiment_id(glb_param_dict['EXPRESSION_MATRIX']) + +#print('Read binding data') +cmd = 'python make_target_tf.py %s' % (param_file) # make target_tf.txt +os.system(cmd) # change +curr_time = datetime.now().strftime('%Y%m%d_%H%M') +target_tf_fname = 'target_tf.txt.' 
+ curr_time +cmd = 'cp target_tf.txt %s' % (target_tf_fname) +os.system(cmd) +big_tf_dict = make_tf_dict(target_tf_fname) +bind_info_dict = read_info_data(glb_param_dict['BINDING_INFO']) + +#print('Establish edges') +establish_edges(JSON_DIR, expr_info_dict, colid, rowid, big_tf_dict, bind_info_dict, glb_param_dict) -- cgit v1.2.1
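Note on the intermediate files this script consumes (a sketch, not part of the patch above): make_target_tf.py is expected to produce target_tf.txt with one tab-separated line per target-TF pair (target gene ID, TF gene ID, then a space-separated list of ChIP experiment IDs), and TPM2JSON.py is expected to produce a jsonTPM directory holding one <gene>.json file per gene that maps RNA-seq sample IDs to TPM values; these are the formats assumed by make_tf_dict() and make_small_expr_dict() above. The toy builder below only illustrates that layout; all gene, sample, and ChIP IDs are invented, and the authoritative formats live in make_target_tf.py and TPM2JSON.py, which are not included in this commit.

import json, os

# target_tf.txt: one line per target-TF pair, three tab-separated fields;
# the third field is a space-separated list of ChIP experiment IDs (toy IDs here).
with open('target_tf.txt', 'w') as f:
    f.write('AT1G01010\tAT5G65310\tC0001000011 C0001000012\n')

# jsonTPM directory: one JSON file per gene, {RNA-seq sample ID: TPM value}.
# With real data, every sample ID from the TPM.txt header should appear in each gene's JSON.
if not os.path.isdir('jsonTPM'):
    os.makedirs('jsonTPM')
for gene in ['AT1G01010', 'AT5G65310']:
    with open(os.path.join('jsonTPM', gene + '.json'), 'w') as f:
        json.dump({'R0000SRR1802166XX': 3.1, 'R0001SRR1802167XX': 0.0}, f)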